Runtime freeMemory not decreasing as I use more memory - java

There's a memory leak in my program, probably at the point where I call certain native functions using JNA. Since I wasn't sure which function was responsible, I decided to test one that I suspected. I created this test:
// NOTE(review): this snippet does not compile standalone -- `win` below is an
// external reference (presumably the application window whose lifetime bounds
// the test loop); confirm against the original program before reusing it.
public class MemoryLeakCheck {
// Shared Runtime instance used for all heap queries below.
public static final Runtime runtime = Runtime.getRuntime();
// Bytes per megabyte; declared double so the division in getMBUsed() stays floating-point.
public static final double mb = 1024*1024;
public static void main(String[] args) throws Exception {
//Remember the starting memory for the final comparison
double usedMemoryStart = getMBUsed();
System.out.println("Starting with "+String.format("%4.1f", usedMemoryStart));
//This will be updated to keep track of changes during test
double lastMemory = usedMemoryStart;
//Memory change threshold - once the change is greater than this, info will appear in console
final double threshold = 10;
// Loop for as long as the (externally created) window is alive.
while(win.isValid()) {
//Run the tested operation
//something here
//Do not kill the CPU
Thread.sleep(200);
//Calculate memory changes
double mbnew = getMBUsed();
double diff = mbnew-lastMemory;
// Report only when the change exceeds the threshold in either direction.
if(diff>=threshold || diff<=-threshold) {
// Prints the sign, then the absolute value of diff. `01.0D` is just 1.0
// (a leading zero is not an octal prefix for floating-point literals).
System.out.println((diff>0?"+":"-")+" "+String.format("%3.3f", diff*(diff>0?01.0D:-1.0D)));
System.out.println(" = "+String.format("%4.1f", mbnew));
//Update lastMemory to keep track of the next change
lastMemory = mbnew;
}
}
//Final change sum
double mbnew = getMBUsed();
double diff = mbnew-usedMemoryStart;
System.out.println("Overall diff: "+String.format("%4.1f", diff));
}
/** Will return used memory in MBytes as double. Calculates from difference
* between total and free memory. Note: totalMemory()/freeMemory() report the
* JVM heap only, so native memory allocated via JNA is invisible here -- which
* is why Task Manager can show far more than this value.
*
* @return used memory in MBytes.
*/
public static final double getMBUsed() {
return (runtime.totalMemory() - runtime.freeMemory())/mb;
}
}
I started the loop and went to make coffee. When I came back, the corresponding java.exe instance was using 500MB ram according to the Task Manager. But the output of the program looked like this:
Starting with 31,1
- 22,945
= 8,1
+ 10,754
= 18,9
+ 10,254
= 29,2
- 21,284
= 7,9
... repeats a lot ...
+ 10,587
= 52,2
- 10,579
= 41,6
+ 10,587
= 52,2
- 10,579
= 41,6
Overall diff: 15,9
------------------------------------------------------------------------
BUILD SUCCESS
------------------------------------------------------------------------
Total time: 2:46.265s
Finished at: Mon Apr 13 12:20:56 CEST 2015
Final Memory: 6M/106M
------------------------------------------------------------------------
As you can see, neither my results nor the maven output contain the correct value. I'm surprised. Is this memory leak even caused by my program?

Related

OpenCL kernel slower than normal Java loop

I've been looking into OpenCL for use with optimizing code and running tasks in parallel to achieve greater speed over pure Java. Now I'm having a bit of an issue.
I've put together a Java program using LWJGL, which as far as I can tell, should be able to do nearly identical tasks -- in this case adding elements from two arrays together and storing the result in another array -- two separate ways: one with pure Java, and the other with an OpenCL kernel. I'm using System.currentTimeMillis() to keep track of how long each one takes for arrays with a large number of elements (~10,000,000). For whatever reason, the pure Java loop seems to be executing around 3 to 10 times faster than the CL program, depending on array size. My code is as follows (imports omitted):
public class TestCL {
    // Size of arrays to test; this value is changed sometimes between tests.
    private static final int SIZE = 9999999;
    private static CLContext context;           // CL context
    private static CLPlatform platform;         // CL platform
    private static List<CLDevice> devices;      // List of CL devices
    private static CLCommandQueue queue;        // Command queue for context
    private static float[] aData, bData, rData; // float arrays to store test data

    // OpenCL kernel source: result[i] = a[i] + b[i] for every global id < size.
    private static String kernel =
        "kernel void sum(global const float* a, global const float* b, global float* result, int const size){\n" +
        "const int itemId = get_global_id(0);\n" +
        "if(itemId < size){\n" +
        "result[itemId] = a[itemId] + b[itemId];\n" +
        "}\n" +
        "}";

    public static void main(String[] args){
        aData = new float[SIZE];
        bData = new float[SIZE];
        rData = new float[SIZE]; // Only used for CPU testing
        // Arbitrary testing data.
        for(int i=0; i<SIZE; i++){
            aData[i] = i;
            bData[i] = SIZE - i;
        }
        try {
            testCPU(); // How long does it take running in traditional Java code on the CPU?
            testGPU(); // How long does the GPU take to run it w/ CL?
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Test the CPU with pure Java code.
     */
    private static void testCPU(){
        long time = System.currentTimeMillis();
        for(int i=0; i<SIZE; i++){
            rData[i] = aData[i] + bData[i];
        }
        // Elapsed time from the start of testCPU() until now.
        System.out.println("CPU processing time for " + SIZE + " elements: " + (System.currentTimeMillis() - time));
    }

    /**
     * Test the GPU with OpenCL.
     * @throws LWJGLException if CL initialisation fails
     */
    private static void testGPU() throws LWJGLException {
        CLInit(); // Initialize CL and CL objects
        // Create the CL program
        CLProgram program = CL10.clCreateProgramWithSource(context, kernel, null);
        int error = CL10.clBuildProgram(program, devices.get(0), "", null);
        Util.checkCLError(error);
        // Create the kernel
        CLKernel sum = CL10.clCreateKernel(program, "sum", null);
        // Error checker
        IntBuffer eBuf = BufferUtils.createIntBuffer(1);
        // FloatBuffer for the first input array.
        FloatBuffer aBuf = BufferUtils.createFloatBuffer(SIZE);
        aBuf.put(aData);
        aBuf.rewind();
        // BUG FIX: the kernel only READS a and b, so these buffers must be
        // CL_MEM_READ_ONLY; the original had the read/write flags inverted.
        CLMem aMem = CL10.clCreateBuffer(context, CL10.CL_MEM_READ_ONLY | CL10.CL_MEM_COPY_HOST_PTR, aBuf, eBuf);
        Util.checkCLError(eBuf.get(0));
        // And the second input.
        FloatBuffer bBuf = BufferUtils.createFloatBuffer(SIZE);
        bBuf.put(bData);
        bBuf.rewind();
        CLMem bMem = CL10.clCreateBuffer(context, CL10.CL_MEM_READ_ONLY | CL10.CL_MEM_COPY_HOST_PTR, bBuf, eBuf);
        Util.checkCLError(eBuf.get(0));
        // The kernel only WRITES the result, so it is CL_MEM_WRITE_ONLY.
        // SIZE * 4: four bytes per float.
        CLMem rMem = CL10.clCreateBuffer(context, CL10.CL_MEM_WRITE_ONLY, SIZE * 4, eBuf);
        Util.checkCLError(eBuf.get(0));
        // Timing starts before the kernel arguments are set.
        long time = System.currentTimeMillis();
        sum.setArg(0, aMem);
        sum.setArg(1, bMem);
        sum.setArg(2, rMem);
        sum.setArg(3, SIZE);
        final int dim = 1;
        PointerBuffer workSize = BufferUtils.createPointerBuffer(dim);
        workSize.put(0, SIZE);
        // Enqueue the kernel, then wait for it to finish.
        CL10.clEnqueueNDRangeKernel(queue, sum, dim, null, workSize, null, null, null);
        CL10.clFinish(queue);
        // Read results back into a FloatBuffer (CL_TRUE = blocking read).
        FloatBuffer res = BufferUtils.createFloatBuffer(SIZE);
        CL10.clEnqueueReadBuffer(queue, rMem, CL10.CL_TRUE, 0, res, null, null);
        // Elapsed time from setting kernel arguments until now. NOTE(review):
        // this includes the device->host result transfer, but not the initial
        // uploads (those happened in clCreateBuffer via COPY_HOST_PTR).
        System.out.println("GPU processing time for " + SIZE + " elements: " + (System.currentTimeMillis() - time));
        // Cleanup objects
        CL10.clReleaseKernel(sum);
        CL10.clReleaseProgram(program);
        CL10.clReleaseMemObject(aMem);
        CL10.clReleaseMemObject(bMem);
        CL10.clReleaseMemObject(rMem);
        CLCleanup();
    }

    /**
     * Initialize CL objects (platform, device list, context, command queue).
     * @throws LWJGLException if CL creation fails
     */
    private static void CLInit() throws LWJGLException {
        IntBuffer eBuf = BufferUtils.createIntBuffer(1);
        CL.create();
        platform = CLPlatform.getPlatforms().get(0);
        devices = platform.getDevices(CL10.CL_DEVICE_TYPE_GPU);
        context = CLContext.create(platform, devices, eBuf);
        queue = CL10.clCreateCommandQueue(context, devices.get(0), CL10.CL_QUEUE_PROFILING_ENABLE, eBuf);
        Util.checkCLError(eBuf.get(0));
    }

    /**
     * Cleanup after CL completion.
     */
    private static void CLCleanup(){
        CL10.clReleaseCommandQueue(queue);
        CL10.clReleaseContext(context);
        CL.destroy();
    }
}
Here are a few example console results from various tests:
CPU processing time for 10000000 elements: 24
GPU processing time for 10000000 elements: 88
CPU processing time for 1000000 elements: 7
GPU processing time for 1000000 elements: 10
CPU processing time for 100000000 elements: 193
GPU processing time for 100000000 elements: 943
Is there something wrong with my coding that's causing the CL version to be slower, or is that actually to be expected in cases such as this? If it's the latter, then when is CL preferable?
I revised the test to do something which I believe is more computationally expensive than simple addition.
Regarding the CPU test, the line:
rData[i] = aData[i] + bData[i];
was changed to:
rData[i] = (float)(Math.sin(aData[i]) * Math.cos(bData[i]));
And in the CL kernel, the line:
result[itemId] = a[itemId] + b[itemId];
was changed to:
result[itemId] = sin(a[itemId]) * cos(b[itemId]);
I'm now getting console results such as:
CPU processing time for 1000000 elements: 154
GPU processing time for 1000000 elements: 11
CPU processing time for 10000000 elements: 8699
GPU processing time for 10000000 elements: 98
(The CPU is taking longer than I'd like to bother with for tests of 100000000 elements.)
For checking accuracy, I added checks that compare an arbitrary element of rData and res to ensure they're the same. I omitted the result here, as it should suffice to say that they were equal.
Now that the function is more complicated(two trigonometric functions being multiplied together), it appears that the CL kernel is much more efficient than the pure Java loop.

Why is System.nanoTime not accurate?

import java.math.BigDecimal;
public class testtest {
    public static final BigDecimal TWO = new BigDecimal(2);
    // Number of decimal digits of precision to demand by default.
    public static final int digits = 1000;
    // Default stopping threshold: |a*a - n| < 10^-digits.
    public static final BigDecimal TOLERANCE = BigDecimal.ONE.scaleByPowerOfTen(-digits);
    // NOTE(review): unused field, kept only for source compatibility.
    public static double MidpointMethod = 0;

    /**
     * Approximates sqrt(n) by bisection down to the default TOLERANCE and
     * returns the elapsed wall-clock time.
     *
     * @param n value whose square root is approximated
     * @return elapsed time of the bisection loop in nanoseconds
     */
    public static long MidpointMethod(int n) {
        return MidpointMethod(n, TOLERANCE);
    }

    /**
     * Generalized form: bisects until |a*a - n| drops below the supplied
     * tolerance, timing the loop with System.nanoTime().
     *
     * @param n         value whose square root is approximated
     * @param tolerance convergence threshold for |a*a - n|
     * @return elapsed time of the bisection loop in nanoseconds
     */
    public static long MidpointMethod(int n, BigDecimal tolerance) {
        BigDecimal k = new BigDecimal(n);
        BigDecimal a = BigDecimal.ONE; // lower estimate (original comment said "two" in error)
        BigDecimal b = k;              // upper estimate
        long start = System.nanoTime(); // start the timer
        while (a.multiply(a).subtract(k).abs().compareTo(tolerance) >= 0) { // while a*a isn't close enough to k
            // Move whichever endpoint is farther from sqrt(k) to the midpoint.
            // Dividing by TWO is always exact for BigDecimal, so no rounding mode is needed.
            if (a.multiply(a).subtract(k).abs().compareTo(b.multiply(b).subtract(k).abs()) > 0)
                a = a.add(b).divide(TWO); // set a to be the average of a and b
            else
                b = a.add(b).divide(TWO); // set b to be the average of a and b
        }
        return System.nanoTime() - start; // return the time taken
    }

    public static void main(String[] args) {
        // BUG FIX: nanoseconds -> milliseconds divides by 1e6, not 10e6.
        // (10e6 is 1e7, which under-reported the elapsed time tenfold --
        // this is exactly why "6 seconds" printed for a ~60 second run.)
        System.out.println(MidpointMethod(2) / 1e6);
    }
}
This program outputs 6224.5209, but when I ran it it took way, way over 20 seconds to run. Why does it display 6 seconds when it actually took more than 20 seconds?
is the 6 seconds an accurate and precise measure of how long the program took?
To convert nanoseconds to milliseconds (which I assume you were trying), divide by 1e6, not 10e6. You are off by a factor of 10.
System.nanoTime() is fully accurate, given that you are working on a decent PC, which I'll assume you are. The problem is that any kind of initialisation before you call the method for the first time -- including JVM start-up, stack set-up, heap set-up, and the BigDecimal initialisation -- takes some time. Also, if you are using a lot of your RAM and it is almost full, that start-up time can grow even more.

StackOverflowError using Recursion

I'm supposed to be comparing a Recursive and a Non-Recursive function to see which one is quicker for a class project. The professor also wants us to time the iterations in milliseconds when the iterator is equal to 10,100,1000, etc. I got it all to work but was having loads of trouble in C++ getting the timer, so I switched to Java as it's much much easier to get millisecond output.
But now when I try to use any number over 8,000 I get a big fat stack overflow error from the Recursive algorithm. Can anyone give me any insight?
Bonus: I also can't figure out how to do the timer in the Recursive function like I did in the Non-Recursive. How would I approach this?
public class comparingTimes {
    public static void main(String[] args) {
        double num = 10000;
        double result;
        nonRec(num);
        result = rec(num);
        System.out.printf("Rec %.0f",(result));
    }

    /**
     * Sums i*(i+1) for i = 1 .. num-1, printing the running elapsed time
     * whenever i reaches a power of ten (10, 100, 1000, ...).
     *
     * @param num exclusive upper bound of the summation index
     */
    public static void nonRec(double num)
    {
        double resultNum = 1; // next power-of-ten checkpoint
        double total = 0;
        long startTime = System.currentTimeMillis();
        long endTime;
        for (double i = 1; i < num; i++)
        {
            total += i * (i+1);
            if (i == resultNum)
            {
                endTime = System.currentTimeMillis();
                System.out.printf("Total execution time: %f seconds - num = %.0f%n", (endTime - startTime)/1000.0, i);
                resultNum *= 10; // advance to the next checkpoint
            }
        }
        System.out.printf("NonRec: %.0f%n", total);
    }

    /**
     * Computes the sum of i*(i-1) for i = 1 .. num, which is algebraically
     * equal to nonRec's sum of i*(i+1) for i = 1 .. num-1.
     *
     * BUG FIX: the original recursed once per decrement and threw
     * StackOverflowError for num above roughly 8000 (default thread stack);
     * this iterative form returns identical values with O(1) stack. As a
     * bonus it also terminates for non-integer or negative num, where the
     * original recursion never reached exactly 0.
     *
     * @param num last term of the summation; values below 1 yield 0
     * @return the summed total
     */
    public static double rec(double num)
    {
        double total = 0;
        // Ascending order matches the evaluation order of the original
        // recursion (deepest call added first).
        for (double i = 1; i <= num; i++)
        {
            total += i * (i - 1);
        }
        return total;
    }
}
The ideal use case for recursion is when you reduce the "search space" massively on each recursion level. For example, consider a binary search where each recursion level halves the remaining search space.
Your particular problem is that you're trying to do 8000 levels of recursion since each level simply decrements the value. That's going to require a fairly large chunk of stack space.
You can look into increasing the stack size for your JVM with the -Xss option (some older implementations used -ss or -oss). But that will only buy you so much.
In terms of timing the whole recursive operation, I would simply store the time before the top-level call in main(), then compare that to the time after that top-level call returns, something like:
long startTime = System.currentTimeMillis();
result = rec(num);
long endTime = System.currentTimeMillis();
// Now calculate the elapsed time.
There's no need to try and do it within the recursive call itself.
If you want to do it at certain points within the recursive call, you can initialise a "global" counter variable (one outside the recursion itself, such as a class-level static variable) to 0 and have the recursive function increment it for every recursion level.
Then have it output the time deltas at the points you're interested in, such as when the variable is set to 10, 100, 1000 and so on.
Try increasing the stack size.
As for measuring time
public static void main(String[] args) {
double num = 10000;
double result;
long start = System.currentTimeMillis();
nonRec(num);
long finish = System.currentTimeMillis();
System.out.println("Time taken (non-recursive): " + (finish -start));
start = System.currentTimeMillis();
result = rec(num);
finish = System.currentTimeMillis();
System.out.println("Time taken (recursive): " + (finish -start));
System.out.printf("Rec %.0f",(result));
}

Profiling java code that calls Runtime.freeMemory()

I have some code that profiles Runtime.freeMemory. Here is my code:
package misc;
import java.util.ArrayList;
import java.util.Random;
public class FreeMemoryTest {
    // Grows during the workouts so each iteration does real allocation work.
    private final ArrayList<Double> l;
    private final Random r;

    /** Creates an empty list and a RNG for the workout methods. */
    public FreeMemoryTest(){
        this.r = new Random();
        this.l = new ArrayList<Double>();
    }

    /**
     * Reports whether free heap has fallen below 5% of the total heap.
     *
     * @return true when less than 5% of the current heap remains free
     */
    public static boolean memoryCheck() {
        Runtime rt = Runtime.getRuntime();
        double freeMem = rt.freeMemory();
        double totalMem = rt.totalMemory();
        double fivePercentOfTotal = totalMem * 0.05;
        return fivePercentOfTotal > freeMem;
    }

    /** Adds {@code max} random doubles, probing memoryCheck() on every add. */
    public void freeMemWorkout(int max){
        int i = 0;
        while (i < max) {
            memoryCheck();
            l.add(r.nextDouble());
            i++;
        }
    }

    /** Adds {@code max} random doubles without any memory probing. */
    public void workout(int max){
        int i = 0;
        while (i < max) {
            l.add(r.nextDouble());
            i++;
        }
    }

    /**
     * Entry point: args[0] selects the variant ("f" probes memory on every
     * iteration), args[1] is the iteration count. Prints elapsed milliseconds.
     */
    public static void main(String[] args){
        FreeMemoryTest f = new FreeMemoryTest();
        int count = Integer.parseInt(args[1]);
        long startTime = System.currentTimeMillis();
        if (args[0].equals("f")) {
            f.freeMemWorkout(count);
        } else {
            f.workout(count);
        }
        long elapsed = System.currentTimeMillis() - startTime;
        System.out.println(elapsed);
    }
}
When I run the profiler using -Xrunhprof:cpu=samples, the vast majority of the calls are to the Runtime.freeMemory(), like this:
CPU SAMPLES BEGIN (total = 531) Fri Dec 7 00:17:20 2012
rank self accum count trace method
1 83.62% 83.62% 444 300274 java.lang.Runtime.freeMemory
2 9.04% 92.66% 48 300276 java.lang.Runtime.totalMemory
When I run the profiler using -Xrunhprof:cpu=time, I don't see any of the calls to Runtime.freeMemory at all, and the top five calls are as follows:
CPU TIME (ms) BEGIN (total = 10042) Fri Dec 7 00:29:51 2012
rank self accum count trace method
1 13.39% 13.39% 200000 307547 java.util.Random.next
2 9.69% 23.08% 1 307852 misc.FreeMemoryTest.freeMemWorkout
3 7.41% 30.49% 100000 307544 misc.FreeMemoryTest.memoryCheck
4 7.39% 37.88% 100000 307548 java.util.Random.nextDouble
5 4.35% 42.23% 100000 307561 java.util.ArrayList.add
These two profiles are so different from one another. I thought that samples was supposed to at least roughly approximate the results from the times, but here we see a very radical difference, something that consumes more than 80% of the samples doesn't even appear in the times profile. This does not make any sense to me, does anyone know why this is happening?
More on this:
$ java -Xmx1000m -Xms1000m -jar memtest.jar a 20000000 5524
//does not have the calls to Runtime.freeMemory()
$ java -Xmx1000m -Xms1000m -jar memtest.jar f 20000000 9442
//has the calls to Runtime.freeMemory()
Running with freemem requires approximately twice the amount of time as running without it. If 80% of the CPU time is spent in java.Runtime.freeMemory(), and I remove that call, I would expect the program to speed up by a factor of approximately 5. As we can see above, the program speeds up by a factor of approximately 2.
A slowdown of a factor of 5 is way worse than a slowdown of a factor of 2 that was observed empirically, so what I do not understand is how the sampling profiler is so far off from reality.
The Runtime freeMemory() and totalMemory() are native calls.
See http://www.docjar.com/html/api/java/lang/Runtime.java.html
The timer cannot time them, but the sampler can.

Attempting to create a stable game engine loop

I'm writing a fairly simple 2D multiplayer-over-network game. Right now, I find it nearly impossible for myself to create a stable loop. By stable I mean such kind of loop inside which certain calculations are done and which is repeated over strict periods of time (let's say, every 25 ms, that's what I'm fighting for right now). I haven't faced many severe hindrances this far except for this one.
In this game, several threads are running, both in server and client applications, assigned to various tasks. Let's take for example engine thread in my server application. In this thread, I try to create game loop using Thread.sleep, trying to take in account time taken by game calculations. Here's my loop, placed within run() method. Tick() function is payload of the loop. It simply contains ordered calls to other methods doing constant game updating.
long engFPS = 40;
long frameDur = 1000 / engFPS;
long lastFrameTime;
long nextFrame;
<...>
while(true)
{
lastFrameTime = System.currentTimeMillis();
nextFrame = lastFrameTime + frameDur;
Tick();
if(nextFrame - System.currentTimeMillis() > 0)
{
try
{
Thread.sleep(nextFrame - System.currentTimeMillis());
}
catch(Exception e)
{
System.err.println("TSEngine :: run :: " + e);
}
}
}
The major problem is that Thread.sleep just loves to betray your expectations about how much it will sleep. It can easily put thread to rest for much longer or much shorter time, especially on some machines with Windows XP (I've tested it myself, WinXP gives really nasty results compared to Win7 and other OS). I've poked around internets quite a lot, and result was disappointing. It seems to be fault of the thread scheduler of the OS we're running on, and its so-called granularity. As far as I understood, this scheduler constantly, over certain amount of time, checks demands of every thread in system, in particular, puts/awakes them from sleep. When re-checking time is low, like 1ms, things may seem smooth. Although, it is said that WinXP has granularity as high as 10 or 15 ms. I've also read that not only Java programmers, but those using other languages face this problem as well.
Knowing this, it seems almost impossible to make a stable, sturdy, reliable game engine. Nevertheless, they're everywhere.
I'm highly wondering by which means this problem can be fought or circumvented. Could someone more experienced give me a hint on this?
Don't rely on the OS or any timer mechanism to wake your thread or invoke some callback at a precise point in time or after a precise delay. It's just not going to happen.
The way to deal with this is instead of setting a sleep/callback/poll interval and then assuming that the interval is kept with a high degree of precision, keep track of the amount of time that has elapsed since the previous iteration and use that to determine what the current state should be. Pass this amount through to anything that updates state based upon the current "frame" (really you should design your engine in a way that the internal components don't know or care about anything as concrete as a frame; so that instead there is just state that moves fluidly through time, and when a new frame needs to be sent for rendering a snapshot of this state is used).
So for example, you might do:
long maxWorkingTimePerFrame = 1000 / FRAMES_PER_SECOND; //this is optional
lastStartTime = System.currentTimeMillis();
while(true)
{
long elapsedTime = System.currentTimeMillis() - lastStartTime;
lastStartTime = System.currentTimeMillis();
Tick(elapsedTime);
//enforcing a maximum framerate here is optional...you don't need to sleep the thread
long processingTimeForCurrentFrame = System.currentTimeMillis() - lastStartTime;
if(processingTimeForCurrentFrame < maxWorkingTimePerFrame)
{
try
{
Thread.sleep(maxWorkingTimePerFrame - processingTimeForCurrentFrame);
}
catch(Exception e)
{
System.err.println("TSEngine :: run :: " + e);
}
}
}
Also note that you can get greater timer granularity by using System.nanoTime() in place of System.currentTimeMillis().
You may getter better results with
LockSupport.parkNanos(long nanos)
altho it complicates the code a bit compared to sleep()
maybe this helps you.
It's from David Brackeen's book Developing Games in Java
and averages the recent elapsed-time samples to fake a more fluent framerate:
link
public class TimeSmoothie {
    /** How often (ms) to recalculate the frame rate in getFrameRate(). */
    protected static final long FRAME_RATE_RECALC_PERIOD = 500;
    /** Don't allow the elapsed time between frames to be more than 100 ms. */
    protected static final long MAX_ELAPSED_TIME = 100;
    /** Average over roughly the most recent 100 ms worth of samples. */
    protected static final long AVERAGE_PERIOD = 100;
    protected static final int NUM_SAMPLES_BITS = 6; // 64 samples
    protected static final int NUM_SAMPLES = 1 << NUM_SAMPLES_BITS;
    protected static final int NUM_SAMPLES_MASK = NUM_SAMPLES - 1;
    // Circular buffer of recent elapsed-time samples.
    protected long[] samples;
    protected int numSamples = 0;
    protected int firstIndex = 0;
    // For calculating the frame rate.
    protected int numFrames = 0;
    protected long startTime;
    protected float frameRate;

    public TimeSmoothie() {
        samples = new long[NUM_SAMPLES];
    }

    /**
     * Adds the specified time sample and returns the average of the
     * recorded time samples.
     *
     * @param elapsedTime elapsed milliseconds since the previous frame
     * @return smoothed elapsed time in milliseconds
     */
    public long getTime(long elapsedTime) {
        addSample(elapsedTime);
        return getAverage();
    }

    /**
     * Adds a time sample to the ring buffer, capping it at MAX_ELAPSED_TIME
     * and evicting the oldest sample once the buffer is full.
     *
     * @param elapsedTime elapsed milliseconds since the previous frame
     */
    public void addSample(long elapsedTime) {
        numFrames++;
        // Cap the time so a single long stall can't skew the average.
        elapsedTime = Math.min(elapsedTime, MAX_ELAPSED_TIME);
        // Append; the mask wraps the index around the ring buffer.
        samples[(firstIndex + numSamples) & NUM_SAMPLES_MASK] =
            elapsedTime;
        if (numSamples == samples.length) {
            // Buffer full: drop the oldest sample.
            firstIndex = (firstIndex + 1) & NUM_SAMPLES_MASK;
        }
        else {
            numSamples++;
        }
    }

    /**
     * Gets the average of the recorded time samples, preferring only the
     * newest samples that together span AVERAGE_PERIOD milliseconds.
     *
     * @return rounded average sample in milliseconds (0 if no samples)
     */
    public long getAverage() {
        long sum = 0;
        // Walk from the newest sample backwards.
        for (int i = numSamples - 1; i >= 0; i--) {
            sum += samples[(firstIndex + i) & NUM_SAMPLES_MASK];
            // Once the newest samples cover the averaging window, average
            // just those samples.
            if (sum >= AVERAGE_PERIOD) {
                // BUG FIX: the original computed this value and discarded it
                // (missing `return`), so the early-exit path never took
                // effect and the method always averaged the whole buffer.
                return Math.round((double) sum / (numSamples - i));
            }
        }
        return Math.round((double) sum / numSamples);
    }

    /**
     * Gets the frame rate (number of calls to getTime() or addSample() in
     * real time). The frame rate is recalculated every 500 ms.
     *
     * @return the most recently computed frames-per-second value
     */
    public float getFrameRate() {
        long currTime = System.currentTimeMillis();
        // Recalculate the frame rate every FRAME_RATE_RECALC_PERIOD ms.
        // NOTE(review): startTime is 0 until the first recalc, so the first
        // computed rate is effectively 0 -- confirm that is acceptable.
        if (currTime > startTime + FRAME_RATE_RECALC_PERIOD) {
            frameRate = (float)numFrames * 1000 /
                (currTime - startTime);
            startTime = currTime;
            numFrames = 0;
        }
        return frameRate;
    }
}

Categories