OpenCL kernel slower than normal Java loop

I've been looking into OpenCL for optimizing code and running tasks in parallel to achieve greater speed than pure Java. Now I'm having a bit of an issue.
I've put together a Java program using LWJGL which, as far as I can tell, should be able to do nearly identical tasks -- in this case adding elements from two arrays together and storing the result in another array -- two separate ways: one with pure Java, and the other with an OpenCL kernel. I'm using System.currentTimeMillis() to keep track of how long each one takes for arrays with a large number of elements (~10,000,000). For whatever reason, the pure Java loop seems to execute around 3 to 10 times faster than the CL program, depending on array size. My code is as follows (imports omitted):
public class TestCL {

    private static final int SIZE = 9999999; //Size of the arrays to test; this value is changed between tests
    private static CLContext context; //CL context
    private static CLPlatform platform; //CL platform
    private static List<CLDevice> devices; //List of CL devices
    private static CLCommandQueue queue; //Command queue for the context
    private static float[] aData, bData, rData; //float arrays to store test data

    //---Kernel Code---
    //The actual kernel script is here:
    //-----------------
    private static String kernel =
            "kernel void sum(global const float* a, global const float* b, global float* result, int const size){\n" +
            "const int itemId = get_global_id(0);\n" +
            "if(itemId < size){\n" +
            "result[itemId] = a[itemId] + b[itemId];\n" +
            "}\n" +
            "}";

    public static void main(String[] args){
        aData = new float[SIZE];
        bData = new float[SIZE];
        rData = new float[SIZE]; //Only used for CPU testing

        //Arbitrary testing data
        for(int i = 0; i < SIZE; i++){
            aData[i] = i;
            bData[i] = SIZE - i;
        }

        try {
            testCPU(); //How long does it take running in traditional Java code on the CPU?
            testGPU(); //How long does the GPU take to run it with CL?
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Tests the CPU with pure Java code.
     */
    private static void testCPU(){
        long time = System.currentTimeMillis();
        for(int i = 0; i < SIZE; i++){
            rData[i] = aData[i] + bData[i];
        }
        //Print the time FROM THE START OF THE testCPU() FUNCTION UNTIL NOW
        System.out.println("CPU processing time for " + SIZE + " elements: " + (System.currentTimeMillis() - time));
    }

    /**
     * Tests the GPU with OpenCL.
     * @throws LWJGLException
     */
    private static void testGPU() throws LWJGLException {
        CLInit(); //Initialize CL and CL objects

        //Create the CL program
        CLProgram program = CL10.clCreateProgramWithSource(context, kernel, null);
        int error = CL10.clBuildProgram(program, devices.get(0), "", null);
        Util.checkCLError(error);

        //Create the kernel
        CLKernel sum = CL10.clCreateKernel(program, "sum", null);

        //Error checker
        IntBuffer eBuf = BufferUtils.createIntBuffer(1);

        //FloatBuffer for the first array of floats
        FloatBuffer aBuf = BufferUtils.createFloatBuffer(SIZE);
        aBuf.put(aData);
        aBuf.rewind();
        //The kernel only reads a and b, so CL_MEM_READ_ONLY is the matching flag here
        CLMem aMem = CL10.clCreateBuffer(context, CL10.CL_MEM_READ_ONLY | CL10.CL_MEM_COPY_HOST_PTR, aBuf, eBuf);
        Util.checkCLError(eBuf.get(0));

        //And the second
        FloatBuffer bBuf = BufferUtils.createFloatBuffer(SIZE);
        bBuf.put(bData);
        bBuf.rewind();
        CLMem bMem = CL10.clCreateBuffer(context, CL10.CL_MEM_READ_ONLY | CL10.CL_MEM_COPY_HOST_PTR, bBuf, eBuf);
        Util.checkCLError(eBuf.get(0));

        //Memory object to store the result; the kernel only writes to it
        CLMem rMem = CL10.clCreateBuffer(context, CL10.CL_MEM_WRITE_ONLY, SIZE * 4, eBuf);
        Util.checkCLError(eBuf.get(0));

        //Get the time before setting kernel arguments
        long time = System.currentTimeMillis();

        sum.setArg(0, aMem);
        sum.setArg(1, bMem);
        sum.setArg(2, rMem);
        sum.setArg(3, SIZE);

        final int dim = 1;
        PointerBuffer workSize = BufferUtils.createPointerBuffer(dim);
        workSize.put(0, SIZE);

        //Actually run the program
        CL10.clEnqueueNDRangeKernel(queue, sum, dim, null, workSize, null, null, null);
        CL10.clFinish(queue);

        //Read the results back into a FloatBuffer
        FloatBuffer res = BufferUtils.createFloatBuffer(SIZE);
        CL10.clEnqueueReadBuffer(queue, rMem, CL10.CL_TRUE, 0, res, null, null);

        //How long did it take?
        //Print the time FROM THE SETTING OF KERNEL ARGUMENTS UNTIL NOW
        System.out.println("GPU processing time for " + SIZE + " elements: " + (System.currentTimeMillis() - time));

        //Clean up objects
        CL10.clReleaseKernel(sum);
        CL10.clReleaseProgram(program);
        CL10.clReleaseMemObject(aMem);
        CL10.clReleaseMemObject(bMem);
        CL10.clReleaseMemObject(rMem);
        CLCleanup();
    }

    /**
     * Initializes CL objects.
     * @throws LWJGLException
     */
    private static void CLInit() throws LWJGLException {
        IntBuffer eBuf = BufferUtils.createIntBuffer(1);
        CL.create();
        platform = CLPlatform.getPlatforms().get(0);
        devices = platform.getDevices(CL10.CL_DEVICE_TYPE_GPU);
        context = CLContext.create(platform, devices, eBuf);
        queue = CL10.clCreateCommandQueue(context, devices.get(0), CL10.CL_QUEUE_PROFILING_ENABLE, eBuf);
        Util.checkCLError(eBuf.get(0));
    }

    /**
     * Cleans up after CL completion.
     */
    private static void CLCleanup(){
        CL10.clReleaseCommandQueue(queue);
        CL10.clReleaseContext(context);
        CL.destroy();
    }
}
Here are a few example console results from various tests:
CPU processing time for 10000000 elements: 24
GPU processing time for 10000000 elements: 88
CPU processing time for 1000000 elements: 7
GPU processing time for 1000000 elements: 10
CPU processing time for 100000000 elements: 193
GPU processing time for 100000000 elements: 943
Is there something wrong with my code that's causing the CL version to be slower, or is this actually to be expected in cases such as this? If it's the latter, then when is CL preferable?

I revised the test to do something which I believe is more computationally expensive than simple addition.
Regarding the CPU test, the line:
rData[i] = aData[i] + bData[i];
was changed to:
rData[i] = (float)(Math.sin(aData[i]) * Math.cos(bData[i]));
And in the CL kernel, the line:
result[itemId] = a[itemId] + b[itemId];
was changed to:
result[itemId] = sin(a[itemId]) * cos(b[itemId]);
I'm now getting console results such as:
CPU processing time for 1000000 elements: 154
GPU processing time for 1000000 elements: 11
CPU processing time for 10000000 elements: 8699
GPU processing time for 10000000 elements: 98
(The CPU is taking longer than I'd like to bother with for tests of 100000000 elements.)
For checking accuracy, I added checks that compare an arbitrary element of rData and res to ensure they're the same. I omitted the result here, as it should suffice to say that they were equal.
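A minimal version of such a check looks like this (the index 1234 is an arbitrary choice for illustration, not the one I used):
int i = 1234; //arbitrary index within bounds
if (rData[i] != res.get(i)) {
    System.err.println("Mismatch at " + i + ": " + rData[i] + " vs " + res.get(i));
}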
Now that the function is more complicated (two trigonometric functions multiplied together), it appears that the CL kernel is much more efficient than the pure Java loop.

Related

Why does my Java program's performance drop significantly after startup?

I am writing a small Java application to analyze a large number of image files. For now, it finds the brightest image in a folder by averaging the brightness of every pixel in the image and comparing it to the other images in the folder.
Sometimes, I get a rate of 100+ images/second right after startup, but this almost always drops to < 20 images/second, and I'm not sure why. When it is at 100+ images/sec, the CPU usage is 100%, but then it drops to around 20%, which seems too low.
Here's the main class:
public class ImageAnalysis {

    public static final ConcurrentLinkedQueue<File> queue = new ConcurrentLinkedQueue<>();
    private static final ConcurrentLinkedQueue<ImageResult> results = new ConcurrentLinkedQueue<>();
    private static int size;
    private static AtomicInteger running = new AtomicInteger();
    private static AtomicInteger completed = new AtomicInteger();
    private static long lastPrint = 0;
    private static int completedAtLastPrint;

    public static void main(String[] args){
        File rio = new File(IO.CAPTURES_DIRECTORY.getAbsolutePath() + File.separator + "Rio de Janeiro");
        String month = "12";
        Collections.addAll(queue, rio.listFiles((dir, name) -> {
            return (name.substring(0, 2).equals(month));
        }));
        size = queue.size();
        ExecutorService executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() + 1);
        for (int i = 0; i < 8; i++){
            AnalysisThread t = new AnalysisThread();
            t.setPriority(Thread.MAX_PRIORITY);
            executor.execute(t);
            running.incrementAndGet();
        }
    }

    public synchronized static void finished(){
        if (running.decrementAndGet() <= 0){
            ImageResult max = new ImageResult(null, 0);
            for (ImageResult r : results){
                if (r.averageBrightness > max.averageBrightness){
                    max = r;
                }
            }
            System.out.println("Max Red: " + max.averageBrightness + " File: " + max.file.getAbsolutePath());
        }
    }

    public synchronized static void finishedImage(ImageResult result){
        results.add(result);
        int c = completed.incrementAndGet();
        if (System.currentTimeMillis() - lastPrint > 10000){
            System.out.println("Completed: " + c + " / " + size + " = " + ((double) c / (double) size) * 100 + "%");
            System.out.println("Rate: " + ((double) c - (double) completedAtLastPrint) / 10D + " images / sec");
            completedAtLastPrint = c;
            lastPrint = System.currentTimeMillis();
        }
    }
}
And the thread class:
public class AnalysisThread extends Thread {

    @Override
    public void run() {
        while (!ImageAnalysis.queue.isEmpty()) {
            File f = ImageAnalysis.queue.poll();
            BufferedImage image;
            try {
                image = ImageIO.read(f);
                double color = 0;
                for (int x = 0; x < image.getWidth(); x++) {
                    for (int y = 0; y < image.getHeight(); y++) {
                        //Color c = new Color(image.getRGB(x, y));
                        color += image.getRGB(x, y);
                    }
                }
                color /= (image.getWidth() * image.getHeight());
                ImageAnalysis.finishedImage(new ImageResult(f, color));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        ImageAnalysis.finished();
    }
}
You appear to have mixed up using a thread pool and creating threads of your own. I suggest you use one or the other; in fact, I suggest you only use the fixed thread pool.
Most likely what is happening is that your threads are getting an exception which is being lost, killing the task and with it the thread.
I suggest you just use the thread pool; don't attempt to create your own threads or queue, as that is what the ExecutorService does for you. Submit one task to the pool per image, and if you are not going to check the result of each task, trap all Throwable and log them; otherwise you could get a RuntimeException or Error and have no idea it happened.
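For example, a sketch of that structure (analyse(File) is a stand-in for the per-image work currently in your run() method):
ExecutorService executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() + 1);
for (File f : rio.listFiles()) {
    executor.submit(() -> {
        try {
            finishedImage(analyse(f)); //one task per image
        } catch (Throwable t) {
            t.printStackTrace(); //log it, otherwise failures vanish with the task
        }
    });
}
executor.shutdown();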
If you have Java 8, a simpler approach would be to use parallelStream(). You can use it to analyse the images concurrently and collect the results without having to divide up the work yourself, e.g.
List<ImageResult> results = Stream.of(rio.listFiles())
        .parallel()
        .filter(f -> checkFile(f))
        .map(f -> getResultsFor(f))
        .collect(Collectors.toList());
I see two reasons why you may experience CPU usage deterioration:
your tasks are very I/O intensive (reading images with ImageIO.read(f));
there is thread contention over the synchronized methods that your threads access.
Further, the sizes of the images may influence execution times.
To exploit parallelism efficiently, I would suggest that you redesign your app and implement two kinds of tasks to be submitted to the executor:
the first tasks (producers) would be I/O intensive and would read the image data and queue it for in-memory processing;
the others (consumers) would pull and analyze the image information.
Then, with some profiling, you will be able to determine the correct ratio between producers and consumers.
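A rough sketch of that split, handing decoded images from readers to analyzers through a bounded BlockingQueue (ImageTask is an illustrative holder for a File plus its decoded BufferedImage, not a class from your code):
BlockingQueue<ImageTask> decoded = new ArrayBlockingQueue<>(16); //bounded, so readers can't outrun analyzers
Runnable producer = () -> { //I/O-intensive: read and decode
    File f;
    while ((f = queue.poll()) != null) {
        try {
            decoded.put(new ImageTask(f, ImageIO.read(f)));
        } catch (IOException | InterruptedException e) {
            e.printStackTrace();
        }
    }
};
Runnable consumer = () -> { //CPU-intensive: average the pixels
    try {
        while (true) {
            ImageTask task = decoded.take();
            //...compute the average brightness exactly as in your run() method...
        }
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
};
//Submit, say, two producers and (cores - 2) consumers to the executor, then tune that ratio by profiling.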
The problem I see here is the usage of queues in the high-performance concurrency model you are looking for. Using a queue is not optimal on a modern CPU design. Queue implementations have write contention on the head, tail and size variables. They are either always close to full or close to empty due to differences in pace between consumers and producers, especially in high-I/O situations. This results in high levels of contention. Furthermore, in Java, queues are a significant source of garbage.
What I suggest is to apply Mechanical Sympathy when designing your code. One of the best solutions is the LMAX Disruptor, a high-performance inter-thread messaging library aimed at exactly this concurrency problem.
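For example, a minimal Disruptor (3.x) sketch; ImageEvent and analyse() are placeholders for your own event type and per-image processing:
class ImageEvent { File file; } //mutable, pre-allocated event slot
Disruptor<ImageEvent> disruptor =
        new Disruptor<>(ImageEvent::new, 1024, DaemonThreadFactory.INSTANCE);
disruptor.handleEventsWith((event, sequence, endOfBatch) -> analyse(event.file)); //consumer side
disruptor.start();
RingBuffer<ImageEvent> ring = disruptor.getRingBuffer();
for (File f : files) {
    ring.publishEvent((event, seq, file) -> event.file = file, f); //producer side, no per-message garbage
}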
Additional References
http://lmax-exchange.github.io/disruptor/files/Disruptor-1.0.pdf
http://martinfowler.com/articles/lmax.html
https://dzone.com/articles/mechanical-sympathy
http://www.infoq.com/presentations/mechanical-sympathy

MultiThread runs slower than single process

For an assignment in school, I was asked to create a simple program that creates 1000 text files, each with a random number of lines, counts how many lines there are using multiple threads vs. a single process, then deletes those files.
Now a strange thing happens during testing: linear counting of all files is always a bit faster than counting them in a multi-threaded way, which has sparked quite the academic theorizing session within my classroom circle.
When using Scanner to read all files, everything works as intended: 1000 files are read in around 500 ms linear time and 400 ms threaded time.
Yet when I use BufferedReader, times drop to around 110 ms linear and 130 ms threaded.
Which part of the code causes this bottleneck, and why?
EDIT: Just to clarify, I'm not asking why Scanner works slower than BufferedReader.
The full compilable code (though you should change the file creation path):
import java.io.*;
import java.util.Random;
import java.util.Scanner;

/**
 * Builds text files with a random number of lines and counts them with
 * one process or multi-threading.
 * @author Hazir
 */
// CLASS MATALA_4A START:
public class Matala_4A {

    /* Finals: */
    private static final String MSG = "Hello World";

    /* Privates: */
    private static int count;
    private static Random rand;

    /* Private Methods: */
    /**
     * Draws the next value from the random generator.
     * @return The new random value.
     */
    private static synchronized int getRand() {
        return rand.nextInt(1000);
    }

    /**
     * Increments the lines-read counter by a value.
     * @param val The amount to increment by.
     */
    private static synchronized void incrementCount(int val) {
        count += val;
    }

    /**
     * Sets the lines-read counter to 0 and initializes the random generator
     * with the seed 123.
     */
    private static void Initialize() {
        count = 0;
        rand = new Random(123);
    }

    /* Public Methods: */
    /**
     * Creates n files with a random number of lines.
     * @param n The number of files to be created.
     * @return String array with all the file paths.
     */
    public static String[] createFiles(int n) {
        String[] array = new String[n];
        for (int i = 0; i < n; i++) {
            array[i] = String.format("C:\\Files\\File_%d.txt", i + 1);
            try ( // Try with Resources:
                FileWriter fw = new FileWriter(array[i]);
                PrintWriter pw = new PrintWriter(fw);
            ) {
                int numLines = getRand();
                for (int j = 0; j < numLines; j++) pw.println(MSG);
            } catch (IOException ex) {
                System.err.println(String.format("Failed writing to file: %s",
                        array[i]));
            }
        }
        return array;
    }

    /**
     * Deletes all the files whose file paths are specified
     * in the fileNames array.
     * @param fileNames The files to be deleted.
     */
    public static void deleteFiles(String[] fileNames) {
        for (String fileName : fileNames) {
            File file = new File(fileName);
            if (file.exists()) {
                file.delete();
            }
        }
    }

    /**
     * Creates numFiles files.<br>
     * Counts how many lines are in all the files via multi-threading.<br>
     * Deletes all the files when finished.
     * @param numFiles The number of files to be created.
     */
    public static void countLinesThread(int numFiles) {
        Initialize();
        /* Create files */
        String[] fileNames = createFiles(numFiles);
        Thread[] running = new Thread[numFiles];
        int k = 0;
        long start = System.currentTimeMillis();
        /* Start all threads */
        for (String fileName : fileNames) {
            LineCounter thread = new LineCounter(fileName);
            running[k++] = thread;
            thread.start();
        }
        /* Join all threads */
        for (Thread thread : running) {
            try {
                thread.join();
            } catch (InterruptedException e) {
                // Shouldn't happen.
            }
        }
        long end = System.currentTimeMillis();
        System.out.println(String.format("threads time = %d ms, lines = %d",
                end - start, count));
        /* Delete all files */
        deleteFiles(fileNames);
    }

    /**
     * Creates numFiles files.<br>
     * Counts how many lines are in all the files in one process.<br>
     * Deletes all the files when finished.
     * @param numFiles The number of files to be created.
     */
    @SuppressWarnings("CallToThreadRun")
    public static void countLinesOneProcess(int numFiles) {
        Initialize();
        /* Create files */
        String[] fileNames = createFiles(numFiles);
        /* Iterate files */
        long start = System.currentTimeMillis();
        LineCounter thread;
        for (String fileName : fileNames) {
            thread = new LineCounter(fileName);
            thread.run(); // same process
        }
        long end = System.currentTimeMillis();
        System.out.println(String.format("linear time = %d ms, lines = %d",
                end - start, count));
        /* Delete all files */
        deleteFiles(fileNames);
    }

    public static void main(String[] args) {
        int num = 1000;
        countLinesThread(num);
        countLinesOneProcess(num);
    }

    /**
     * Auxiliary class designed to count the number of lines in a text file.
     */
    // NESTED CLASS LINECOUNTER START:
    private static class LineCounter extends Thread {

        /* Privates: */
        private String fileName;

        /* Constructor: */
        private LineCounter(String fileName) {
            this.fileName = fileName;
        }

        /* Methods: */
        /**
         * Reads a file and counts the number of lines it has.
         */
        @Override
        public void run() {
            int count = 0;
            try ( // Try with Resources:
                FileReader fr = new FileReader(fileName);
                //Scanner sc = new Scanner(fr);
                BufferedReader br = new BufferedReader(fr);
            ) {
                String str;
                for (str = br.readLine(); str != null; str = br.readLine()) count++;
                //for (; sc.hasNext(); sc.nextLine()) count++;
                incrementCount(count);
            } catch (IOException e) {
                System.err.println(String.format("Failed reading from file: %s",
                        fileName));
            }
        }
    } // NESTED CLASS LINECOUNTER END;
} // CLASS MATALA_4A END;
The bottleneck is the disk.
You can access the disk with only one thread at a time, so using multiple threads doesn't help; instead, the extra time needed for thread switching slows your overall performance.
Using multithreading is worthwhile only if you need to split your work across long I/O operations on different sources (for example network and disk, or two different disks, or many network streams), or if you have a CPU-intensive operation that can be split across different cores.
Remember that for a good multithreaded program you always need to take into consideration:
the context-switch time between threads
whether long I/O operations can be done in parallel
whether CPU-intensive computation is present
whether the CPU computation can be split into subproblems
the complexity of sharing data between threads (semaphores or synchronization)
the difficulty of reading, writing and maintaining multithreaded code compared to a single-threaded application
There can be different factors:
Most important is avoiding disk access from multiple threads at the same time (but since you are on an SSD, you may get away with it). On a normal hard disk, however, switching from one file to another could cost you 10 ms of seek time (depending on how the data is cached).
1000 threads is too many; try using the number of cores * 2. Too much time is lost just switching contexts.
Try using a thread pool (see the sketch after this list). Total times are between 110 and 130 ms; part of that will be from creating the threads.
Do some more work in the test in general; a timing of 110 ms isn't always that accurate. It also depends on what other processes or threads are running at the time.
Try switching the order of your tests to see if it makes a difference (caching could be an important factor):
countLinesThread(num);
countLinesOneProcess(num);
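For example, a minimal fixed-pool version of the counting phase (a sketch; LineCounter is the class from the question, and since Thread implements Runnable it can be submitted as a task):
int poolSize = Runtime.getRuntime().availableProcessors() * 2;
ExecutorService pool = Executors.newFixedThreadPool(poolSize);
for (String fileName : fileNames) {
    pool.execute(new LineCounter(fileName)); //runs LineCounter.run() on a pooled thread
}
pool.shutdown();
pool.awaitTermination(1, TimeUnit.MINUTES); //replaces the join() loop; declare or catch InterruptedException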
Also, depending on the system, currentTimeMillis() may have a resolution of 10 to 15 ms, so it isn't very accurate for timing short runs:
long start = System.currentTimeMillis();
long end = System.currentTimeMillis();
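For short runs, System.nanoTime() is the safer choice, e.g.:
long start = System.nanoTime();
// ... the code being timed ...
long elapsedMs = (System.nanoTime() - start) / 1_000_000;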
The number of threads used is very important. A single process trying to switch between 1000 threads (you have created a new thread per file) is probably the main reason for it being slower.
Try using, say, 10 threads to read the 1000 files; then you'll see a noticeable speed increase.
If the actual time needed for the computation is negligible compared to the time needed for I/O, the potential multi-threading benefits are negligible as well: one thread is well able to saturate the I/O and will then do a very quick computation; more threads cannot accelerate things much. Instead, the usual threading overheads will apply, plus possibly a locking penalty in the I/O implementation, actually decreasing throughput.
I think the potential benefits are greatest when the CPU time needed to deal with a data chunk is long compared to the time needed to obtain it from disk. In that case all threads but the currently reading one (if any) can compute, and execution speed should scale nicely with the number of cores. Try checking large prime-number candidates from a file, or cracking encrypted lines (which, kind of, amounts to the same thing, oddly enough).

Runtime freeMemory not decreasing as I use more memory

There's a memory leak in my program, probably at the point where I call certain native functions using JNA. Since I wasn't sure which function was causing it, I decided to test one of them. I created this test:
public class MemoryLeakCheck {

    public static final Runtime runtime = Runtime.getRuntime();
    public static final double mb = 1024 * 1024;

    public static void main(String[] args) throws Exception {
        //Remember the starting memory for the final comparison
        double usedMemoryStart = getMBUsed();
        System.out.println("Starting with " + String.format("%4.1f", usedMemoryStart));
        //This will be updated to keep track of changes during the test
        double lastMemory = usedMemoryStart;
        //Memory change threshold - once the change is greater than this, info will appear in the console
        final double threshold = 10;
        //The original condition polled a UI window (win.isValid()); a fixed iteration count keeps this snippet self-contained
        for (int iteration = 0; iteration < 1000; iteration++) {
            //Run the tested operation
            //something here
            //Do not kill the CPU
            Thread.sleep(200);
            //Calculate memory changes
            double mbnew = getMBUsed();
            double diff = mbnew - lastMemory;
            if (diff >= threshold || diff <= -threshold) {
                System.out.println((diff > 0 ? "+" : "-") + " " + String.format("%3.3f", Math.abs(diff)));
                System.out.println(" = " + String.format("%4.1f", mbnew));
                //Update lastMemory to keep track of the next change
                lastMemory = mbnew;
            }
        }
        //Final change sum
        double mbnew = getMBUsed();
        double diff = mbnew - usedMemoryStart;
        System.out.println("Overall diff: " + String.format("%4.1f", diff));
    }

    /** Will return used memory in MBytes as a double. Calculated from the difference
     * between total and free memory.
     *
     * @return used memory in MBytes.
     */
    public static final double getMBUsed() {
        return (runtime.totalMemory() - runtime.freeMemory()) / mb;
    }
}
I started the loop and went to make coffee. When I came back, the corresponding java.exe instance was using 500 MB of RAM according to Task Manager. But the output of the program looked like this:
Starting with 31,1
- 22,945
= 8,1
+ 10,754
= 18,9
+ 10,254
= 29,2
- 21,284
= 7,9
... repeats a lot ...
+ 10,587
= 52,2
- 10,579
= 41,6
+ 10,587
= 52,2
- 10,579
= 41,6
Overall diff: 15,9
------------------------------------------------------------------------
BUILD SUCCESS
------------------------------------------------------------------------
Total time: 2:46.265s
Finished at: Mon Apr 13 12:20:56 CEST 2015
Final Memory: 6M/106M
------------------------------------------------------------------------
As you can see, neither my results nor the Maven output reflect the 500 MB that Task Manager reported. I'm surprised. Is this memory leak even caused by my program?

Understanding why/how Java's native sorting of an int array is optimized on successive sorts...so I can stop it

My program in a nutshell:
I have a program that successively runs several sort algorithms against an array of ints, timing each. The GUI allows the user to select the array size and a variety of random number ranges with which to fill the array to be sorted. Each click of the "Sort" button grabs the user's array values, constructs a new array, then creates a clone of the array for each sort algorithm, using .clone().
The problem:
when "sort" button is clicked a second time the sorts improve themselves.
Somewhere there's optimization happening that I don't understand.
The reason this is an issue: if the user doesn't change their array settings and runs the sort methods again, it is true that a new array is constructed with new random numbers but the random number ranges remain the same so the run time, over a list of 125k, should remain about the same....not improve 300%.
So here is a demo program of what I am facing. It uses only one sort, Java's native sort to demonstrate the issue. It also uses hard coded values for constructing the random int array to be sorted - but does so with each "enter" press. I think this simulation accurately reflects my program because the same "error" is happening here, too.
Why is the sort faster the second time?
...the array is rebuilt with new values for each run, so how can it get faster?
package sortTooFast;

import java.util.Arrays;
import java.util.Scanner;

public class SortTooFast {

    public static final int ARRAY_SIZE = 500000;
    public static final int MIN_RANGE = 0;
    public static final int MAX_RANGE = 100;
    public static final int INCLUSIVE = 1;

    int[] sortingArray;

    public static void main(String[] args) {
        SortTooFast test = new SortTooFast();
        test.run();
    }

    // Run program.
    public void run() {
        while (true) {
            // Assign int[] filled with random numbers.
            sortingArray = getArray();
            // Inform user.
            System.out.println("\nPress return key to run sort!");
            // Wait for user.
            new Scanner(System.in).nextLine();
            System.out.println("First 15 elements to be sorted:");
            // Print a small section of the array; prove it is not sorted.
            for (int i = 0; i < 15; i++) {
                System.out.printf("%4d", sortingArray[i]);
            }
            // Perform sort.
            runNativeSort(sortingArray);
        }
    }

    // Run native Java sort.
    private void runNativeSort(int[] array) {
        // Start timer.
        long startTime = System.currentTimeMillis();
        // Perform sort.
        Arrays.sort(array);
        // End timer.
        long finishTime = System.currentTimeMillis();
        // Running time.
        long runTime = finishTime - startTime;
        // Report run time.
        System.out.println("\nRun time: " + runTime);
    }

    // Obtain an array filled with random int values.
    private int[] getArray() {
        // Make int array.
        int[] mArray = new int[ARRAY_SIZE];
        // Length of array.
        int length = mArray.length;
        // Fill array with random numbers.
        for (int counter = 0; counter < length; counter++) {
            int random = MIN_RANGE + (int) (Math.random() * ((MAX_RANGE - MIN_RANGE) + INCLUSIVE));
            mArray[counter] = random;
        }
        return mArray;
    }
}
Why is the sort faster the second time?
Because by that time, the JIT has compiled the bytecode into faster native code.
There are two effects you need to counter when benchmarking this sort of thing:
the time taken to JIT-compile the code in the first place;
the way the native code improves over time as it is optimized harder and harder by the JIT.
Typically you can reduce these effects and reach a steady state by running the code for long enough to get it fully optimized before you start timing.
Additionally, you should use System.nanoTime instead of System.currentTimeMillis when benchmarking: System.currentTimeMillis is meant to give you a reasonably accurate "wall clock" time, which may be adjusted by the operating system if it notices the clock is out of sync, whereas nanoTime is specifically designed for measuring elapsed time since a particular instant, regardless of changes to the system clock.
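For example, a warm-up pattern applied to the demo program above (a sketch; the run count of 20 is arbitrary, and getArray() is the method from the question):
long best = Long.MAX_VALUE;
for (int run = 0; run < 20; run++) {  //earlier runs warm up the JIT
    int[] data = getArray();          //fresh random data each run, as in the question
    long start = System.nanoTime();
    Arrays.sort(data);
    best = Math.min(best, System.nanoTime() - start); //keep the steady-state (best) time
}
System.out.println("Steady-state sort time: " + (best / 1_000_000) + " ms");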

Attempting to create a stable game engine loop

I'm writing a fairly simple 2D multiplayer-over-network game. Right now, I find it nearly impossible to create a stable loop. By stable I mean a loop inside which certain calculations are done and which repeats over strict periods of time (let's say, every 25 ms; that's what I'm aiming for right now). I haven't faced many severe hindrances so far, except for this one.
In this game, several threads are running, in both the server and client applications, assigned to various tasks. Let's take for example the engine thread in my server application. In this thread, I try to create a game loop using Thread.sleep, trying to take into account the time taken by the game calculations. Here's my loop, placed within the run() method. The Tick() function is the payload of the loop; it simply contains ordered calls to other methods that do the constant game updating.
long engFPS = 40;
long frameDur = 1000 / engFPS;
long lastFrameTime;
long nextFrame;
<...>
while (true)
{
    lastFrameTime = System.currentTimeMillis();
    nextFrame = lastFrameTime + frameDur;
    Tick();
    if (nextFrame - System.currentTimeMillis() > 0)
    {
        try
        {
            Thread.sleep(nextFrame - System.currentTimeMillis());
        }
        catch (Exception e)
        {
            System.err.println("TSEngine :: run :: " + e);
        }
    }
}
The major problem is that Thread.sleep just loves to betray your expectations about how long it will sleep. It can easily put a thread to rest for a much longer or much shorter time, especially on some machines with Windows XP (I've tested it myself; WinXP gives really nasty results compared to Win7 and other OSes). I've poked around the internet quite a lot, and the results were disappointing. It seems to be the fault of the thread scheduler of the OS we're running on, and its so-called granularity. As far as I understand, this scheduler constantly, at a certain interval, checks the demands of every thread in the system and, in particular, puts them to sleep and wakes them. When this re-checking interval is low, like 1 ms, things may seem smooth; however, it is said that WinXP has a granularity as high as 10 or 15 ms. I've also read that not only Java programmers, but those using other languages, face this problem as well.
Knowing this, it seems almost impossible to make a stable, sturdy, reliable game engine. Nevertheless, they're everywhere.
I'm highly wondering by which means this problem can be fought or circumvented. Could someone more experienced give me a hint on this?
Don't rely on the OS or any timer mechanism to wake your thread or invoke some callback at a precise point in time or after a precise delay. It's just not going to happen.
The way to deal with this: instead of setting a sleep/callback/poll interval and then assuming that the interval is kept with a high degree of precision, keep track of the amount of time that has elapsed since the previous iteration and use that to determine what the current state should be. Pass this amount through to anything that updates state based upon the current "frame". (Really, you should design your engine so that its internal components don't know or care about anything as concrete as a frame; instead there is just state that moves fluidly through time, and when a new frame needs to be sent for rendering, a snapshot of that state is used.)
So for example, you might do:
long maxWorkingTimePerFrame = 1000 / FRAMES_PER_SECOND; //this is optional
lastStartTime = System.currentTimeMillis();
while (true)
{
    long elapsedTime = System.currentTimeMillis() - lastStartTime;
    lastStartTime = System.currentTimeMillis();
    Tick(elapsedTime);
    //enforcing a maximum framerate here is optional...you don't need to sleep the thread
    long processingTimeForCurrentFrame = System.currentTimeMillis() - lastStartTime;
    if (processingTimeForCurrentFrame < maxWorkingTimePerFrame)
    {
        try
        {
            Thread.sleep(maxWorkingTimePerFrame - processingTimeForCurrentFrame);
        }
        catch (Exception e)
        {
            System.err.println("TSEngine :: run :: " + e);
        }
    }
}
Also note that you can get greater timer granularity by using System.nanoTime() in place of System.currentTimeMillis().
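For example, the same loop measured with nanoTime (a sketch; Tick here still takes milliseconds, so sub-millisecond remainders are truncated):
long last = System.nanoTime();
while (true) {
    long now = System.nanoTime();
    long elapsedNanos = now - last;
    last = now;
    Tick(elapsedNanos / 1_000_000); //or, better, change Tick to accept nanoseconds directly
}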
You may get better results with
LockSupport.parkNanos(long nanos)
although it complicates the code a bit compared to sleep().
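For example, the 25 ms loop from the question rewritten with parkNanos (a sketch; note parkNanos may also return early or late, so keep the elapsed-time handling from the approach above):
long frameNanos = 25_000_000L; //25 ms per frame
long next = System.nanoTime() + frameNanos;
while (true) {
    Tick();
    long remaining = next - System.nanoTime();
    if (remaining > 0) {
        LockSupport.parkNanos(remaining); //no InterruptedException to catch, unlike sleep()
    }
    next += frameNanos;
}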
Maybe this helps you.
It's from David Brackeen's book Developing Games in Java
and calculates an average granularity to fake a more fluent frame rate:
link
public class TimeSmoothie {

    /**
     * How often to recalc the frame rate
     */
    protected static final long FRAME_RATE_RECALC_PERIOD = 500;

    /**
     * Don't allow the elapsed time between frames to be more than 100 ms
     */
    protected static final long MAX_ELAPSED_TIME = 100;

    /**
     * Take the average of the last few samples during the last 100 ms
     */
    protected static final long AVERAGE_PERIOD = 100;

    protected static final int NUM_SAMPLES_BITS = 6; // 64 samples
    protected static final int NUM_SAMPLES = 1 << NUM_SAMPLES_BITS;
    protected static final int NUM_SAMPLES_MASK = NUM_SAMPLES - 1;

    protected long[] samples;
    protected int numSamples = 0;
    protected int firstIndex = 0;

    // for calculating frame rate
    protected int numFrames = 0;
    protected long startTime;
    protected float frameRate;

    public TimeSmoothie() {
        samples = new long[NUM_SAMPLES];
    }

    /**
     * Adds the specified time sample and returns the average
     * of all the recorded time samples.
     */
    public long getTime(long elapsedTime) {
        addSample(elapsedTime);
        return getAverage();
    }

    /**
     * Adds a time sample.
     */
    public void addSample(long elapsedTime) {
        numFrames++;
        // cap the time
        elapsedTime = Math.min(elapsedTime, MAX_ELAPSED_TIME);
        // add the sample to the list
        samples[(firstIndex + numSamples) & NUM_SAMPLES_MASK] = elapsedTime;
        if (numSamples == samples.length) {
            firstIndex = (firstIndex + 1) & NUM_SAMPLES_MASK;
        }
        else {
            numSamples++;
        }
    }

    /**
     * Gets the average of the recorded time samples.
     */
    public long getAverage() {
        long sum = 0;
        for (int i = numSamples - 1; i >= 0; i--) {
            sum += samples[(firstIndex + i) & NUM_SAMPLES_MASK];
            // if the average period is already reached, go ahead and return
            // the average.
            if (sum >= AVERAGE_PERIOD) {
                return Math.round((double) sum / (numSamples - i));
            }
        }
        return Math.round((double) sum / numSamples);
    }

    /**
     * Gets the frame rate (number of calls to getTime() or
     * addSample() in real time). The frame rate is recalculated
     * every 500 ms.
     */
    public float getFrameRate() {
        long currTime = System.currentTimeMillis();
        // calculate the frame rate every 500 milliseconds
        if (currTime > startTime + FRAME_RATE_RECALC_PERIOD) {
            frameRate = (float) numFrames * 1000 / (currTime - startTime);
            startTime = currTime;
            numFrames = 0;
        }
        return frameRate;
    }
}
