I am learning JCuda by studying the JCuda samples.
While working through a KMeans implementation that uses JCuda, I get a "CUDA_ERROR_ILLEGAL_ADDRESS" error when the line cuCtxSynchronize(); is executed.
This confuses me a lot. How can I solve it?
Here is KMeansKernel.cu
/**
 * Element-wise vector addition: sum[i] = a[i] + b[i] for every i in [0, n).
 *
 * Each thread handles the single index derived from its block and thread
 * position; threads whose index is >= n do nothing.
 *
 * @param n   number of elements to process
 * @param a   first input array (at least n floats)
 * @param b   second input array (at least n floats)
 * @param sum output array (at least n floats)
 */
extern "C"
__global__ void add(int n, float *a, float *b, float *sum)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        sum[i] = a[i] + b[i];
    }
}
Main method(my class named "CUDA"):
public static void main(String[] args){
// omit some code which input kinds of parameters
try {
// Open image file
BufferedImage bi = ImageIO.read(picFiles);
if (bi == null) {
System.out.println("ERROR: File input error.");
// Read image data
int length = bi.getWidth() * bi.getHeight();
int[] imageProperty = new int[length*5];
int[] pixel;
int count = 0;
for (int y = 0; y < bi.getHeight(); y++) {
for (int x = 0; x < bi.getWidth(); x++) {
pixel = bi.getRaster().getPixel(x, y, new int[4]);
imageProperty[count*5 ] = pixel[0];
imageProperty[count*5+1] = pixel[1];
imageProperty[count*5+2] = pixel[2];
imageProperty[count*5+3] = x;
imageProperty[count*5+4] = y;
// Create the PTX file
String ptxFileName;
ptxFileName = preparePtxFile("KmeansKernel.cu");
catch (IOException e)
CUdevice device = new CUdevice();
cuDeviceGet(device, 0);
CUcontext context = new CUcontext();
cuCtxCreate(context, 0, device);
CUmodule module = new CUmodule();
cuModuleLoad(module, ptxFileName);
CUfunction kmeansFunction = new CUfunction();
cuModuleGetFunction(kmeansFunction, module, "add");
//copy host input to device
CUdeviceptr imageDevice = new CUdeviceptr();
cuMemAlloc(imageDevice, imageProperty.length * Sizeof.INT);
cuMemcpyHtoD(imageDevice, Pointer.to(imageProperty), imageProperty.length * Sizeof.INT);
int blockSizeX = 256;
int gridSizeX = (int) Math.ceil((double)(imageProperty.length / 5) / blockSizeX);
long et = System.currentTimeMillis();
System.out.println(((double)(et-st)/1000.0) + "s");
for (int k = startClusters; k <= endClusters; k++) {
long startTime = System.currentTimeMillis();
int[] clusters = new int[length];
int[] c = new int[k*5];
int h = 0;
for(int i = 0; i < k; i++) {
c[i*5] = imageProperty[h*5];
c[i*5+1] = imageProperty[h*5+1];
c[i*5+2] = imageProperty[h*5+2];
c[i*5+3] = imageProperty[h*5+3];
c[i*5+4] = imageProperty[h*5+4];
h += length / k;
double tolerance = 1e-4;
**//got warning in following line
CUDA.KmeansKernel(kmeansFunction, imageDevice, imageProperty, clusters, c, k, tolerance, distanceWeight, colorWeight, blockSizeX, gridSizeX);**
int[] output = calculateAveragePixels(imageProperty, clusters);
BufferedImage outputImage = new BufferedImage(bi.getWidth(), bi.getHeight(), BufferedImage.TYPE_INT_RGB);
for (int i = 0; i < length; i++) {
int rgb = output[i*5];
rgb = (rgb * 256) + output[i*5+1];
rgb = (rgb * 256) + output[i*5+2];
outputImage.setRGB(i%bi.getWidth(), i/bi.getWidth(), rgb);
String fileName = (picFiles.getName()) + ".bmp";
File outputFile = new File("output/" + fileName);
ImageIO.write(outputImage, "BMP", outputFile);
long runTime = System.currentTimeMillis() - startTime;
System.out.println("Completed iteration k=" + k + " in " + ((double)runTime/1000.0) + "s");
System.out.println("Files saved to " + outputDirectory.getAbsolutePath() + "\\");
} catch (IOException e) {
Method KmeansKernel:
private static void KmeansKernel(CUfunction kmeansFunction, CUdeviceptr imageDevice, int[] imageProperty, int[] clusters, int[] c,
int k, double tolerance, double distanceWeight, double colorWeight,
int blockSizeX, int gridSizeX) {
CUdeviceptr clustersDevice = new CUdeviceptr();
cuMemAlloc(clustersDevice, clusters.length * Sizeof.INT);
// Alloc device output
CUdeviceptr centroidPixels = new CUdeviceptr();
cuMemAlloc(centroidPixels, k * 5 * Sizeof.INT);
CUdeviceptr errorDevice = new CUdeviceptr();
cuMemAlloc(errorDevice, Sizeof.DOUBLE * clusters.length);
int[] c1 = new int[k*5];
cuMemcpyHtoD(centroidPixels, Pointer.to(c), Sizeof.INT * 5 * k);
// begin algorithm
int[] counts = new int[k];
double old_error, error = Double.MAX_VALUE;
int l = 0;
do {
old_error = error;
error = 0;
Arrays.fill(counts, 0);
Arrays.fill(c1, 0);
cuMemcpyHtoD(centroidPixels, Pointer.to(c), k * 5 * Sizeof.INT);
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[] {clusters.length}),
Pointer.to(new int[] {k}),
Pointer.to(new double[] {colorWeight}),
Pointer.to(new double[] {distanceWeight}),
gridSizeX, 1, 1,
blockSizeX, 1, 1,
0, null,
kernelParameters, null
**cuCtxSynchronize(); //got warning here.why?**
cuMemcpyDtoH(Pointer.to(clusters), clustersDevice, Sizeof.INT*clusters.length);
for (int i = 0; i < clusters.length; i++) {
int cluster = clusters[i];
c1[cluster*5] += imageProperty[i*5];
c1[cluster*5+1] += imageProperty[i*5+1];
c1[cluster*5+2] += imageProperty[i*5+2];
c1[cluster*5+3] += imageProperty[i*5+3];
c1[cluster*5+4] += imageProperty[i*5+4];
for (int i = 0; i < k; i++) {
if (counts[i] > 0) {
c[i*5] = c1[i*5] / counts[i];
c[i*5+1] = c1[i*5+1] / counts[i];
c[i*5+2] = c1[i*5+2] / counts[i];
c[i*5+3] = c1[i*5+3] / counts[i];
c[i*5+4] = c1[i*5+4] / counts[i];
} else {
c[i*5] = c1[i*5];
c[i*5+1] = c1[i*5+1];
c[i*5+2] = c1[i*5+2];
c[i*5+3] = c1[i*5+3];
c[i*5+4] = c1[i*5+4];
double[] errors = new double[clusters.length];
cuMemcpyDtoH(Pointer.to(errors), errorDevice, Sizeof.DOUBLE*clusters.length);
error = sumArray(errors);
System.out.println("" + l + " iterations");
} while (Math.abs(old_error - error) > tolerance);
cuMemcpyDtoH(Pointer.to(clusters), clustersDevice, clusters.length * Sizeof.INT);
Stack trace:
Exception in thread "main" jcuda.CudaException: CUDA_ERROR_ILLEGAL_ADDRESS
at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:330)
at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:1938)
at com.test.CUDA.KmeansKernel(CUDA.java:269)
at com.test.CUDA.main(CUDA.java:184)
As @talonmies mentions, the kernelParameters you are passing to the cuLaunchKernel method do not match the signature of the add kernel function.
You get the error at cuCtxSynchronize because the CUDA execution model is asynchronous: cuLaunchKernel returns immediately, and the kernel actually executes on the device later. The cuCtxSynchronize documentation reads:
Note that this function may also return error codes from previous, asynchronous launches.
The second kernelParameters entry is an int (k), whereas the second parameter of the add kernel is a pointer to float; this mismatch is most probably the cause of the illegal address error.
I made a new sketch on openprocessing.org to quickly test some different grade combinations, but whenever I run it the page freezes until Chrome reports that it is unresponsive. My other sketches work just fine; it is only this one.
Here is the sketch:
double a, o, u, k;
int[][] combos;
int[][] a_combos, b_combos;
int failcounter;
/**
 * Runs once at start-up: sets the four base scores, allocates the
 * combination tables and fills `combos` so that draw() has data to evaluate.
 */
void setup() {
  a = 2 + 4;
  o = 4 + 4;
  u = 3 + 4;
  k = 3 + 5;
  combos = new int[10000][4];
  a_combos = new int[10000][4];
  b_combos = new int[10000][4];
  failcounter = 0;
  // Without this call every row of `combos` stays {0, 0, 0, 0}.
  fillCombos();
}
/**
 * Classifies every row of `combos` by the average of the four adjusted
 * scores and prints the rows that reach an A (avg >= 17) or a B
 * (avg >= 13.48), followed by the pass/fail totals.
 *
 * draw() is normally invoked once per frame; re-printing thousands of lines
 * on every frame is what makes the page unresponsive, so the loop is stopped
 * after the first pass.
 */
void draw() {
  noLoop();
  failcounter = 0;

  // a_combos/b_combos are pre-allocated, so a null test cannot tell which
  // rows were actually selected; track membership explicitly instead.
  boolean[] earnsA = new boolean[combos.length];
  boolean[] earnsB = new boolean[combos.length];

  for (int i = 0; i < combos.length; i++) {
    // Index the i-th combination, not the first four rows of the table.
    double atemp = a + combos[i][0];
    double otemp = o + combos[i][1];
    double utemp = u + combos[i][2];
    double ktemp = k + combos[i][3];
    double avg = (atemp + otemp + utemp + ktemp) / 4;
    if (avg >= 17) {
      a_combos[i] = combos[i];
      earnsA[i] = true;
    } else if (avg >= 13.48) {
      b_combos[i] = combos[i];
      earnsB[i] = true;
    } else {
      failcounter++;
    }
  }

  println("Getting an A:");
  for (int i = 0; i < a_combos.length; i++) {
    if (earnsA[i]) println(a_combos[i]);
  }
  println("Getting a B:");
  for (int i = 0; i < b_combos.length; i++) {
    if (earnsB[i]) println(b_combos[i]);
  }
  println("A or B versus C, D, or F:");
  println((combos.length - failcounter) + ", " + failcounter);
}
/**
 * Fills `combos` with successive four-digit combinations (each digit 0-9),
 * counting like an odometer: the last column changes fastest and carries
 * into the column to its left when it reaches 10.
 */
void fillCombos() {
  int q = 0;
  int w = 0;
  int e = 0;
  int r = 0;
  for (int i = 0; i < combos.length; i++) {
    combos[i][0] = q;
    combos[i][1] = w;
    combos[i][2] = e;
    combos[i][3] = r;

    // Advance to the next combination, propagating carries leftwards.
    r++;
    if (r == 10) {
      r = 0;
      e++;
    }
    if (e == 10) {
      e = 0;
      w++;
    }
    if (w == 10) {
      w = 0;
      q++;
    }
  }
}
If I put print statements at several different locations in the code, none of them produce any output. Any insight?
I am trying to apply a Hamming window to a wav file and then compute its FFT. I have already implemented this in Python; how can I do it in Java? I have applied the Hamming step to the given wav file, which returns a ByteArrayOutputStream. How do I now perform an FFT over this ByteArrayOutputStream?
I am new to audio processing. My current code is:
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.TargetDataLine;
import javax.sound.sampled.UnsupportedAudioFileException;
public class AudioFFT {
public static void main(String[] args) throws UnsupportedAudioFileException, IOException {
String wavFilePath="C:\\abc.wav";
ByteArrayOutputStream byteArrayOutputStream=applyHamming(wavFilePath);
byte[] bytesOut=byteArrayOutputStream.toByteArray();
public static ByteArrayOutputStream applyHamming(String filePath)
// TODO Auto-generated method stub
ByteArrayOutputStream outputStream=new ByteArrayOutputStream();
File fileIn = new File(filePath);
AudioInputStream audioInputStream;
try {
audioInputStream = AudioSystem.getAudioInputStream(fileIn);
int bytesPerFrame = audioInputStream.getFormat().getFrameSize();
if (bytesPerFrame == AudioSystem.NOT_SPECIFIED) {
bytesPerFrame = 1;
int numBytes = 1024 * bytesPerFrame;
byte[] audioBytes = new byte[numBytes];
int numBytesRead = 0;
while ((numBytesRead = audioInputStream.read(audioBytes, 0, audioBytes.length)) != -1) {
outputStream.write(audioBytes, 0, numBytesRead);
} catch (UnsupportedAudioFileException | IOException e1) {
// TODO Auto-generated catch block
return outputStream;
private static int BITS_IN_BYTE = 8;
private static AudioInputStream audioInputStream;
private static AudioFormat format;
final static int W = 1024;
public static void getFFT() {
String wavFilePath="C:\\abc.wav";;
File AudioFile = new File(wavFilePath);
ByteArrayOutputStream out = new ByteArrayOutputStream();
BufferedInputStream in;
try {
audioInputStream = AudioSystem.getAudioInputStream(AudioFile);
} catch (UnsupportedAudioFileException e) {
} catch (IOException e) {
format = audioInputStream.getFormat();
DataLine.Info info = new DataLine.Info(TargetDataLine.class, format);
if (!AudioSystem.isLineSupported(info)) {
TargetDataLine line = null;
try {
line = (TargetDataLine) AudioSystem.getLine(info);
} catch (LineUnavailableException ex) {
byte[] data = new byte[W * format.getSampleSizeInBits() / BITS_IN_BYTE];
double[] inbuf = new double[W];
double[] fftbuf = new double[W];
try {
in = new BufferedInputStream(new FileInputStream(AudioFile));
int read;
while ((read = in.read(data)) > 0) {
out.write(data, 0, read);
} catch (FileNotFoundException e) {
} catch (IOException e) {
data = out.toByteArray();
decode(data, inbuf);
fft(inbuf, fftbuf);
public static void decode(byte[] input, double[] output) {
assert input.length == 2 * output.length;
for (int i = 0; i < output.length; i++) {
output[i] = (short) (((0xFF & input[2 * i + 1]) << 8) | (0xFF & input[2 * i]));
output[i] /= Short.MAX_VALUE;
public static void fft(final double[] inputReal, double[] inputImag) {
assert inputReal.length == 2 * inputImag.length;
int n = inputReal.length;
double ld = Math.log(n) / Math.log(2.0);
if (((int) ld) - ld != 0) {
System.out.println("The number of elements is not a power of 2.");
int nu = (int) ld;
int n2 = n / 2;
int nu1 = nu - 1;
double[] xReal = new double[n];
double[] xImag = new double[n];
double tReal, tImag, p, arg, c, s;
double constant;
if (true){
constant = -2 * Math.PI;
for (int i = 0; i < n; i++) {
xReal[i] = inputReal[i];
xImag[i] = inputImag[i];
int k = 0;
for (int l = 1; l <= nu; l++) {
while (k < n) {
for (int i = 1; i <= n2; i++) {
p = bitreverseReference(k >> nu1, nu);
arg = constant * p / n;
c = Math.cos(arg);
s = Math.sin(arg);
tReal = xReal[k + n2] * c + xImag[k + n2] * s;
tImag = xImag[k + n2] * c - xReal[k + n2] * s;
xReal[k + n2] = xReal[k] - tReal;
xImag[k + n2] = xImag[k] - tImag;
xReal[k] += tReal;
xImag[k] += tImag;
k += n2;
k = 0;
n2 /= 2;
k = 0;
int r;
while (k < n) {
r = bitreverseReference(k, nu);
if (r > k) {
tReal = xReal[k];
tImag = xImag[k];
xReal[k] = xReal[r];
xImag[k] = xImag[r];
xReal[r] = tReal;
xImag[r] = tImag;
double[] newArray = new double[xReal.length * 2];
double radice = 1 / Math.sqrt(n);
for (int i = 0; i < newArray.length; i += 2) {
int i2 = i / 2;
newArray[i] = xReal[i2] * radice;
newArray[i + 1] = xImag[i2] * radice;
for (int i = 0; i < newArray.length; i++) {
System.out.println("Array: " + newArray[i]);
private static int bitreverseReference(int j, int nu) {
int j2;
int j1 = j;
int k = 0;
for (int i = 1; i <= nu; i++) {
j2 = j1 / 2;
k = 2 * k + j1 - 2 * j2;
j1 = j2;
return k;
bytesOut contains your wav file data (after modification by the Hamming window function). These data
represent the real part that you should pass to your fft method (inputReal). For the imaginary part,
create an array of the same size as inputReal and fill it with zeros:
//create an array for the imaginary part ( I assume 1024 in length)
double[] imgBytesOut = new double[1024]; //imgBytesOut is not a good name for this
for(int i=0; i<1024;i++)
imgBytesOut[i] = 0;
Now you have everything you need to call fft
fft(bytesOut, imgBytesOut);
Your fft method populates the xReal and xImag arrays, but since you declared them locally you won't be
able to use them after fft has finished (declare them as static fields instead, or return them).
Also, if your file contains, say, 10000 samples and your FFT size is 1024 samples (bytesOut and imgBytesOut are 1024 samples long),
you will have to call fft repeatedly to process the whole file. To get the best results, you should also overlap the windows (e.g. for a 50% overlap and an FFT size of 1024, you'd process samples 1-1024, then 513-1536, then 1025-2048, and so on).
I am trying to determine the core and delta points of a fingerprint. I'm using the Poincaré index method, but I am unable to detect these points successfully and I can't figure out why.
First I divide the image into 15x15 blocks, then I calculate the x and y gradients, which I use to obtain the orientation map. After getting the mean orientation for each block, I apply the Poincaré index method, described in the image below (credits: Handbook of Fingerprint Recognition, Davide Maltoni):
And the code is this:
public static void detectSgPoints(int blkSze, Mat src) {
int windowX = 1;
int windowY = 1;
if (blkSze < src.width()) {
windowX = src.width() - blkSze;
if (blkSze < src.height()) {
windowY = src.height() - blkSze;
Map<Point, Double> map = new HashMap<>();
double[][] avg = new double[src.height() / blkSze][src.width() / blkSze];
int m = 0;
int n = 0;
for (int i = 0; i < windowY; i += blkSze) {
for (int j = 0; j < windowX; j += blkSze) {
Mat block = utils.getROI(src, new Rect(j, i, blkSze, blkSze));
Mat dx = new Mat(new Size(blkSze, blkSze), CvType.CV_64FC1);
Mat dy = new Mat(new Size(blkSze, blkSze), CvType.CV_64FC1);
Imgproc.Sobel(block, dx, CvType.CV_64FC1, 1, 0);
Imgproc.Sobel(block, dy, CvType.CV_64FC1, 0, 1);
Mat orientation = calculateOrientation(dx, dy);
int cpx = j + (blkSze / 2), cpy = i + (blkSze / 2);
avg[m][n] = avgAngle(orientation, false);
if (avg[m][n] < 0) {
avg[m][n] = 360 + avg[m][n];
map.put(new Point(cpx, cpy), avg[m][n]);
n = 0;
for (int mm = 1; mm < avg.length - 1; mm++) {
for (int nn = 1; nn < avg[0].length - 1; nn++) {
int j = nn * blkSze;
int i = mm * blkSze;
double psum = 0;
int cpx = j + (blkSze / 2), cpy = i + (blkSze / 2);
for (int k = 0; k < anglePos2.length - 1; k++) {
double dif = 0.0;
dif = avg[mm + anglePos2[k + 1][0]][nn + anglePos2[k + 1][1]]
- avg[mm + anglePos2[k][0]][nn + anglePos2[k][1]];
System.out.println("adding " + "(" + avg[mm + anglePos2[k +1[0]][nn + anglePos2[k + 1][1]] + "-"
+ avg[mm + anglePos2[k][0]][nn + anglePos2[k][1]] + ") = " + dif + " to " + psum);
psum = psum + dif;
double poincare = psum;
System.out.println("cpx = " + cpx + ", cpy = " + cpy + " poincare = " + poincare);
private static double avgAngle(Mat orientation, boolean toDegrees) {
List<Double> angle = new ArrayList<>();
for (int i = 0; i < orientation.height(); i++) {
for (int j = 0; j < orientation.width(); j++) {
double value = orientation.get(i, j)[0];
value = Math.toDegrees(value);
return getMeanAngle(angle);
public static double getMeanAngle(List<Double> sample) {
double x_component = 0.0;
double y_component = 0.0;
double avg_d, avg_r;
for (double angle_d : sample) {
double angle_r;
angle_r = Math.toRadians(angle_d);
x_component += Math.cos(angle_r);
y_component += Math.sin(angle_r);
x_component /= sample.size();
y_component /= sample.size();
avg_r = Math.atan2(y_component, x_component);
avg_d = Math.toDegrees(avg_r);
return avg_d;
public static Mat calculateOrientation(Mat dx, Mat dy) {
Mat orientation = new Mat(dx.size(), CvType.CV_32F);
for (int i = 0; i < dx.rows(); i++) {
for (int j = 0; j < dx.cols(); j++) {
double valueX = dx.get(i, j)[0];
double valueY = dy.get(i, j)[0];
double result = Math.atan2(valueY, valueX);
orientation.put(i, j, result);
return orientation;
Where is the problem?
I recently learned about Cramer's rule in precalculus and decided to write an algorithm in Java to help me understand it better.
The following code works correctly for my example; however, it does not use any kind of loop, which would let it do the same thing in a much simpler fashion.
Question: Is there a more elegant implementation of Cramer's rule in Java?
I'm thinking of writing a basic determinant method and then doing some column swapping when I need the determinants Dx, Dy, and Dz (for Dx, replace column 1 of the original matrix with column 4, take the determinant, and divide by the original determinant).
Does this sound good?
public static void main(String[] args) {
int[][] matrix = new int[3][3];
matrix[0] = new int[] { 3, 5, -1, -2 };
matrix[1] = new int[] { 1, -4, 2, 13 };
matrix[2] = new int[] { 2, 4, 3, 1 };
int[] r = crame(matrix);
info("x: " + r[0] + ", y: " + r[1] + ", z: " + r[2]);
for(int i = 0; i < matrix.length; i++) {
int[] base = matrix[i];
if(check(base, r, base[3])) {
info("System " + (i+1) + " checks!");
} else {
info("System " + (i+1) + " fails check!");
public static int[] crame(int[][] m) {
int[] result;
if (m.length == 2) {
result = new int[2];
int D = (m[0][0] * m[1][1]) - (m[1][0] * m[0][1]);
int Dx = (m[0][2] * m[1][1]) - (m[1][2] * m[0][1]);
int Dy = (m[0][0] * m[1][2]) - (m[1][0] * m[0][2]);
result[0] = (int) (Dx / D);
result[1] = (int) (Dy / D);
} else if (m.length == 3) {
result = new int[3];
int D = (((m[0][2] * m[1][1] * m[0][2]) + (m[2][1] * m[1][2] * m[0][0]) + (m[2][2]
* m[1][0] * m[0][2])) - ((m[0][0] * m[1][1] * m[2][2])
+ (m[0][1] * m[1][2] * m[0][2]) + (m[0][2] * m[1][0] * m[2][1])));
int Dx = (((m[2][3] * m[1][1] * m[0][2]) + (m[2][1] * m[1][2] * m[0][3]) + (m[2][2]
* m[1][3] * m[0][1])) - ((m[0][3] * m[1][1] * m[2][2])
+ (m[0][1] * m[1][2] * m[2][3]) + (m[0][2] * m[1][3] * m[2][1])));
int Dy = (((m[2][0] * m[1][3] * m[0][2]) + (m[2][3] * m[1][2] * m[0][3]) + (m[2][2]
* m[1][0] * m[0][3])) - ((m[0][0] * m[1][3] * m[2][2])
+ (m[0][3] * m[1][2] * m[2][0]) + (m[0][2] * m[1][0] * m[2][3])));
int Dz = (((m[2][0] * m[1][1] * m[0][3]) + (m[2][1] * m[1][3] * m[0][0]) + (m[2][3]
* m[1][0] * m[0][1])) - ((m[0][0] * m[1][1] * m[2][3])
+ (m[0][1] * m[1][3] * m[2][0]) + (m[0][3] * m[1][0] * m[2][1])));
result[0] = (int) (Dx / D);
result[1] = (int) (Dy / D);
result[2] = (int) (Dz / D);
} else {
return new int[] {};
return result;
public static int product(int[] a, int[] b) {
int p = 0;
int[] fin = new int[(a.length -1)];
for(int x = 0; x < fin.length; x++) {
fin[x] = a[x] * b[x];
for (int f : fin) {
p += f;
return p;
public static boolean check(int[] a, int[] b, int z) {
return product(a, b) == z;
public static void info(String log) {
My question is specifically about algorithms that solve systems of equations using Cramer's rule only: is there one that is more elegant? The function is only designed for square coefficient matrices.
This is not a homework assignment; after high school I will be studying CS, and I've been developing algorithms as preliminary practice.
Thank you for checking this out.
First off, there is one way in which Cramer's rule is perfect: it gives the algebraic solution of a linear system as a rational function of its coefficients.
In practice, however, it has its limits. While it is the ideal formula for a 2x2 system, and still good for a 3x3 system, its performance, if implemented in the straightforward way, deteriorates with each additional dimension.
An almost literal implementation of Cramers rule can be achieved with the Leverrier-Faddeev algorithm a b. It only requires the computation of matrix products and matrix traces, and manipulations of the matrix diagonal. Not only does it compute the determinant of the matrix A (along with the other coefficients of the characteristic polynomial), it also has the adjugate or co-factor matrix A# in its iteration matrix. The interesting fact about this matrix is that it allows to write the solution of A*x=b as (A#*b)/det(A), that is, the entries of A#*b already are the other determinants required by Cramers rule.
Leverrier-Faddeev requires $n^4 + O(n^3)$ operations. The same results can be obtained by the more complicated Samuelson-Berkowitz algorithm, which has one third of that complexity, that is, $n^4/3 + O(n^3)$.
The computation of the determinants required in Cramers rule becomes downright trivial if the system (A|b) is first transformed into triangular form. That can be achieved by Gauß elimination, aka LU decomposition (with pivoting for numerical stability) or the QR decomposition (easiest to debug should be the variant with Givens rotations). The efficient application of Cramers rule is then backward substitution in the triangular system.
Your method sounds good to me at least; however, I just may not be aware of any more efficient methods. The not-fun part may be figuring out how to best implement the determinant-calculating method, as apparently it's not an inexpensive operation.
But once you know that that's working, the rest sounds pretty OK to me. Cache the determinant of the original matrix, substitute in columns, etc.
I figured out exactly how to do this effectively.
The reference I found provides a method for computing determinants seamlessly and mentions matrix decomposition. I have not learned the latter yet, as it's not a high-school-level concept; however, I did some problems using it and it's a solid method.
Final Code:
public static void main(String[] args) {
int[][] matrix = new int[3][3];
matrix[0] = new int[] { 3, 5, -1, -2 };
matrix[1] = new int[] { 1, -4, 2, 13 };
matrix[2] = new int[] { 2, 4, 3, 1 };
int[] r = crame(matrix);
info("x: " + r[0] + ", y: " + r[1] + ", z: " + r[2]);
for (int i = 0; i < matrix.length; i++) {
int[] base = matrix[i];
if (check(base, r, base[3])) {
info("System " + (i + 1) + " checks!");
} else {
info("System " + (i + 1) + " fails check!");
/**
 * Determinant of the leading square part of {@code a}.
 *
 * Only rows and columns 0..a.length-1 are read, so extra columns (such as
 * the right-hand side of an augmented matrix) are ignored. {@code a} itself
 * is not modified. An empty matrix yields 0.
 *
 * @param a matrix with at least a.length columns per row
 * @return the determinant of the a.length x a.length leading block
 */
public static int getDet(int[][] a) {
    int n = a.length - 1;
    if (n < 0)
        return 0;
    int[][][] M = new int[n + 1][][];
    M[n] = a; // the largest level is the input itself

    // Scratch minors: M[i] holds an (i + 1) x (i + 1) working matrix.
    for (int i = 0; i < n; i++)
        M[i] = new int[i + 1][i + 1];

    return getDet(M, n);
}

/**
 * Recursive worker: determinant of the (m + 1) x (m + 1) leading block of
 * M[m], expanded along its last column, using M[m - 1] as scratch space for
 * the minors.
 *
 * @param M stack of working matrices; M[i] must be at least (i + 1) x (i + 1)
 * @param m index of the level to evaluate
 * @return the determinant at level m
 */
public static int getDet(int[][][] M, int m) {
    if (m == 0)
        return M[0][0][0];

    int e = 1;

    // Start with the minor obtained by deleting the last row and column.
    for (int i = 0; i < m; i++)
        for (int j = 0; j < m; j++)
            M[m - 1][i][j] = M[m][i][j];
    int sum = M[m][m][m] * getDet(M, m - 1);

    // Walk upwards: overwriting row i of the minor with row i + 1 of the
    // parent turns "row i + 1 deleted" into "row i deleted".
    for (int i = m - 1; i >= 0; i--) {
        for (int j = 0; j < m; j++)
            M[m - 1][i][j] = M[m][i + 1][j];
        e = -e; // cofactor sign alternates from one row to the next
        sum += e * M[m][i][m] * getDet(M, m - 1);
    }

    return sum;
}
public static int[] crame(int[][] m) {
int[] result;
if (m.length == 2) {
result = new int[m.length];
int D = getDet(m);
for (int i = 0; i < m.length; i++) {
result[i] = getDet(slide(m, i, m.length)) / D;
} else if (m.length == 3) {
result = new int[m.length];
int D = getDet(m);
for (int i = 0; i < m.length; i++) {
result[i] = (getDet(slide(m, i, m.length)) / D);
} else {
return new int[] {};
return result;
public static int[][] slide(int[][] base, int col, int fin) {
int[][] copy = new int[base.length][];
for (int i = 0; i < base.length; i++) {
int[] aMatrix = base[i];
int aLength = aMatrix.length;
copy[i] = new int[aLength];
System.arraycopy(aMatrix, 0, copy[i], 0, aLength);
for (int i = 0; i < base.length; i++) {
copy[i][col] = base[i][fin];
return copy;
public static int product(int[] a, int[] b) {
int p = 0;
int[] fin = new int[(a.length - 1)];
for (int x = 0; x < fin.length; x++) {
fin[x] = a[x] * b[x];
for (int f : fin) {
p += f;
return p;
public static boolean check(int[] a, int[] b, int z) {
return product(a, b) == z;
public static void info(String log) {
I wrote a neural network in Java, and it seemed like a good idea to move the computation to the GPU for performance reasons. The problem I have is that it is too slow. I used JOCL to do so. I don't know whether the kernel is the cause; here is some code:
/**
 * OpenCL C source of "sampleKernel".
 *
 * Work-item gid reads length[0] consecutive weights starting at
 * gid * length[0], accumulates their products with the matching input
 * values, and writes tanh of the sum to dst[gid].
 *
 * The loop body must refer to the kernel parameter `weights`; the former
 * identifier `gewichte` is not declared inside the kernel.
 */
private static String programSource = "__kernel void "
        + "sampleKernel(__constant float *input,"
        + " __global float *weights,"
        + " __constant int *length,"
        + " __global float *dst)" + " {"
        + " __private int gid = get_global_id(0);"
        + " __private int pos = (gid*length[0]);"
        + " __private float tmp = 0;"
        + " __private int l = length[0];" + " dst[gid] = 0;"
        + " for(int i = 0; i < l; i++){"
        + " tmp += weights[pos+i]*input[i];"
        + " }"
        + " dst[gid] = tanh(tmp);" + "}";
Making the weights __constant made the program even slower (maybe it has to keep moving data between global and constant memory because the weights array is too big).
it seems like the most time takes this line:
tmp += gewichte[pos+i]*input[i];
One kernel call represents the computation of one neural-network layer, and for every neuron of the layer one work-item should compute tanh of the dot product of weightsOnThisNeuron and OutputFromAllNeuronsOfPreviousLayer.
I prepare all the kernels in advance and store them, so that they do not have to be prepared again each time I want to execute them.
The only I/O between GPU and CPU happens at the beginning, and at the end when I retrieve the output.
Here is the code where I initialize the network and run the kernels:
public OpenClNetz(float[][][] gew, cl_context context,
cl_command_queue commandQueue) throws Exception {
if (context == null) {
throw new Exception("context == null, Konstruktor schlug fehl");
if (commandQueue == null) {
throw new Exception("commandQueue == null, Konstruktor schlug fehl");
this.layersize = new int[gew.length + 1];
for (int i = 0; i < layersize.length - 1; i++) {
this.layersize[i] = gew[i][0].length;
this.layersize[this.layersize.length - 1] = gew[gew.length - 1].length;
this.context = context;
builded = false;
this.commandQueue = commandQueue;
this.output = new float[layersize[layersize.length - 1]];
gewichte = new cl_mem[layersize.length - 1];
tmp = new cl_mem[layersize.length - 1];
lengths = new cl_mem[layersize.length - 1];
input = new cl_mem();
float[] tmpG;
int[][] tmpL = new int[layersize.length - 1][];
for (int i = 0; i < gewichte.length; i++) {
tmpG = new float[layersize[i] * layersize[i + 1]];
tmpL[i] = new int[1];
tmpL[i][0] = layersize[i];
int n = 0;
for (int j = 0; j < layersize[i + 1]; j++) {
for (int k = 0; k < layersize[i]; k++) {
tmpG[n] = gew[i][j][k];
gewichte[i] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * tmpG.length, Pointer.to(tmpG),
lengths[i] = clCreateBuffer(context, CL_MEM_READ_WRITE
| CL_MEM_COPY_HOST_PTR, Sizeof.cl_int, Pointer.to(tmpL[i]), null);
tmp[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, Sizeof.cl_float
* layersize[i + 1], null, null);
public void setInput(float[] in) {
if (in.length != layersize[0]) {
.println("array Länge entspricht nicht der Inputsize, setInput schlug fehl");
input = clCreateBuffer(context, CL_MEM_READ_WRITE
| CL_MEM_COPY_HOST_PTR, Sizeof.cl_float * layersize[0],
Pointer.to(in), null);
clSetKernelArg(kernel[0], 0, Sizeof.cl_mem, Pointer.to(input));
public void buildProgramm() {
program = clCreateProgramWithSource(context, 1,
new String[] { programSource }, null, null);
clBuildProgram(program, 0, null, null, null, null);
builded = true;
kernel = new cl_kernel[gewichte.length];
kernel[0] = clCreateKernel(program, "sampleKernel", null);
clSetKernelArg(kernel[0], 0, Sizeof.cl_mem, Pointer.to(input));
clSetKernelArg(kernel[0], 1, Sizeof.cl_mem, Pointer.to(gewichte[0]));
clSetKernelArg(kernel[0], 2, Sizeof.cl_mem, Pointer.to(lengths[0]));
clSetKernelArg(kernel[0], 3, Sizeof.cl_mem, Pointer.to(tmp[0]));
for (int i = 1; i < gewichte.length; i++) {
kernel[i] = clCreateKernel(program, "sampleKernel", null);
clSetKernelArg(kernel[i], 0, Sizeof.cl_mem, Pointer.to(tmp[i - 1]));
clSetKernelArg(kernel[i], 1, Sizeof.cl_mem, Pointer.to(gewichte[i]));
clSetKernelArg(kernel[i], 2, Sizeof.cl_mem, Pointer.to(lengths[i]));
clSetKernelArg(kernel[i], 3, Sizeof.cl_mem, Pointer.to(tmp[i]));
public void run() throws Exception {
if (!builded) {
throw new Exception(
"buildProgramm muss zuerst aufgerufen werden, run schlug fehl");
long global_work_size[] = new long[] { layersize[1] };
this.local_work_size = new long[] { 8 };
// Execute the kernel
clEnqueueNDRangeKernel(commandQueue, kernel[0], 1, null,
global_work_size, local_work_size, 0, null, null);
for (int i = 1; i < gewichte.length; i++) {
global_work_size = new long[] { layersize[i + 1] };
// Execute the kernel
clEnqueueNDRangeKernel(commandQueue, kernel[i], 1, null,
global_work_size, local_work_size, 0, null, null);
thats the main:
public class TEST{
public static void main(String args[]) throws Exception
// The platform, device type and device number
// that will be used
final int platformIndex = 0;
final long deviceType = CL_DEVICE_TYPE_DEFAULT;
final int deviceIndex = 0;
// Enable exceptions and subsequently omit error checks in this sample
// Obtain the number of platforms
int numPlatformsArray[] = new int[1];
clGetPlatformIDs(0, null, numPlatformsArray);
int numPlatforms = numPlatformsArray[0];
// Obtain a platform ID
cl_platform_id platforms[] = new cl_platform_id[numPlatforms];
clGetPlatformIDs(platforms.length, platforms, null);
cl_platform_id platform = platforms[platformIndex];
// Initialize the context properties
cl_context_properties contextProperties = new cl_context_properties();
contextProperties.addProperty(CL_CONTEXT_PLATFORM, platform);
// Obtain the number of devices for the platform
int numDevicesArray[] = new int[1];
clGetDeviceIDs(platform, deviceType, 0, null, numDevicesArray);
int numDevices = numDevicesArray[0];
// Obtain a device ID
cl_device_id devices[] = new cl_device_id[numDevices];
clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
cl_device_id device = devices[deviceIndex];
// Create a context for the selected device
cl_context context = clCreateContext(
contextProperties, 1, new cl_device_id[]{device},
null, null, null);
// Create a command-queue for the selected device
cl_command_queue commandQueue =
clCreateCommandQueue(context, device, 0, null);
int[] layersize = {512,512,512};
float[] in = new float[512];
for(int i = 0; i < 512; i++){
in[i] = (float) (Math.random()*1.4 -0.7);
Netz net = new Netz(layersize);
OpenClNetz netz= new OpenClNetz(net.gewichte,context,commandQueue);
double time = System.currentTimeMillis();
for(int i = 0; i < 10000; i++){
System.out.println("time OpenCl: " + (System.currentTimeMillis()-time));
time = System.currentTimeMillis();
for(int i = 0; i < 10000; i++){
System.out.println("time normal: " + (System.currentTimeMillis()-time));
// Release kernel, program, and memory objects
Does anybody have an idea how I can make this faster?
the output is for:
normal (running on CPU) : 6475ms
running on GPU (local worksize = 1) : 19110ms
running on GPU (local worksize = 2) : 11778ms
running on GPU (local worksize = 4) : 8985ms
running on GPU (local worksize = 8) : 6880ms
running on GPU (local worksize = 16) : 8237ms (it becomes slower ?! O.o)
running on GPU (local worksize = 32) : 9298ms (Im kinda new to Jocl)
running on GPU (local worksize = 64) : 10062ms