To understand more about the global/local work sizes used by the OpenCL API call
clEnqueueNDRangeKernel, I wrote a small program in Java using JavaCL,
the nice OpenCL binding from the nativelibs4java project
by Olivier Chafik.
Some more links on NDRange are:
Understanding NDRange
Java program
package com.nativelibs4java.opencl.demos;
import static com.nativelibs4java.opencl.JavaCL.createBestContext;
import java.io.*;
import java.nio.*;
import com.nativelibs4java.opencl.*;
import com.nativelibs4java.util.*;
/* Usage: java [-DGLOBAL=256] [-DLOCAL=4] [-DDEBUG=true] com.nativelibs4java.opencl.demos.NDRange1 (values shown are the defaults) */
/**
 * Demo that runs a one-dimensional NDRange kernel in which every work-item stores the values
 * returned by the OpenCL work-item functions ({@code get_global_id}, {@code get_global_size},
 * {@code get_work_dim}, {@code get_local_id}, {@code get_local_size}, {@code get_group_id},
 * {@code get_num_groups}) into an int buffer, then prints one line per work-item on the host.
 *
 * <p>Configuration is read from system properties: {@code GLOBAL} (global work size, default 256),
 * {@code LOCAL} (local work size, default 4) and {@code DEBUG} (print per-work-item lines,
 * default true).
 */
public class NDRange1 {
    /** Name of the kernel function declared in {@link #src}. */
    private static final String PRG_NAME = "ndrange1";

    /**
     * Number of ints each work-item writes. The kernel below performs exactly this many stores
     * (output[i] .. output[i+7]); keep the two in sync if either is changed.
     */
    private static final int ITEMS = 8;

    /** OpenCL C source of the kernel; each work-item fills its own ITEMS-wide record. */
    private static final String src = "__kernel void "+ PRG_NAME
            + "("
            + " __global int* output \n"
            + " ) \n"
            + "{ \n"
            + " int i = get_global_id(0)*" + ITEMS + "; \n"
            + " output[i] = get_global_id(0); \n"
            + " output[i+1] = get_global_size(0); \n"
            + " output[i+2] = get_work_dim(); \n"
            + " output[i+3] = get_local_id(0); \n"
            + " output[i+4] = get_local_size(0); \n"
            + " output[i+5] = get_group_id(0); \n"
            + " output[i+6] = get_num_groups(0); \n"
            + " output[i+7] = 9999999; \n"
            + "} \n"
            + "\n";

    private static final int G_SIZE = Integer.getInteger("GLOBAL", 256);
    private static final int L_SIZE = Integer.getInteger("LOCAL", 4);
    private static final boolean DEBUG = Boolean.parseBoolean(System.getProperty("DEBUG", "true"));

    /**
     * The buffers hold G_SIZE * CAPACITY_FACTOR ints. Only G_SIZE * ITEMS are needed by the
     * kernel; the rest is a deliberate safety margin kept from the original program.
     */
    private static final int CAPACITY_FACTOR = 128;

    /**
     * Validates the configured sizes, runs the kernel and prints the collected records followed
     * by a one-line timing summary.
     *
     * @param args ignored; configuration comes from system properties
     */
    public static void main(String[] args) {
        // Reject sizes that would make the buffer size negative/overflow or divide by zero in
        // the summary. Whether LOCAL must evenly divide GLOBAL is left to the OpenCL runtime,
        // which reports it as an exception caught below.
        if (G_SIZE <= 0 || L_SIZE <= 0 || G_SIZE > Integer.MAX_VALUE / CAPACITY_FACTOR) {
            System.err.printf("Invalid sizes: GLOBAL=%d LOCAL=%d (both must be positive, GLOBAL at most %d)%n",
                    G_SIZE, L_SIZE, Integer.MAX_VALUE / CAPACITY_FACTOR);
            return;
        }
        final int capacity = G_SIZE * CAPACITY_FACTOR;
        try {
            SetupUtils.failWithDownloadProposalsIfOpenCLNotAvailable();

            IntBuffer hostInit = NIOUtils.directInts(capacity);
            IntBuffer results = NIOUtils.directInts(capacity);

            // Pre-fill with a sentinel so the host can tell which slots the kernel wrote.
            for (int i = 0; i < capacity; i++) {
                hostInit.put(i, Integer.MIN_VALUE);
            }

            long time = buildAndExecuteKernel(hostInit, results, src, G_SIZE, L_SIZE);

            int count = 0;
            // Stop early enough that a full record (ITEMS ints) can always be read.
            for (int i = 0; i + ITEMS <= capacity; i++) {
                int v = results.get(i);
                if (v == Integer.MIN_VALUE) {
                    continue; // slot never written by the kernel
                }
                count += ITEMS;
                // junk value is printed to check correctness of the record layout
                if (DEBUG) {
                    System.out.printf("gl_id:%8d(max:%8d), work_dim:%3d: lid:%2d(max:%2d): gr_id:%8d(max:%8d):junk:%8d\n",
                            v, results.get(i + 1), results.get(i + 2), results.get(i + 3),
                            results.get(i + 4), results.get(i + 5), results.get(i + 6), results.get(i + 7));
                }
                i += ITEMS - 1; // skip the rest of this record; the loop increment adds the last step
            }
            System.out.printf("#Global:%8d: Local:%3d: computed in :%10d microsec: entries:%10d: ns-per-entry:%10d\n",
                    G_SIZE, L_SIZE, (time / 1000), count / ITEMS, (time / G_SIZE));
        } catch (Exception e) {
            // printStackTrace already prints the exception's toString(), so no separate println.
            e.printStackTrace();
        }
    }

    /**
     * Builds the program, runs the kernel over a 1-D range and reads the device buffer back.
     *
     * @param out      host buffer handed to the OpenCL buffer at creation time
     *                 (NOTE(review): the {@code false} argument is presumably "do not copy",
     *                 i.e. use the host memory directly; confirm against the JavaCL docs)
     * @param readback host buffer that receives the device buffer contents after the kernel ran
     * @param src      OpenCL C source containing a kernel named {@link #PRG_NAME}
     * @param gsize    global work size (dimension 0)
     * @param lsize    local work size (dimension 0)
     * @return elapsed nanoseconds covering buffer creation, kernel execution and read-back;
     *         context creation and program build are not included
     */
    private static long buildAndExecuteKernel(IntBuffer out, IntBuffer readback, String src, int gsize, int lsize)
            throws CLBuildException, IOException {
        CLContext context = createBestContext();
        CLQueue queue = context.createDefaultQueue();
        CLProgram program = context.createProgram(src).build();
        CLKernel kernel = program.createKernel(PRG_NAME);

        long startTime = System.nanoTime();
        CLIntBuffer out1 = context.createIntBuffer(CLMem.Usage.Output, out, false);
        kernel.setArgs(out1);
        CLEvent kernelCompletion = kernel.enqueueNDRange(queue, new int[]{gsize}, new int[]{lsize});
        kernelCompletion.waitFor();
        queue.finish();
        // Copy the OpenCL-hosted array back to RAM (blocking read)
        out1.read(queue, readback, true);
        return System.nanoTime() - startTime;
    }
}
Program output
java -DGLOBAL=64 -DLOCAL=4 com.nativelibs4java.opencl.demos.NDRange1
gl_id = get_global_id(0)
max (after gl_id) = get_global_size(0)
work_dim = get_work_dim()
lid = get_local_id(0)
max (after lid) = get_local_size(0)
gr_id = get_group_id(0)
max (after gr_id) = get_num_groups(0)
gl_id: 0(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 0(max: 16):junk: 9999999
gl_id: 1(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 0(max: 16):junk: 9999999
gl_id: 2(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 0(max: 16):junk: 9999999
gl_id: 3(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 0(max: 16):junk: 9999999
gl_id: 4(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 1(max: 16):junk: 9999999
gl_id: 5(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 1(max: 16):junk: 9999999
gl_id: 6(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 1(max: 16):junk: 9999999
gl_id: 7(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 1(max: 16):junk: 9999999
gl_id: 8(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 2(max: 16):junk: 9999999
gl_id: 9(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 2(max: 16):junk: 9999999
gl_id: 10(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 2(max: 16):junk: 9999999
gl_id: 11(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 2(max: 16):junk: 9999999
gl_id: 12(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 3(max: 16):junk: 9999999
gl_id: 13(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 3(max: 16):junk: 9999999
gl_id: 14(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 3(max: 16):junk: 9999999
gl_id: 15(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 3(max: 16):junk: 9999999
gl_id: 16(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 4(max: 16):junk: 9999999
gl_id: 17(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 4(max: 16):junk: 9999999
gl_id: 18(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 4(max: 16):junk: 9999999
gl_id: 19(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 4(max: 16):junk: 9999999
gl_id: 20(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 5(max: 16):junk: 9999999
gl_id: 21(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 5(max: 16):junk: 9999999
gl_id: 22(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 5(max: 16):junk: 9999999
gl_id: 23(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 5(max: 16):junk: 9999999
gl_id: 24(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 6(max: 16):junk: 9999999
gl_id: 25(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 6(max: 16):junk: 9999999
gl_id: 26(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 6(max: 16):junk: 9999999
gl_id: 27(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 6(max: 16):junk: 9999999
gl_id: 28(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 7(max: 16):junk: 9999999
gl_id: 29(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 7(max: 16):junk: 9999999
gl_id: 30(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 7(max: 16):junk: 9999999
gl_id: 31(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 7(max: 16):junk: 9999999
gl_id: 32(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 8(max: 16):junk: 9999999
gl_id: 33(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 8(max: 16):junk: 9999999
gl_id: 34(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 8(max: 16):junk: 9999999
gl_id: 35(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 8(max: 16):junk: 9999999
gl_id: 36(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 9(max: 16):junk: 9999999
gl_id: 37(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 9(max: 16):junk: 9999999
gl_id: 38(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 9(max: 16):junk: 9999999
gl_id: 39(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 9(max: 16):junk: 9999999
gl_id: 40(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 10(max: 16):junk: 9999999
gl_id: 41(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 10(max: 16):junk: 9999999
gl_id: 42(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 10(max: 16):junk: 9999999
gl_id: 43(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 10(max: 16):junk: 9999999
gl_id: 44(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 11(max: 16):junk: 9999999
gl_id: 45(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 11(max: 16):junk: 9999999
gl_id: 46(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 11(max: 16):junk: 9999999
gl_id: 47(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 11(max: 16):junk: 9999999
gl_id: 48(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 12(max: 16):junk: 9999999
gl_id: 49(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 12(max: 16):junk: 9999999
gl_id: 50(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 12(max: 16):junk: 9999999
gl_id: 51(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 12(max: 16):junk: 9999999
gl_id: 52(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 13(max: 16):junk: 9999999
gl_id: 53(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 13(max: 16):junk: 9999999
gl_id: 54(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 13(max: 16):junk: 9999999
gl_id: 55(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 13(max: 16):junk: 9999999
gl_id: 56(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 14(max: 16):junk: 9999999
gl_id: 57(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 14(max: 16):junk: 9999999
gl_id: 58(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 14(max: 16):junk: 9999999
gl_id: 59(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 14(max: 16):junk: 9999999
gl_id: 60(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 15(max: 16):junk: 9999999
gl_id: 61(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 15(max: 16):junk: 9999999
gl_id: 62(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 15(max: 16):junk: 9999999
gl_id: 63(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 15(max: 16):junk: 9999999
#Global: 64: Local: 4: computed in : 9519 microsec: entries: 64: ns-per-entry: 148744