Understanding NDRange
Java program
package com.nativelibs4java.opencl.demos; import static com.nativelibs4java.opencl.JavaCL.createBestContext; import java.io.*; import java.nio.*; import com.nativelibs4java.opencl.*; import com.nativelibs4java.util.*; /* Usage: java [-DGLOBAL=256] [-DLOCAL=1] com.nativelibs4java.opencl.demos.NDRange1 */ public class NDRange1 { private static final String PRG_NAME="ndrange1"; private static final int ITEMS=8;// number of ints updated in kernel private static final String src = "__kernel void "+ PRG_NAME + "(" + " __global int* output \n" + " ) \n" + "{ \n" + " int i = get_global_id(0)*8; \n" + " output[i] = get_global_id(0); \n" + " output[i+1] = get_global_size(0); \n" + " output[i+2] = get_work_dim(); \n" + " output[i+3] = get_local_id(0); \n" + " output[i+4] = get_local_size(0); \n" + " output[i+5] = get_group_id(0); \n" + " output[i+6] = get_num_groups(0); \n" + " output[i+7] = 9999999; \n" + "} \n" + "\n"; private static final int G_SIZE = Integer.getInteger("GLOBAL", 256); private static final int L_SIZE = Integer.getInteger("LOCAL", 4); private static final boolean DEBUG = Boolean.parseBoolean(System.getProperty("DEBUG", "true")); private static final int G_SIZE_MAX = G_SIZE * 128; // multiplied by just for safety private static IntBuffer output = NIOUtils.directInts(G_SIZE_MAX); private static IntBuffer output2 = NIOUtils.directInts(G_SIZE_MAX); public static void main(String[] args) { try { SetupUtils.failWithDownloadProposalsIfOpenCLNotAvailable(); for (int i = 0; i < G_SIZE_MAX; i++) output.put(i, Integer.MIN_VALUE); long time = buildAndExecuteKernel(output, src, G_SIZE, L_SIZE); int count = 0; IntBuffer O = output2; for (int i = 0; i < G_SIZE_MAX; i++) { int v = O.get(i); if (v != Integer.MIN_VALUE) { count += 8; // junk value is printed to check correct ness if(DEBUG) System.out.printf("gl_id:%8d(max:%8d), work_dim:%3d: lid:%2d(max:%2d): gr_id:%8d(max:%8d):junk:%8d\n", v,O.get(i+1),O.get(i+2), O.get(i+3), O.get(i+4), O.get(i+5), O.get(i+6),O.get(i+7)); i += 7; } } System.out.printf("#Global:%8d: Local:%3d: computed in :%10d microsec: entries:%10d: ns-per-entry:%10d\n", G_SIZE,L_SIZE, (time / 1000), count/ITEMS,(time/G_SIZE)); } catch (Exception e) { System.err.println(e); e.printStackTrace(); } } private static long buildAndExecuteKernel(IntBuffer out, String src, int gsize, int lsize) throws CLBuildException, IOException { CLContext context = createBestContext(); CLQueue queue = context.createDefaultQueue(); CLProgram program = context.createProgram(src).build(); CLKernel kernel = program.createKernel(PRG_NAME); long startTime = System.nanoTime(); CLIntBuffer out1 = context.createIntBuffer(CLMem.Usage.Output, out,false); kernel.setArgs(out1); CLEvent kernelCompletion = kernel.enqueueNDRange(queue, new int[]{gsize},new int[]{lsize }); kernelCompletion.waitFor(); queue.finish(); // Copy the OpenCL-hosted array back to RAM out1.read(queue, output2, true); long time = System.nanoTime() - startTime; return time; } }
Program output
java -DGLOBAL=64 -DLOCAL=4 com.nativelibs4java.opencl.demos.NDRange1 gl_id = get_global_id(0) max = get_global_size(0) work_dim = get_work_dim() lid = get_local_id(0) max = get_local_size(0) gr_id = get_group_id(0) max = get_num_groups(0) gl_id: 0(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 0(max: 16):junk: 9999999 gl_id: 1(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 0(max: 16):junk: 9999999 gl_id: 2(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 0(max: 16):junk: 9999999 gl_id: 3(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 0(max: 16):junk: 9999999 gl_id: 4(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 1(max: 16):junk: 9999999 gl_id: 5(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 1(max: 16):junk: 9999999 gl_id: 6(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 1(max: 16):junk: 9999999 gl_id: 7(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 1(max: 16):junk: 9999999 gl_id: 8(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 2(max: 16):junk: 9999999 gl_id: 9(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 2(max: 16):junk: 9999999 gl_id: 10(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 2(max: 16):junk: 9999999 gl_id: 11(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 2(max: 16):junk: 9999999 gl_id: 12(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 3(max: 16):junk: 9999999 gl_id: 13(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 3(max: 16):junk: 9999999 gl_id: 14(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 3(max: 16):junk: 9999999 gl_id: 15(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 3(max: 16):junk: 9999999 gl_id: 16(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 4(max: 16):junk: 9999999 gl_id: 17(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 4(max: 16):junk: 9999999 gl_id: 18(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 4(max: 16):junk: 9999999 gl_id: 19(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 4(max: 16):junk: 9999999 gl_id: 20(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 5(max: 16):junk: 9999999 gl_id: 21(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 5(max: 16):junk: 9999999 gl_id: 22(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 5(max: 16):junk: 9999999 gl_id: 23(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 5(max: 16):junk: 9999999 gl_id: 24(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 6(max: 16):junk: 9999999 gl_id: 25(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 6(max: 16):junk: 9999999 gl_id: 26(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 6(max: 16):junk: 9999999 gl_id: 27(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 6(max: 16):junk: 9999999 gl_id: 28(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 7(max: 16):junk: 9999999 gl_id: 29(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 7(max: 16):junk: 9999999 gl_id: 30(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 7(max: 16):junk: 9999999 gl_id: 31(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 7(max: 16):junk: 9999999 gl_id: 32(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 8(max: 16):junk: 9999999 gl_id: 33(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 8(max: 16):junk: 9999999 gl_id: 34(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 8(max: 16):junk: 9999999 gl_id: 35(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 8(max: 16):junk: 9999999 gl_id: 36(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 9(max: 16):junk: 9999999 gl_id: 37(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 9(max: 16):junk: 9999999 gl_id: 38(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 9(max: 16):junk: 9999999 gl_id: 39(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 9(max: 16):junk: 9999999 gl_id: 40(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 10(max: 16):junk: 9999999 gl_id: 41(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 10(max: 16):junk: 9999999 gl_id: 42(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 10(max: 16):junk: 9999999 gl_id: 43(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 10(max: 16):junk: 9999999 gl_id: 44(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 11(max: 16):junk: 9999999 gl_id: 45(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 11(max: 16):junk: 9999999 gl_id: 46(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 11(max: 16):junk: 9999999 gl_id: 47(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 11(max: 16):junk: 9999999 gl_id: 48(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 12(max: 16):junk: 9999999 gl_id: 49(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 12(max: 16):junk: 9999999 gl_id: 50(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 12(max: 16):junk: 9999999 gl_id: 51(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 12(max: 16):junk: 9999999 gl_id: 52(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 13(max: 16):junk: 9999999 gl_id: 53(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 13(max: 16):junk: 9999999 gl_id: 54(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 13(max: 16):junk: 9999999 gl_id: 55(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 13(max: 16):junk: 9999999 gl_id: 56(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 14(max: 16):junk: 9999999 gl_id: 57(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 14(max: 16):junk: 9999999 gl_id: 58(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 14(max: 16):junk: 9999999 gl_id: 59(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 14(max: 16):junk: 9999999 gl_id: 60(max: 64), work_dim: 1: lid: 0(max: 4): gr_id: 15(max: 16):junk: 9999999 gl_id: 61(max: 64), work_dim: 1: lid: 1(max: 4): gr_id: 15(max: 16):junk: 9999999 gl_id: 62(max: 64), work_dim: 1: lid: 2(max: 4): gr_id: 15(max: 16):junk: 9999999 gl_id: 63(max: 64), work_dim: 1: lid: 3(max: 4): gr_id: 15(max: 16):junk: 9999999 #Global: 64: Local: 4: computed in : 9519 microsec: entries: 64: ns-per-entry: 148744
No comments:
Post a Comment