Observations:
Usage: java [-DGMIN=16] [-DGMAX=65536] [-DLMIN=1] [-DLMAX=2] [-DDEBUG=false] com.nativelibs4java.opencl.demos.NDRange2
#Global: 16: Local: 1: computed in : 22477 microsec: entries: 16: ns-per-entry: 1404822
#Global: 16: Local: 2: computed in : 4207 microsec: entries: 16: ns-per-entry: 262987
#Global: 32: Local: 1: computed in : 4172 microsec: entries: 32: ns-per-entry: 130384
#Global: 32: Local: 2: computed in : 4194 microsec: entries: 32: ns-per-entry: 131065
#Global: 64: Local: 1: computed in : 4070 microsec: entries: 64: ns-per-entry: 63603
#Global: 64: Local: 2: computed in : 6431 microsec: entries: 64: ns-per-entry: 100497
#Global: 128: Local: 1: computed in : 4863 microsec: entries: 128: ns-per-entry: 37993
#Global: 128: Local: 2: computed in : 4537 microsec: entries: 128: ns-per-entry: 35446
#Global: 256: Local: 1: computed in : 4079 microsec: entries: 256: ns-per-entry: 15936
#Global: 256: Local: 2: computed in : 7222 microsec: entries: 256: ns-per-entry: 28211
#Global: 512: Local: 1: computed in : 4155 microsec: entries: 512: ns-per-entry: 8116
#Global: 512: Local: 2: computed in : 4095 microsec: entries: 512: ns-per-entry: 7999
#Global: 1024: Local: 1: computed in : 4194 microsec: entries: 1024: ns-per-entry: 4095
#Global: 1024: Local: 2: computed in : 8201 microsec: entries: 1024: ns-per-entry: 8009
#Global: 2048: Local: 1: computed in : 4528 microsec: entries: 2048: ns-per-entry: 2211
#Global: 2048: Local: 2: computed in : 4173 microsec: entries: 2048: ns-per-entry: 2037
#Global: 4096: Local: 1: computed in : 4428 microsec: entries: 4096: ns-per-entry: 1081
#Global: 4096: Local: 2: computed in : 9895 microsec: entries: 4096: ns-per-entry: 2415
#Global: 8192: Local: 1: computed in : 4933 microsec: entries: 8192: ns-per-entry: 602
#Global: 8192: Local: 2: computed in : 5058 microsec: entries: 8192: ns-per-entry: 617
#Global: 16384: Local: 1: computed in : 5595 microsec: entries: 16384: ns-per-entry: 341
#Global: 16384: Local: 2: computed in : 10664 microsec: entries: 16384: ns-per-entry: 650
#Global: 32768: Local: 1: computed in : 7050 microsec: entries: 32768: ns-per-entry: 215
#Global: 32768: Local: 2: computed in : 5615 microsec: entries: 32768: ns-per-entry: 171
#Global: 65536: Local: 1: computed in : 10011 microsec: entries: 65536: ns-per-entry: 152
#Global: 65536: Local: 2: computed in : 13677 microsec: entries: 65536: ns-per-entry: 208
Java Source:
package com.nativelibs4java.opencl.demos;

import static com.nativelibs4java.opencl.JavaCL.createBestContext;

import java.io.*;
import java.nio.*;

import com.nativelibs4java.opencl.*;
import com.nativelibs4java.util.*;

/**
 * Runs an OpenCL kernel in loops with various combinations of global size and local size.
 * By varying the global-size and local-size values, one can find out optimum values for
 * global/local sizes for a given kernel.
 *
 * <p>The sweep is configured through system properties: {@code GMIN}/{@code GMAX} bound the
 * global size (doubled on every step), {@code LMIN}/{@code LMAX} bound the local size
 * (incremented by one on every step) and {@code DEBUG} enables a dump of every record the
 * kernel wrote.
 *
 * @author GSS Mahadevan
 */
public class NDRange2 {
    private static final String PRG_NAME = "ndrange2";

    /** Number of ints written by each work-item of the kernel. */
    private static final int ITEMS = 8;

    private static final String usage =
            "Usage: java [-DGMIN=16] [-DGMAX=65536] [-DLMIN=1] [-DLMAX=2] "
            + "[-DDEBUG=false] " + NDRange2.class.getName() + "\n";

    /**
     * Kernel source. Each work-item fills one record of {@link #ITEMS} ints with its
     * NDRange coordinates; the last slot is a fixed marker value.
     */
    private static final String src =
            "__kernel void " + PRG_NAME + "(                      \n"
            + "   __global int* output                             \n"
            + "   )                                                \n"
            + "{                                                   \n"
            + "   int i = get_global_id(0)*" + ITEMS + ";          \n"
            + "   output[i]   = get_global_id(0);                  \n"
            + "   output[i+1] = get_global_size(0);                \n"
            + "   output[i+2] = get_work_dim();                    \n"
            + "   output[i+3] = get_local_id(0);                   \n"
            + "   output[i+4] = get_local_size(0);                 \n"
            + "   output[i+5] = get_group_id(0);                   \n"
            + "   output[i+6] = get_num_groups(0);                 \n"
            + "   output[i+7] = 9999999;                           \n"
            + "}                                                   \n"
            + "\n";

    private static final int GMIN = Integer.getInteger("GMIN", 16);
    private static final int GMAX = Integer.getInteger("GMAX", 65536);
    private static final int LMIN = Integer.getInteger("LMIN", 1);
    private static final int LMAX = Integer.getInteger("LMAX", 2);
    private static final boolean DEBUG =
            Boolean.parseBoolean(System.getProperty("DEBUG", "false"));

    /**
     * Capacity (in ints) of the host buffers. The largest run has GMAX work-items and each
     * writes ITEMS ints, so this is exactly the space required, not a safety margin.
     */
    private static final int G_SIZE_MAX = GMAX * ITEMS;

    /** Host memory backing the OpenCL buffer; reset to a sentinel before every run. */
    private static final IntBuffer output = NIOUtils.directInts(G_SIZE_MAX);

    /** Host buffer the device results are read back into after every run. */
    private static final IntBuffer output2 = NIOUtils.directInts(G_SIZE_MAX);

    /** Bundles the OpenCL objects needed to run a single kernel. */
    public static class OCL {
        public final CLProgram program;
        public final CLQueue queue;
        public final CLContext context;
        public final CLKernel kernel;

        /**
         * Creates a context and a default queue, builds {@code src} and looks up the kernel.
         *
         * @param src        OpenCL C source of the program
         * @param kernelName name of the kernel function inside {@code src}
         * @throws CLBuildException if the program fails to build
         */
        public OCL(String src, String kernelName) throws CLBuildException {
            SetupUtils.failWithDownloadProposalsIfOpenCLNotAvailable();
            context = createBestContext();
            queue = context.createDefaultQueue();
            program = context.createProgram(src).build();
            kernel = program.createKernel(kernelName);
        }
    }

    /**
     * Prints the usage line, then runs the kernel once for every (global, local) size
     * combination in the configured ranges and prints one timing line per run.
     *
     * @param args ignored; configuration comes from system properties
     */
    public static void main(String[] args) {
        System.out.println(usage);

        // A zero or negative GMIN would never grow past GMAX by doubling (endless loop),
        // and a non-positive local size is not a valid work-group size.
        if (GMIN <= 0 || LMIN <= 0) {
            System.err.println("GMIN and LMIN must be positive: GMIN=" + GMIN + ", LMIN=" + LMIN);
            return;
        }

        try {
            OCL ocl = new OCL(src, PRG_NAME);
            // The doubling counter is a long so that it cannot wrap around to a negative
            // int (and keep looping) when GMAX is close to Integer.MAX_VALUE.
            for (long globalSize = GMIN; globalSize <= GMAX; globalSize *= 2) {
                int g = (int) globalSize;
                for (int l = LMIN; l <= LMAX; l++) {
                    // The global size is expected to be a multiple of the local size;
                    // otherwise the enqueue fails and would abort the entire sweep.
                    if (g % l != 0) {
                        System.out.printf(
                                "#Global:%8d: Local:%3d: skipped, global size is not a multiple of local size\n",
                                g, l);
                        continue;
                    }

                    // Sentinel fill: any slot still holding MIN_VALUE after the run was
                    // not written by the kernel.
                    for (int i = 0; i < G_SIZE_MAX; i++) {
                        output.put(i, Integer.MIN_VALUE);
                    }

                    long time = executeKernel(ocl, output, g, l);
                    int count = countWrittenInts(output2);

                    System.out.printf(
                            "#Global:%8d: Local:%3d: computed in :%10d microsec: entries:%10d: ns-per-entry:%10d\n",
                            g, l, (time / 1000), count / ITEMS, (time / g));
                }
            }
        } catch (Exception e) {
            // printStackTrace already starts with e.toString(); no separate println needed.
            e.printStackTrace();
        }
    }

    /**
     * Scans {@code results} for records written by the kernel and, when DEBUG is set,
     * prints each record.
     *
     * @param results buffer holding the values read back from the device
     * @return number of ints belonging to written records (a multiple of {@link #ITEMS})
     */
    private static int countWrittenInts(IntBuffer results) {
        int count = 0;
        for (int i = 0; i < G_SIZE_MAX; i++) {
            int v = results.get(i);
            if (v != Integer.MIN_VALUE) {
                count += ITEMS;
                if (DEBUG) {
                    System.out.printf(
                            "gl_id:%8d(max:%8d), work_dim:%3d: lid:%2d(max:%2d): gr_id:%8d(max:%8d):junk:%8d\n",
                            v, results.get(i + 1), results.get(i + 2), results.get(i + 3),
                            results.get(i + 4), results.get(i + 5), results.get(i + 6),
                            results.get(i + 7));
                }
                // Jump over the remaining slots of this record; the loop's own i++
                // then lands on the first slot of the next record.
                i += ITEMS - 1;
            }
        }
        return count;
    }

    /**
     * Runs the kernel once with the given sizes and copies the device buffer into
     * {@link #output2}.
     *
     * <p>The returned duration covers buffer creation, kernel execution and the blocking
     * read-back; releasing the device buffer is not included.
     *
     * @param ocl   OpenCL objects to run with
     * @param out   host buffer handed to the OpenCL buffer at creation time
     * @param gsize global work size (one dimension)
     * @param lsize local work size (one dimension)
     * @return elapsed wall-clock time in nanoseconds
     * @throws IOException declared for compatibility with the original signature
     */
    private static long executeKernel(OCL ocl, IntBuffer out, int gsize, int lsize) throws IOException {
        long startTime = System.nanoTime();
        CLIntBuffer out1 = ocl.context.createIntBuffer(CLMem.Usage.Output, out, false);
        try {
            ocl.kernel.setArgs(out1);
            CLEvent kernelCompletion =
                    ocl.kernel.enqueueNDRange(ocl.queue, new int[]{gsize}, new int[]{lsize});
            kernelCompletion.waitFor();
            ocl.queue.finish();
            // Copy the OpenCL-hosted array back to RAM
            out1.read(ocl.queue, output2, true);
            return System.nanoTime() - startTime;
        } finally {
            // A new device buffer is created on every call; release it explicitly so the
            // sweep does not pile up OpenCL memory objects until the garbage collector runs.
            out1.release();
        }
    }
}
No comments:
Post a Comment