As part of an ongoing study to find better values for the global/local work sizes of OpenCL kernels, here is a small Java program that uses the OpenCL library from
JavaCL.
The program checks all combinations of global/local sizes.
Global size's range is GMIN to GMAX
Local size's range is LMIN to LMAX
For every iteration, the global size is multiplied by 2.
The time in microseconds for each loop is reported as "computed in".
The time taken in nanoseconds per entry is reported as "ns-per-entry".
Observations:
Usage: java [-DGMIN=16] [-DGMAX=65536] [-DLMIN=1] [-DLMAX=2] [-DDEBUG=false] com.nativelibs4java.opencl.demos.NDRange2
#Global: 16: Local: 1: computed in : 22477 microsec: entries: 16: ns-per-entry: 1404822
#Global: 16: Local: 2: computed in : 4207 microsec: entries: 16: ns-per-entry: 262987
#Global: 32: Local: 1: computed in : 4172 microsec: entries: 32: ns-per-entry: 130384
#Global: 32: Local: 2: computed in : 4194 microsec: entries: 32: ns-per-entry: 131065
#Global: 64: Local: 1: computed in : 4070 microsec: entries: 64: ns-per-entry: 63603
#Global: 64: Local: 2: computed in : 6431 microsec: entries: 64: ns-per-entry: 100497
#Global: 128: Local: 1: computed in : 4863 microsec: entries: 128: ns-per-entry: 37993
#Global: 128: Local: 2: computed in : 4537 microsec: entries: 128: ns-per-entry: 35446
#Global: 256: Local: 1: computed in : 4079 microsec: entries: 256: ns-per-entry: 15936
#Global: 256: Local: 2: computed in : 7222 microsec: entries: 256: ns-per-entry: 28211
#Global: 512: Local: 1: computed in : 4155 microsec: entries: 512: ns-per-entry: 8116
#Global: 512: Local: 2: computed in : 4095 microsec: entries: 512: ns-per-entry: 7999
#Global: 1024: Local: 1: computed in : 4194 microsec: entries: 1024: ns-per-entry: 4095
#Global: 1024: Local: 2: computed in : 8201 microsec: entries: 1024: ns-per-entry: 8009
#Global: 2048: Local: 1: computed in : 4528 microsec: entries: 2048: ns-per-entry: 2211
#Global: 2048: Local: 2: computed in : 4173 microsec: entries: 2048: ns-per-entry: 2037
#Global: 4096: Local: 1: computed in : 4428 microsec: entries: 4096: ns-per-entry: 1081
#Global: 4096: Local: 2: computed in : 9895 microsec: entries: 4096: ns-per-entry: 2415
#Global: 8192: Local: 1: computed in : 4933 microsec: entries: 8192: ns-per-entry: 602
#Global: 8192: Local: 2: computed in : 5058 microsec: entries: 8192: ns-per-entry: 617
#Global: 16384: Local: 1: computed in : 5595 microsec: entries: 16384: ns-per-entry: 341
#Global: 16384: Local: 2: computed in : 10664 microsec: entries: 16384: ns-per-entry: 650
#Global: 32768: Local: 1: computed in : 7050 microsec: entries: 32768: ns-per-entry: 215
#Global: 32768: Local: 2: computed in : 5615 microsec: entries: 32768: ns-per-entry: 171
#Global: 65536: Local: 1: computed in : 10011 microsec: entries: 65536: ns-per-entry: 152
#Global: 65536: Local: 2: computed in : 13677 microsec: entries: 65536: ns-per-entry: 208
Java Source:
package com.nativelibs4java.opencl.demos;
import static com.nativelibs4java.opencl.JavaCL.createBestContext;
import java.io.*;
import java.nio.*;
import com.nativelibs4java.opencl.*;
import com.nativelibs4java.util.*;
/**
 * Runs an OpenCL kernel in loops with various combinations of global size and local size.
 * By varying the two values and comparing the reported timings, one can find good
 * global/local sizes for a given kernel.
 *
 * <p>The sweep is configured through system properties: {@code GMIN}/{@code GMAX} bound the
 * global size (doubled on every step), {@code LMIN}/{@code LMAX} bound the local size
 * (incremented by one on every step) and {@code DEBUG} enables a dump of every entry the
 * kernel wrote.
 *
 * @author GSS Mahadevan
 */
public class NDRange2 {
    private static final String PRG_NAME = "ndrange2";
    /** Number of ints written by each work item of the kernel. */
    private static final int ITEMS = 8;
    private static final String usage = "Usage: java [-DGMIN=16] [-DGMAX=65536] [-DLMIN=1] [-DLMAX=2] "
            + "[-DDEBUG=false] " + NDRange2.class.getName() + "\n";
    /** Kernel source: every work item records its NDRange coordinates in 8 consecutive ints. */
    private static final String src = "__kernel void " + PRG_NAME
            + "("
            + " __global int* output \n"
            + " ) \n"
            + "{ \n"
            + " int i = get_global_id(0)*8; \n"
            + " output[i] = get_global_id(0); \n"
            + " output[i+1] = get_global_size(0); \n"
            + " output[i+2] = get_work_dim(); \n"
            + " output[i+3] = get_local_id(0); \n"
            + " output[i+4] = get_local_size(0); \n"
            + " output[i+5] = get_group_id(0); \n"
            + " output[i+6] = get_num_groups(0); \n"
            + " output[i+7] = 9999999; \n"
            + "} \n"
            + "\n";
    private static final int GMIN = Integer.getInteger("GMIN", 16);
    private static final int GMAX = Integer.getInteger("GMAX", 65536);
    private static final int LMIN = Integer.getInteger("LMIN", 1);
    private static final int LMAX = Integer.getInteger("LMAX", 2);
    private static final boolean DEBUG = Boolean.parseBoolean(System.getProperty("DEBUG", "false"));
    /** Buffer length in ints: the largest global size times the ints written per work item. */
    private static final int G_SIZE_MAX = GMAX * ITEMS;
    /** Host buffer handed to OpenCL when the device buffer is created. */
    private static IntBuffer output = NIOUtils.directInts(G_SIZE_MAX);
    /** Host buffer that receives the device buffer contents after each kernel run. */
    private static IntBuffer output2 = NIOUtils.directInts(G_SIZE_MAX);

    /** Bundles the OpenCL objects needed to run one kernel. */
    public static class OCL {
        public final CLProgram program;
        public final CLQueue queue;
        public final CLContext context;
        public final CLKernel kernel;

        /**
         * Creates a context and default queue, builds {@code src} and looks up the kernel.
         *
         * @param src        OpenCL source code to build
         * @param kernelName name of the kernel function inside {@code src}
         * @throws CLBuildException if the program fails to build
         */
        public OCL(String src, String kernelName) throws CLBuildException {
            SetupUtils.failWithDownloadProposalsIfOpenCLNotAvailable();
            context = createBestContext();
            queue = context.createDefaultQueue();
            program = context.createProgram(src).build();
            kernel = program.createKernel(kernelName);
        }
    }

    /**
     * Prints the usage line, then times the kernel for every global/local size combination
     * and prints one result line per combination.
     *
     * @param args ignored; configuration comes from system properties
     */
    public static void main(String[] args) {
        System.out.println(usage);
        // Reject settings that would loop forever (g *= 2 never grows from 0 or a negative
        // value), are invalid work-group sizes, or overflow the GMAX * ITEMS buffer length.
        if (GMIN < 1 || LMIN < 1 || GMAX > Integer.MAX_VALUE / ITEMS) {
            System.err.println("Invalid range: GMIN and LMIN must be >= 1 and GMAX must be <= "
                    + (Integer.MAX_VALUE / ITEMS));
            return;
        }
        try {
            OCL ocl = new OCL(src, PRG_NAME);
            for (int g = GMIN; g <= GMAX; g *= 2) {
                for (int l = LMIN; l <= LMAX; l++) {
                    // A local size that does not evenly divide the global size is rejected by
                    // the enqueue; skip it instead of letting one bad pair abort the whole sweep.
                    if (g % l != 0) {
                        System.out.printf("#Global:%8d: Local:%3d: skipped (global size is not a multiple of local size)\n",
                                g, l);
                        continue;
                    }
                    // Mark every slot as "not written" so kernel output can be told apart.
                    for (int i = 0; i < G_SIZE_MAX; i++) {
                        output.put(i, Integer.MIN_VALUE);
                    }
                    long time = executeKernel(ocl, output, g, l);
                    int entries = 0;
                    IntBuffer O = output2;
                    for (int i = 0; i < G_SIZE_MAX; i++) {
                        int v = O.get(i);
                        if (v != Integer.MIN_VALUE) {
                            entries++;
                            if (DEBUG) {
                                System.out.printf("gl_id:%8d(max:%8d), work_dim:%3d: lid:%2d(max:%2d): gr_id:%8d(max:%8d):junk:%8d\n",
                                        v, O.get(i + 1), O.get(i + 2), O.get(i + 3), O.get(i + 4), O.get(i + 5), O.get(i + 6), O.get(i + 7));
                            }
                            // Jump over the remaining ints of this entry.
                            i += ITEMS - 1;
                        }
                    }
                    System.out.printf("#Global:%8d: Local:%3d: computed in :%10d microsec: entries:%10d: ns-per-entry:%10d\n",
                            g, l, (time / 1000), entries, (time / g));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs the kernel once with the given sizes and copies the result into {@code output2}.
     *
     * <p>The measured interval covers device buffer creation, argument setup, the kernel run
     * and the read-back. The device buffer is released before returning so that repeated
     * calls do not accumulate buffers.
     *
     * @param ocl   OpenCL objects to run with
     * @param out   host buffer used to create the device buffer
     * @param gsize global work size
     * @param lsize local work size
     * @return elapsed wall-clock time in nanoseconds
     * @throws IOException declared for callers; not thrown by the visible code
     */
    private static long executeKernel(OCL ocl, IntBuffer out, int gsize, int lsize)
            throws IOException {
        long startTime = System.nanoTime();
        CLIntBuffer out1 = ocl.context.createIntBuffer(CLMem.Usage.Output, out, false);
        try {
            ocl.kernel.setArgs(out1);
            CLEvent kernelCompletion = ocl.kernel.enqueueNDRange(ocl.queue, new int[]{gsize}, new int[]{lsize});
            kernelCompletion.waitFor();
            ocl.queue.finish();
            // Copy the OpenCL-hosted array back to RAM
            out1.read(ocl.queue, output2, true);
            return System.nanoTime() - startTime;
        } finally {
            // Release the per-call device buffer; the timing above is taken before this.
            out1.release();
        }
    }
}
No comments:
Post a Comment