GSS Mahadevan's Blog: Optimum global/local work size for a given OpenCL kernel

As part of an ongoing study into finding better values for the global/local work sizes of OpenCL kernels, here is a small Java program that uses the JavaCL OpenCL library.

The program checks all combinations of global and local sizes.

Global size's range is GMIN to GMAX

Local size's range is LMIN to LMAX

On every iteration the global size is multiplied by 2; the local size is incremented by 1.

The time in microseconds for each run is reported as "computed in".

The time taken in nanoseconds per entry is reported as "ns-per-entry".

Observations:

Usage: java [-DGMIN=16] [-DGMAX=65536] [-DLMIN=1] [-DLMAX=2] [-DDEBUG=false] com.nativelibs4java.opencl.demos.NDRange2
#Global:      16: Local:  1: computed in :     22477 microsec: entries:        16: ns-per-entry:   1404822
#Global:      16: Local:  2: computed in :      4207 microsec: entries:        16: ns-per-entry:    262987
#Global:      32: Local:  1: computed in :      4172 microsec: entries:        32: ns-per-entry:    130384
#Global:      32: Local:  2: computed in :      4194 microsec: entries:        32: ns-per-entry:    131065
#Global:      64: Local:  1: computed in :      4070 microsec: entries:        64: ns-per-entry:     63603
#Global:      64: Local:  2: computed in :      6431 microsec: entries:        64: ns-per-entry:    100497
#Global:     128: Local:  1: computed in :      4863 microsec: entries:       128: ns-per-entry:     37993
#Global:     128: Local:  2: computed in :      4537 microsec: entries:       128: ns-per-entry:     35446
#Global:     256: Local:  1: computed in :      4079 microsec: entries:       256: ns-per-entry:     15936
#Global:     256: Local:  2: computed in :      7222 microsec: entries:       256: ns-per-entry:     28211
#Global:     512: Local:  1: computed in :      4155 microsec: entries:       512: ns-per-entry:      8116
#Global:     512: Local:  2: computed in :      4095 microsec: entries:       512: ns-per-entry:      7999
#Global:    1024: Local:  1: computed in :      4194 microsec: entries:      1024: ns-per-entry:      4095
#Global:    1024: Local:  2: computed in :      8201 microsec: entries:      1024: ns-per-entry:      8009
#Global:    2048: Local:  1: computed in :      4528 microsec: entries:      2048: ns-per-entry:      2211
#Global:    2048: Local:  2: computed in :      4173 microsec: entries:      2048: ns-per-entry:      2037
#Global:    4096: Local:  1: computed in :      4428 microsec: entries:      4096: ns-per-entry:      1081
#Global:    4096: Local:  2: computed in :      9895 microsec: entries:      4096: ns-per-entry:      2415
#Global:    8192: Local:  1: computed in :      4933 microsec: entries:      8192: ns-per-entry:       602
#Global:    8192: Local:  2: computed in :      5058 microsec: entries:      8192: ns-per-entry:       617
#Global:   16384: Local:  1: computed in :      5595 microsec: entries:     16384: ns-per-entry:       341
#Global:   16384: Local:  2: computed in :     10664 microsec: entries:     16384: ns-per-entry:       650
#Global:   32768: Local:  1: computed in :      7050 microsec: entries:     32768: ns-per-entry:       215
#Global:   32768: Local:  2: computed in :      5615 microsec: entries:     32768: ns-per-entry:       171
#Global:   65536: Local:  1: computed in :     10011 microsec: entries:     65536: ns-per-entry:       152
#Global:   65536: Local:  2: computed in :     13677 microsec: entries:     65536: ns-per-entry:       208

Java Source:

package com.nativelibs4java.opencl.demos;
import static com.nativelibs4java.opencl.JavaCL.createBestContext;
import java.io.*;
import java.nio.*;
import com.nativelibs4java.opencl.*;
import com.nativelibs4java.util.*;
/**
 * Runs one OpenCL kernel repeatedly with various combinations of global and local work sizes.
 * By comparing the timings printed for each combination, one can find good global/local
 * sizes for a given kernel.
 *
 * <p>The sweep is configured through system properties: {@code GMIN}/{@code GMAX} bound the
 * global size (doubled on every step), {@code LMIN}/{@code LMAX} bound the local size
 * (incremented by one on every step), and {@code DEBUG} enables a per-work-item dump.
 *
 * @author GSS Mahadevan
 */
public class NDRange2 {
 private static final String PRG_NAME="ndrange2";
 /** Number of ints each work-item writes; must match the stride and the writes in the kernel source. */
 private static final int ITEMS=8;
 private static final String usage="Usage: java [-DGMIN=16] [-DGMAX=65536] [-DLMIN=1] [-DLMAX=2] " +
   "[-DDEBUG=false] "+NDRange2.class.getName()+"\n";
 
 /** Kernel source: every work-item records its NDRange coordinates in 8 consecutive ints. */
 private static final String src = "__kernel void "+ PRG_NAME
         + "("
   + "   __global int* output                                             \n"
   + "   )                                           \n"
   + "{                                                                      \n"
   + "   int i = get_global_id(0)*8;                               \n"
   + "   output[i] = get_global_id(0);                                \n"
   + "   output[i+1] = get_global_size(0);                                \n"
   + "   output[i+2] = get_work_dim();                                \n"
   + "   output[i+3] = get_local_id(0);                                \n"
   + "   output[i+4] = get_local_size(0);                                \n"
   + "   output[i+5] = get_group_id(0);                                \n"
   + "   output[i+6] = get_num_groups(0);                                \n"
   + "   output[i+7] = 9999999;                                \n"
   + "}                                                                      \n"
   + "\n";
 private static final int GMIN = Integer.getInteger("GMIN", 16);
 private static final int GMAX = Integer.getInteger("GMAX", 65536);
 
 private static final int LMIN = Integer.getInteger("LMIN", 1);
 private static final int LMAX = Integer.getInteger("LMAX", 2);
 
 private static final boolean DEBUG = Boolean.parseBoolean(System.getProperty("DEBUG", "false"));
 
 /** Buffer capacity in ints: the largest global size times the ITEMS slots each work-item writes. */
 private static final int G_SIZE_MAX = GMAX * ITEMS;

 /** Host buffer handed to OpenCL when the device buffer is created. */
 private static IntBuffer output = NIOUtils.directInts(G_SIZE_MAX);
 /** Host buffer the device results are read back into and that main() scans. */
 private static IntBuffer output2 = NIOUtils.directInts(G_SIZE_MAX);
 
 /** Bundles the OpenCL context, queue, built program and kernel used by the benchmark. */
 public static class OCL{
  public final CLProgram program;
  public final CLQueue queue;
  public final CLContext context;
  public final CLKernel kernel;
  /**
   * Creates the context and default queue, builds the program and looks up the kernel.
   *
   * @param src OpenCL C source to build
   * @param kernelName name of the kernel function inside {@code src}
   * @throws CLBuildException if the program fails to build
   */
  public OCL(String src,String kernelName) throws CLBuildException{
   SetupUtils.failWithDownloadProposalsIfOpenCLNotAvailable();
   context = createBestContext();
   queue = context.createDefaultQueue();
   program = context.createProgram(src).build();
   kernel = program.createKernel(kernelName);
  }
 }

 /**
  * Prints the usage line, then runs the kernel for every global/local size combination and
  * prints one timing line per combination.
  *
  * @param args unused; configuration comes from system properties
  */
 public static void main(String[] args) {
  System.out.println(usage);
  try {
   OCL ocl = new OCL(src,PRG_NAME);
   for(int g=GMIN;g <= GMAX; g *= 2){
    for(int l=LMIN;l <= LMAX; l++){
     // OpenCL 1.x rejects a local size that does not evenly divide the global size;
     // skip such combinations instead of letting one failure abort the whole sweep.
     if (l < 1 || g % l != 0) {
      System.out.printf("#Global:%8d: Local:%3d: skipped (local size must be >= 1 and divide the global size)\n",
                      g, l);
      continue;
     }
     // Mark every slot as "not written" so entries touched by the kernel can be counted.
     for (int i = 0; i < G_SIZE_MAX; i++)
      output.put(i, Integer.MIN_VALUE);
     long time = executeKernel(ocl,output,  g, l);
     int count = 0;
     IntBuffer result = output2;
     for (int i = 0; i < G_SIZE_MAX; i++) {
      int v = result.get(i);
      if (v != Integer.MIN_VALUE) {
       count += ITEMS;
       if(DEBUG) System.out.printf("gl_id:%8d(max:%8d), work_dim:%3d: lid:%2d(max:%2d): gr_id:%8d(max:%8d):junk:%8d\n",
                    v,result.get(i+1),result.get(i+2), result.get(i+3), result.get(i+4), result.get(i+5), result.get(i+6),result.get(i+7));
       // Jump over the remaining slots of this work-item's record.
       i += ITEMS - 1;
      } 
     }
     System.out.printf("#Global:%8d: Local:%3d: computed in :%10d microsec: entries:%10d: ns-per-entry:%10d\n",
                     g,l, (time / 1000), count/ITEMS,(time/g));
    }
   }
  } catch (Exception e) {
   // printStackTrace() already starts with e.toString(), so a separate println is redundant.
   e.printStackTrace();
  }
 }

 /**
  * Runs the kernel once with the given work sizes and reads the result into {@code output2}.
  *
  * <p>The measured interval covers device-buffer creation, kernel execution and the
  * read-back, as in the published timings.
  *
  * @param ocl OpenCL objects to run with
  * @param out host buffer used to create the device buffer
  * @param gsize global work size (one dimension)
  * @param lsize local work size (one dimension)
  * @return elapsed wall-clock time in nanoseconds
  * @throws IOException declared for caller compatibility
  */
 private static long executeKernel(OCL ocl, IntBuffer out, int gsize, int lsize)
   throws IOException {
  long startTime = System.nanoTime();
  CLIntBuffer out1 = ocl.context.createIntBuffer(CLMem.Usage.Output, out,false);
  try {
   ocl.kernel.setArgs(out1);

   CLEvent kernelCompletion = ocl.kernel.enqueueNDRange(ocl.queue, new int[]{gsize},new int[]{lsize });
   kernelCompletion.waitFor();
   ocl.queue.finish();
   // Copy the OpenCL-hosted array back to RAM
   out1.read(ocl.queue, output2, true);
   return System.nanoTime() - startTime;
  } finally {
   // A new device buffer is created per call; release it so buffers do not pile up
   // across the sweep while waiting for finalization.
   out1.release();
  }
 }
}

GSS Mahadevan's Blog

Wednesday, April 21, 2010

Optimum global/local work size for a given OpenCL kernel

Observations:

Java Source:

No comments:

Post a Comment

Blog Archive

About Me