Showing posts with label Local ID. Show all posts
Showing posts with label Local ID. Show all posts

Wednesday, April 21, 2010

Optimum global/local work size for a given OpenCL kernel

As part of an ongoing study to find better values for the global/local work sizes of OpenCL kernels, here is a small Java program that uses the OpenCL bindings from the JavaCL library.

  • Program does check all combinations of Global/Local sizes
  • Global size's range is GMIN to GMAX
  • Local size's range is LMIN to LMAX
  • For every iteration, the global size is multiplied by 2

  • The time in microseconds for each loop is reported as "computed in"
  • The time taken in nanoseconds for each entry is reported as "ns-per-entry"
  • Observations:

    Usage: java [-DGMIN=16] [-DGMAX=65536] [-DLMIN=1] [-DLMAX=2] [-DDEBUG=false] com.nativelibs4java.opencl.demos.NDRange2
    #Global:      16: Local:  1: computed in :     22477 microsec: entries:        16: ns-per-entry:   1404822
    #Global:      16: Local:  2: computed in :      4207 microsec: entries:        16: ns-per-entry:    262987
    #Global:      32: Local:  1: computed in :      4172 microsec: entries:        32: ns-per-entry:    130384
    #Global:      32: Local:  2: computed in :      4194 microsec: entries:        32: ns-per-entry:    131065
    #Global:      64: Local:  1: computed in :      4070 microsec: entries:        64: ns-per-entry:     63603
    #Global:      64: Local:  2: computed in :      6431 microsec: entries:        64: ns-per-entry:    100497
    #Global:     128: Local:  1: computed in :      4863 microsec: entries:       128: ns-per-entry:     37993
    #Global:     128: Local:  2: computed in :      4537 microsec: entries:       128: ns-per-entry:     35446
    #Global:     256: Local:  1: computed in :      4079 microsec: entries:       256: ns-per-entry:     15936
    #Global:     256: Local:  2: computed in :      7222 microsec: entries:       256: ns-per-entry:     28211
    #Global:     512: Local:  1: computed in :      4155 microsec: entries:       512: ns-per-entry:      8116
    #Global:     512: Local:  2: computed in :      4095 microsec: entries:       512: ns-per-entry:      7999
    #Global:    1024: Local:  1: computed in :      4194 microsec: entries:      1024: ns-per-entry:      4095
    #Global:    1024: Local:  2: computed in :      8201 microsec: entries:      1024: ns-per-entry:      8009
    #Global:    2048: Local:  1: computed in :      4528 microsec: entries:      2048: ns-per-entry:      2211
    #Global:    2048: Local:  2: computed in :      4173 microsec: entries:      2048: ns-per-entry:      2037
    #Global:    4096: Local:  1: computed in :      4428 microsec: entries:      4096: ns-per-entry:      1081
    #Global:    4096: Local:  2: computed in :      9895 microsec: entries:      4096: ns-per-entry:      2415
    #Global:    8192: Local:  1: computed in :      4933 microsec: entries:      8192: ns-per-entry:       602
    #Global:    8192: Local:  2: computed in :      5058 microsec: entries:      8192: ns-per-entry:       617
    #Global:   16384: Local:  1: computed in :      5595 microsec: entries:     16384: ns-per-entry:       341
    #Global:   16384: Local:  2: computed in :     10664 microsec: entries:     16384: ns-per-entry:       650
    #Global:   32768: Local:  1: computed in :      7050 microsec: entries:     32768: ns-per-entry:       215
    #Global:   32768: Local:  2: computed in :      5615 microsec: entries:     32768: ns-per-entry:       171
    #Global:   65536: Local:  1: computed in :     10011 microsec: entries:     65536: ns-per-entry:       152
    #Global:   65536: Local:  2: computed in :     13677 microsec: entries:     65536: ns-per-entry:       208
    
    

    Java Source:

    package com.nativelibs4java.opencl.demos;
    import static com.nativelibs4java.opencl.JavaCL.createBestContext;
    import java.io.*;
    import java.nio.*;
    import com.nativelibs4java.opencl.*;
    import com.nativelibs4java.util.*;
    /**
     * Runs one OpenCL kernel in a loop over combinations of global and local work sizes and
     * prints the wall-clock time of each run, so that good global/local sizes for a given
     * kernel can be picked from the output.
     *
     * <p>The sweep is controlled by system properties: the global size starts at {@code GMIN}
     * and doubles until it exceeds {@code GMAX}; for each global size every local size from
     * {@code LMIN} to {@code LMAX} is tried. Combinations in which the local size does not
     * evenly divide the global size are skipped (with a note on stderr) instead of aborting
     * the whole sweep with an OpenCL error.
     *
     * @author GSS Mahadevan
     */
    public class NDRange2 {
      private static final String PRG_NAME = "ndrange2";
      /** Number of ints each work-item writes; must match the stride of 8 used in the kernel source. */
      private static final int ITEMS = 8;
      private static final String usage = "Usage: java [-DGMIN=16] [-DGMAX=65536] [-DLMIN=1] [-DLMAX=2] "
          + "[-DDEBUG=false] " + NDRange2.class.getName() + "\n";

      /** Kernel source: every work-item records its NDRange coordinates in its own 8-int record. */
      private static final String src = "__kernel void " + PRG_NAME
          + "("
          + "   __global int* output                                             \n"
          + "   )                                           \n"
          + "{                                                                      \n"
          + "   int i = get_global_id(0)*8;                               \n"
          + "   output[i] = get_global_id(0);                                \n"
          + "   output[i+1] = get_global_size(0);                                \n"
          + "   output[i+2] = get_work_dim();                                \n"
          + "   output[i+3] = get_local_id(0);                                \n"
          + "   output[i+4] = get_local_size(0);                                \n"
          + "   output[i+5] = get_group_id(0);                                \n"
          + "   output[i+6] = get_num_groups(0);                                \n"
          + "   output[i+7] = 9999999;                                \n"
          + "}                                                                      \n"
          + "\n";

      private static final int GMIN = Integer.getInteger("GMIN", 16);
      private static final int GMAX = Integer.getInteger("GMAX", 65536);

      private static final int LMIN = Integer.getInteger("LMIN", 1);
      private static final int LMAX = Integer.getInteger("LMAX", 2);

      private static final boolean DEBUG = Boolean.parseBoolean(System.getProperty("DEBUG", "false"));

      /** Buffer length in ints: one ITEMS-sized record per work-item of the largest run. */
      private static final int G_SIZE_MAX = GMAX * ITEMS;

      // Allocated in main() after the ranges are validated, so that a bad -D value produces
      // a readable message instead of a failure inside the static initializer.
      private static IntBuffer output;
      private static IntBuffer output2;

      /** Bundles the OpenCL objects that are created once and reused for every run. */
      public static class OCL {
        public final CLProgram program;
        public final CLQueue queue;
        public final CLContext context;
        public final CLKernel kernel;

        /**
         * Creates a context and queue, builds {@code src} and looks up the kernel.
         *
         * @param src        OpenCL C source to build
         * @param kernelName name of the kernel function inside {@code src}
         * @throws CLBuildException if the program does not build
         */
        public OCL(String src, String kernelName) throws CLBuildException {
          SetupUtils.failWithDownloadProposalsIfOpenCLNotAvailable();
          context = createBestContext();
          queue = context.createDefaultQueue();
          program = context.createProgram(src).build();
          kernel = program.createKernel(kernelName);
        }
      }

      /**
       * Prints the usage line, then one result line per global/local combination.
       *
       * @param args unused; configuration comes from system properties
       */
      public static void main(String[] args) {
        System.out.println(usage);
        try {
          checkRanges();
          output = NIOUtils.directInts(G_SIZE_MAX);
          output2 = NIOUtils.directInts(G_SIZE_MAX);

          OCL ocl = new OCL(src, PRG_NAME);
          // The loop variable is a long: doubling an int near Integer.MAX_VALUE would wrap
          // around to a negative value and the loop condition would never become false.
          for (long global = GMIN; global <= GMAX; global *= 2) {
            int g = (int) global;
            for (int l = LMIN; l <= LMAX; l++) {
              if (g % l != 0) {
                // The local size has to divide the global size evenly.
                System.err.printf("#Global:%8d: Local:%3d: skipped, local size does not divide global size\n", g, l);
                continue;
              }
              // Mark every slot as "not written" so records produced by this run can be counted.
              for (int i = 0; i < G_SIZE_MAX; i++) {
                output.put(i, Integer.MIN_VALUE);
              }
              long time = executeKernel(ocl, output, g, l);
              int entries = countEntries(output2);
              System.out.printf("#Global:%8d: Local:%3d: computed in :%10d microsec: entries:%10d: ns-per-entry:%10d\n",
                  g, l, (time / 1000), entries, (time / g));
            }
          }
        } catch (Exception e) {
          // printStackTrace() already prints the exception's toString() as its first line.
          e.printStackTrace();
        }
      }

      /**
       * Rejects property values the sweep cannot handle.
       *
       * @throws IllegalArgumentException if a range is empty, non-positive, or too large for the buffers
       */
      private static void checkRanges() {
        if (GMIN < 1 || GMAX < GMIN) {
          throw new IllegalArgumentException("Need 1 <= GMIN <= GMAX, got GMIN=" + GMIN + ", GMAX=" + GMAX);
        }
        if (LMIN < 1 || LMAX < LMIN) {
          throw new IllegalArgumentException("Need 1 <= LMIN <= LMAX, got LMIN=" + LMIN + ", LMAX=" + LMAX);
        }
        if ((long) GMAX * ITEMS > Integer.MAX_VALUE) {
          throw new IllegalArgumentException("GMAX=" + GMAX + " is too large: GMAX*" + ITEMS + " must fit in an int");
        }
      }

      /**
       * Counts the records a kernel run wrote into {@code results} and, when DEBUG is set, prints each one.
       * A record counts as written when its first int is no longer {@code Integer.MIN_VALUE}.
       *
       * @param results buffer read back from the device, G_SIZE_MAX ints long
       * @return number of written records
       */
      private static int countEntries(IntBuffer results) {
        int entries = 0;
        for (int base = 0; base + ITEMS <= G_SIZE_MAX; base += ITEMS) {
          int globalId = results.get(base);
          if (globalId == Integer.MIN_VALUE) {
            continue;
          }
          entries++;
          if (DEBUG) {
            System.out.printf("gl_id:%8d(max:%8d), work_dim:%3d: lid:%2d(max:%2d): gr_id:%8d(max:%8d):junk:%8d\n",
                globalId, results.get(base + 1), results.get(base + 2), results.get(base + 3),
                results.get(base + 4), results.get(base + 5), results.get(base + 6), results.get(base + 7));
          }
        }
        return entries;
      }

      /**
       * Runs the kernel once and reads the result back into {@code output2}.
       *
       * <p>The measured interval covers device-buffer creation, argument binding, kernel
       * execution and the blocking read-back. The device buffer is released afterwards so
       * that a long sweep does not accumulate one buffer per combination.
       *
       * @param ocl   prepared OpenCL objects
       * @param out   host buffer backing the device buffer
       * @param gsize global work size
       * @param lsize local work size
       * @return elapsed wall-clock time in nanoseconds
       */
      private static long executeKernel(OCL ocl, IntBuffer out, int gsize, int lsize)
          throws IOException {
        long startTime = System.nanoTime();
        CLIntBuffer out1 = ocl.context.createIntBuffer(CLMem.Usage.Output, out, false);
        try {
          ocl.kernel.setArgs(out1);

          CLEvent kernelCompletion = ocl.kernel.enqueueNDRange(ocl.queue, new int[]{gsize}, new int[]{lsize});
          kernelCompletion.waitFor();
          ocl.queue.finish();
          // Copy the OpenCL-hosted array back to RAM
          out1.read(ocl.queue, output2, true);
          return System.nanoTime() - startTime;
        } finally {
          // Runs after the elapsed time has been computed, so it is not part of the measurement.
          out1.release();
        }
      }
    }
    

    Information about OpenCL Global size and Local size dimensions

    To understand more about the global/local work sizes in the OpenCL API clEnqueueNDRangeKernel, I wrote a small Java program using JavaCL, the nice OpenCL binding from Olivier Chafik's nativelibs4java project. Some more links on NDRange are:
    Understanding NDRange

    Java program

    package com.nativelibs4java.opencl.demos;
    
    import static com.nativelibs4java.opencl.JavaCL.createBestContext;
    import java.io.*;
    import java.nio.*;
    import com.nativelibs4java.opencl.*;
    import com.nativelibs4java.util.*;
    /**
     * Runs one OpenCL kernel once and prints the NDRange coordinates every work-item observed.
     *
     * <p>Usage: java [-DGLOBAL=256] [-DLOCAL=4] [-DDEBUG=true] com.nativelibs4java.opencl.demos.NDRange1
     *
     * <p>{@code LOCAL} must evenly divide {@code GLOBAL}; other values are rejected before
     * the kernel is submitted.
     */
    public class NDRange1 {
      private static final String PRG_NAME = "ndrange1";
      /** Number of ints each work-item writes; must match the stride of 8 used in the kernel source. */
      private static final int ITEMS = 8;
      /** The original over-allocates the buffers by this factor relative to GLOBAL "just for safety". */
      private static final int SAFETY_FACTOR = 128;

      /** Kernel source: every work-item records its NDRange coordinates in its own 8-int record. */
      private static final String src = "__kernel void " + PRG_NAME
          + "("
          + "   __global int* output                                             \n"
          + "   )                                           \n"
          + "{                                                                      \n"
          + "   int i = get_global_id(0)*8;                               \n"
          + "   output[i] = get_global_id(0);                                \n"
          + "   output[i+1] = get_global_size(0);                                \n"
          + "   output[i+2] = get_work_dim();                                \n"
          + "   output[i+3] = get_local_id(0);                                \n"
          + "   output[i+4] = get_local_size(0);                                \n"
          + "   output[i+5] = get_group_id(0);                                \n"
          + "   output[i+6] = get_num_groups(0);                                \n"
          + "   output[i+7] = 9999999;                                \n"
          + "}                                                                      \n"
          + "\n";
      private static final int G_SIZE = Integer.getInteger("GLOBAL", 256);
      private static final int L_SIZE = Integer.getInteger("LOCAL", 4);
      private static final boolean DEBUG = Boolean.parseBoolean(System.getProperty("DEBUG", "true"));

      /** Buffer length in ints. */
      private static final int G_SIZE_MAX = G_SIZE * SAFETY_FACTOR;

      // Allocated in main() after the sizes are validated, so that a bad -D value produces
      // a readable message instead of a failure inside the static initializer.
      private static IntBuffer output;
      private static IntBuffer output2;

      /**
       * Validates the sizes, runs the kernel once and prints the per-work-item records and a summary line.
       *
       * @param args unused; configuration comes from system properties
       */
      public static void main(String[] args) {
        try {
          checkSizes();
          SetupUtils.failWithDownloadProposalsIfOpenCLNotAvailable();
          output = NIOUtils.directInts(G_SIZE_MAX);
          output2 = NIOUtils.directInts(G_SIZE_MAX);
          // Mark every slot as "not written" so records produced by the kernel can be told apart.
          for (int i = 0; i < G_SIZE_MAX; i++) {
            output.put(i, Integer.MIN_VALUE);
          }

          long time = buildAndExecuteKernel(output, src, G_SIZE, L_SIZE);

          int entries = 0;
          IntBuffer results = output2;
          for (int base = 0; base + ITEMS <= G_SIZE_MAX; base += ITEMS) {
            int globalId = results.get(base);
            if (globalId == Integer.MIN_VALUE) {
              continue; // record was not written by any work-item
            }
            entries++;
            // The junk value is printed to check correctness.
            if (DEBUG) {
              System.out.printf("gl_id:%8d(max:%8d), work_dim:%3d: lid:%2d(max:%2d): gr_id:%8d(max:%8d):junk:%8d\n",
                  globalId, results.get(base + 1), results.get(base + 2), results.get(base + 3),
                  results.get(base + 4), results.get(base + 5), results.get(base + 6), results.get(base + 7));
            }
          }
          System.out.printf("#Global:%8d: Local:%3d: computed in :%10d microsec: entries:%10d: ns-per-entry:%10d\n",
              G_SIZE, L_SIZE, (time / 1000), entries, (time / G_SIZE));
        } catch (Exception e) {
          // printStackTrace() already prints the exception's toString() as its first line.
          e.printStackTrace();
        }
      }

      /**
       * Rejects GLOBAL/LOCAL values the program cannot run with.
       *
       * @throws IllegalArgumentException if a size is non-positive, LOCAL does not divide GLOBAL,
       *         or the buffers would not fit in an int-indexed buffer
       */
      private static void checkSizes() {
        if (G_SIZE < 1 || L_SIZE < 1) {
          throw new IllegalArgumentException("GLOBAL and LOCAL must be >= 1, got GLOBAL=" + G_SIZE + ", LOCAL=" + L_SIZE);
        }
        if (G_SIZE % L_SIZE != 0) {
          throw new IllegalArgumentException("LOCAL=" + L_SIZE + " must evenly divide GLOBAL=" + G_SIZE);
        }
        if ((long) G_SIZE * SAFETY_FACTOR > Integer.MAX_VALUE) {
          throw new IllegalArgumentException("GLOBAL=" + G_SIZE + " is too large: GLOBAL*" + SAFETY_FACTOR + " must fit in an int");
        }
      }

      /**
       * Builds the program, runs the kernel once and reads the result back into {@code output2}.
       *
       * <p>The measured interval starts after the kernel object exists; it covers device-buffer
       * creation, argument binding, kernel execution and the blocking read-back. The device
       * buffer is released once the result has been copied.
       *
       * @param out   host buffer backing the device buffer
       * @param src   OpenCL C source containing the kernel named by PRG_NAME
       * @param gsize global work size
       * @param lsize local work size
       * @return elapsed wall-clock time in nanoseconds
       * @throws CLBuildException if the program does not build
       */
      private static long buildAndExecuteKernel(IntBuffer out, String src, int gsize, int lsize)
          throws CLBuildException, IOException {
        CLContext context = createBestContext();
        CLQueue queue = context.createDefaultQueue();
        CLProgram program = context.createProgram(src).build();

        CLKernel kernel = program.createKernel(PRG_NAME);
        long startTime = System.nanoTime();
        CLIntBuffer out1 = context.createIntBuffer(CLMem.Usage.Output, out, false);
        try {
          kernel.setArgs(out1);

          CLEvent kernelCompletion = kernel.enqueueNDRange(queue, new int[]{gsize}, new int[]{lsize});
          kernelCompletion.waitFor();
          queue.finish();

          // Copy the OpenCL-hosted array back to RAM
          out1.read(queue, output2, true);
          return System.nanoTime() - startTime;
        } finally {
          // Runs after the elapsed time has been computed, so it is not part of the measurement.
          out1.release();
        }
      }
    }
    

    Program output

    java -DGLOBAL=64 -DLOCAL=4 com.nativelibs4java.opencl.demos.NDRange1
    
    gl_id     = get_global_id(0)
    max       = get_global_size(0)
    work_dim  = get_work_dim()
    lid       = get_local_id(0)
    max       = get_local_size(0)
    gr_id     = get_group_id(0)
    max       = get_num_groups(0)
    
    gl_id:       0(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       0(max:      16):junk: 9999999
    gl_id:       1(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       0(max:      16):junk: 9999999
    gl_id:       2(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       0(max:      16):junk: 9999999
    gl_id:       3(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       0(max:      16):junk: 9999999
    gl_id:       4(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       1(max:      16):junk: 9999999
    gl_id:       5(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       1(max:      16):junk: 9999999
    gl_id:       6(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       1(max:      16):junk: 9999999
    gl_id:       7(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       1(max:      16):junk: 9999999
    gl_id:       8(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       2(max:      16):junk: 9999999
    gl_id:       9(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       2(max:      16):junk: 9999999
    gl_id:      10(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       2(max:      16):junk: 9999999
    gl_id:      11(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       2(max:      16):junk: 9999999
    gl_id:      12(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       3(max:      16):junk: 9999999
    gl_id:      13(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       3(max:      16):junk: 9999999
    gl_id:      14(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       3(max:      16):junk: 9999999
    gl_id:      15(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       3(max:      16):junk: 9999999
    gl_id:      16(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       4(max:      16):junk: 9999999
    gl_id:      17(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       4(max:      16):junk: 9999999
    gl_id:      18(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       4(max:      16):junk: 9999999
    gl_id:      19(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       4(max:      16):junk: 9999999
    gl_id:      20(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       5(max:      16):junk: 9999999
    gl_id:      21(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       5(max:      16):junk: 9999999
    gl_id:      22(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       5(max:      16):junk: 9999999
    gl_id:      23(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       5(max:      16):junk: 9999999
    gl_id:      24(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       6(max:      16):junk: 9999999
    gl_id:      25(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       6(max:      16):junk: 9999999
    gl_id:      26(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       6(max:      16):junk: 9999999
    gl_id:      27(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       6(max:      16):junk: 9999999
    gl_id:      28(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       7(max:      16):junk: 9999999
    gl_id:      29(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       7(max:      16):junk: 9999999
    gl_id:      30(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       7(max:      16):junk: 9999999
    gl_id:      31(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       7(max:      16):junk: 9999999
    gl_id:      32(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       8(max:      16):junk: 9999999
    gl_id:      33(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       8(max:      16):junk: 9999999
    gl_id:      34(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       8(max:      16):junk: 9999999
    gl_id:      35(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       8(max:      16):junk: 9999999
    gl_id:      36(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       9(max:      16):junk: 9999999
    gl_id:      37(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       9(max:      16):junk: 9999999
    gl_id:      38(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       9(max:      16):junk: 9999999
    gl_id:      39(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       9(max:      16):junk: 9999999
    gl_id:      40(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      10(max:      16):junk: 9999999
    gl_id:      41(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      10(max:      16):junk: 9999999
    gl_id:      42(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      10(max:      16):junk: 9999999
    gl_id:      43(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      10(max:      16):junk: 9999999
    gl_id:      44(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      11(max:      16):junk: 9999999
    gl_id:      45(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      11(max:      16):junk: 9999999
    gl_id:      46(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      11(max:      16):junk: 9999999
    gl_id:      47(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      11(max:      16):junk: 9999999
    gl_id:      48(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      12(max:      16):junk: 9999999
    gl_id:      49(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      12(max:      16):junk: 9999999
    gl_id:      50(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      12(max:      16):junk: 9999999
    gl_id:      51(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      12(max:      16):junk: 9999999
    gl_id:      52(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      13(max:      16):junk: 9999999
    gl_id:      53(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      13(max:      16):junk: 9999999
    gl_id:      54(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      13(max:      16):junk: 9999999
    gl_id:      55(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      13(max:      16):junk: 9999999
    gl_id:      56(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      14(max:      16):junk: 9999999
    gl_id:      57(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      14(max:      16):junk: 9999999
    gl_id:      58(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      14(max:      16):junk: 9999999
    gl_id:      59(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      14(max:      16):junk: 9999999
    gl_id:      60(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      15(max:      16):junk: 9999999
    gl_id:      61(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      15(max:      16):junk: 9999999
    gl_id:      62(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      15(max:      16):junk: 9999999
    gl_id:      63(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      15(max:      16):junk: 9999999
    #Global:      64: Local:  4: computed in :      9519 microsec: entries:        64: ns-per-entry:    148744