Wednesday, April 21, 2010

Information about OpenCL Global size and Local size dimensions

To better understand the global and local work sizes passed to the OpenCL API call clEnqueueNDRangeKernel, I wrote a small Java program using JavaCL, the nice OpenCL binding from Olivier Chafik's nativelibs4java project. Some more links on NDRange:
Understanding NDRange

Java program

package com.nativelibs4java.opencl.demos;

import static com.nativelibs4java.opencl.JavaCL.createBestContext;
import java.io.*;
import java.nio.*;
import com.nativelibs4java.opencl.*;
import com.nativelibs4java.util.*;
/*
 * Usage: java [-DGLOBAL=256] [-DLOCAL=4] [-DDEBUG=true] com.nativelibs4java.opencl.demos.NDRange1
 *
 * The values shown are the defaults used when a property is not set.
 */
/**
 * Small experiment that launches a one-dimensional OpenCL kernel and prints, for every
 * work-item, the values reported by the work-item query functions (global id/size,
 * work dimension, local id/size, group id and number of groups).
 */
public class NDRange1 {
    private static final String PRG_NAME = "ndrange1";

    /**
     * Number of ints written by each work-item. It is also the stride, in ints, between
     * two consecutive records in the output buffer, on both the kernel and the host side.
     */
    private static final int ITEMS = 8;

    /**
     * Kernel source. Each work-item fills one ITEMS-int record starting at
     * get_global_id(0) * ITEMS; the last slot holds a fixed marker value.
     */
    private static final String src = "__kernel void "+ PRG_NAME
    + "("
    + "   __global int* output                                             \n"
    + "   )                                           \n"
    + "{                                                                      \n"
    + "   int i = get_global_id(0)*" + ITEMS + ";                               \n"
    + "   output[i] = get_global_id(0);                                \n"
    + "   output[i+1] = get_global_size(0);                                \n"
    + "   output[i+2] = get_work_dim();                                \n"
    + "   output[i+3] = get_local_id(0);                                \n"
    + "   output[i+4] = get_local_size(0);                                \n"
    + "   output[i+5] = get_group_id(0);                                \n"
    + "   output[i+6] = get_num_groups(0);                                \n"
    + "   output[i+7] = 9999999;                                \n"
    + "}                                                                      \n"
    + "\n";

    /** Global work size, from system property GLOBAL (default 256). */
    private static final int G_SIZE = Integer.getInteger("GLOBAL", 256);
    /** Local work size, from system property LOCAL (default 4). */
    private static final int L_SIZE = Integer.getInteger("LOCAL", 4);
    /** When true (the default), one line is printed per record found in the result buffer. */
    private static final boolean DEBUG = Boolean.parseBoolean(System.getProperty("DEBUG", "true"));

    // Room for 128 ints per work-item although only ITEMS are written; the extra is just for safety.
    private static final int G_SIZE_MAX = G_SIZE * 128;

    /** Host buffer handed to OpenCL when the device buffer is created; pre-filled with a sentinel. */
    private static IntBuffer output = NIOUtils.directInts(G_SIZE_MAX);
    /** Host buffer that receives the device buffer contents after the kernel has run. */
    private static IntBuffer output2 = NIOUtils.directInts(G_SIZE_MAX);

    /**
     * Fills the input buffer with a sentinel, runs the kernel, then scans the result buffer
     * and prints every record whose first slot is not the sentinel, followed by a summary line.
     *
     * @param args ignored; configuration comes from the GLOBAL, LOCAL and DEBUG system properties
     */
    public static void main(String[] args) {
        try {
            SetupUtils.failWithDownloadProposalsIfOpenCLNotAvailable();
            // Integer.MIN_VALUE marks "not written by any work-item".
            for (int i = 0; i < G_SIZE_MAX; i++) {
                output.put(i, Integer.MIN_VALUE);
            }

            long time = buildAndExecuteKernel(output, src, G_SIZE, L_SIZE);

            int entries = 0;
            IntBuffer results = output2;
            for (int i = 0; i < G_SIZE_MAX; i++) {
                int v = results.get(i);
                if (v == Integer.MIN_VALUE) {
                    continue;
                }
                entries++;
                // The junk marker is printed too, as a check that the whole record was read correctly.
                if (DEBUG) {
                    System.out.printf("gl_id:%8d(max:%8d), work_dim:%3d: lid:%2d(max:%2d): gr_id:%8d(max:%8d):junk:%8d\n",
                            v, results.get(i + 1), results.get(i + 2), results.get(i + 3),
                            results.get(i + 4), results.get(i + 5), results.get(i + 6), results.get(i + 7));
                }
                // Skip the remaining slots of this record; the loop increment moves to the next one.
                i += ITEMS - 1;
            }
            System.out.printf("#Global:%8d: Local:%3d: computed in :%10d microsec: entries:%10d: ns-per-entry:%10d\n",
                    G_SIZE, L_SIZE, (time / 1000), entries, (time / G_SIZE));
        } catch (Exception e) {
            // printStackTrace already starts with the exception's toString(), so no separate println.
            e.printStackTrace();
        }
    }

    /**
     * Builds the kernel, runs it over a one-dimensional range and copies the device buffer
     * into the static {@code output2} buffer.
     *
     * @param out   host buffer passed to the context when the device buffer is created
     *              (with the copy flag set to false; NOTE(review): presumably this makes OpenCL
     *              use the host memory directly, confirm against the JavaCL documentation)
     * @param src   OpenCL source containing a kernel named PRG_NAME
     * @param gsize global work size
     * @param lsize local work size
     * @return elapsed wall-clock nanoseconds covering buffer creation, kernel execution and
     *         read-back; context creation and program build are not included
     * @throws CLBuildException if the program fails to build
     * @throws IOException declared for callers; not thrown directly by this method's own code
     */
    private static long buildAndExecuteKernel(IntBuffer out, String src, int gsize, int lsize)
            throws CLBuildException, IOException {
        CLContext context = createBestContext();
        CLQueue queue = context.createDefaultQueue();
        CLProgram program = context.createProgram(src).build();

        CLKernel kernel = program.createKernel(PRG_NAME);
        long startTime = System.nanoTime();
        CLIntBuffer out1 = context.createIntBuffer(CLMem.Usage.Output, out, false);
        kernel.setArgs(out1);

        CLEvent kernelCompletion = kernel.enqueueNDRange(queue, new int[]{gsize}, new int[]{lsize});
        kernelCompletion.waitFor();
        queue.finish();

        // Copy the OpenCL-hosted array back to RAM (blocking read).
        out1.read(queue, output2, true);
        return System.nanoTime() - startTime;
    }
}

Program output

java -DGLOBAL=64 -DLOCAL=4 com.nativelibs4java.opencl.demos.NDRange1

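Legend for the columns below (each "max" belongs to the value printed just before it):

gl_id            = get_global_id(0)
max (of gl_id)   = get_global_size(0)
work_dim         = get_work_dim()
lid              = get_local_id(0)
max (of lid)     = get_local_size(0)
gr_id            = get_group_id(0)
max (of gr_id)   = get_num_groups(0)
junk             = constant 9999999 written by the kernel as a marker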

gl_id:       0(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       0(max:      16):junk: 9999999
gl_id:       1(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       0(max:      16):junk: 9999999
gl_id:       2(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       0(max:      16):junk: 9999999
gl_id:       3(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       0(max:      16):junk: 9999999
gl_id:       4(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       1(max:      16):junk: 9999999
gl_id:       5(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       1(max:      16):junk: 9999999
gl_id:       6(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       1(max:      16):junk: 9999999
gl_id:       7(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       1(max:      16):junk: 9999999
gl_id:       8(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       2(max:      16):junk: 9999999
gl_id:       9(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       2(max:      16):junk: 9999999
gl_id:      10(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       2(max:      16):junk: 9999999
gl_id:      11(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       2(max:      16):junk: 9999999
gl_id:      12(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       3(max:      16):junk: 9999999
gl_id:      13(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       3(max:      16):junk: 9999999
gl_id:      14(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       3(max:      16):junk: 9999999
gl_id:      15(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       3(max:      16):junk: 9999999
gl_id:      16(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       4(max:      16):junk: 9999999
gl_id:      17(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       4(max:      16):junk: 9999999
gl_id:      18(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       4(max:      16):junk: 9999999
gl_id:      19(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       4(max:      16):junk: 9999999
gl_id:      20(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       5(max:      16):junk: 9999999
gl_id:      21(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       5(max:      16):junk: 9999999
gl_id:      22(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       5(max:      16):junk: 9999999
gl_id:      23(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       5(max:      16):junk: 9999999
gl_id:      24(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       6(max:      16):junk: 9999999
gl_id:      25(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       6(max:      16):junk: 9999999
gl_id:      26(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       6(max:      16):junk: 9999999
gl_id:      27(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       6(max:      16):junk: 9999999
gl_id:      28(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       7(max:      16):junk: 9999999
gl_id:      29(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       7(max:      16):junk: 9999999
gl_id:      30(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       7(max:      16):junk: 9999999
gl_id:      31(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       7(max:      16):junk: 9999999
gl_id:      32(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       8(max:      16):junk: 9999999
gl_id:      33(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       8(max:      16):junk: 9999999
gl_id:      34(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       8(max:      16):junk: 9999999
gl_id:      35(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       8(max:      16):junk: 9999999
gl_id:      36(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:       9(max:      16):junk: 9999999
gl_id:      37(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:       9(max:      16):junk: 9999999
gl_id:      38(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:       9(max:      16):junk: 9999999
gl_id:      39(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:       9(max:      16):junk: 9999999
gl_id:      40(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      10(max:      16):junk: 9999999
gl_id:      41(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      10(max:      16):junk: 9999999
gl_id:      42(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      10(max:      16):junk: 9999999
gl_id:      43(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      10(max:      16):junk: 9999999
gl_id:      44(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      11(max:      16):junk: 9999999
gl_id:      45(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      11(max:      16):junk: 9999999
gl_id:      46(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      11(max:      16):junk: 9999999
gl_id:      47(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      11(max:      16):junk: 9999999
gl_id:      48(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      12(max:      16):junk: 9999999
gl_id:      49(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      12(max:      16):junk: 9999999
gl_id:      50(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      12(max:      16):junk: 9999999
gl_id:      51(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      12(max:      16):junk: 9999999
gl_id:      52(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      13(max:      16):junk: 9999999
gl_id:      53(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      13(max:      16):junk: 9999999
gl_id:      54(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      13(max:      16):junk: 9999999
gl_id:      55(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      13(max:      16):junk: 9999999
gl_id:      56(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      14(max:      16):junk: 9999999
gl_id:      57(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      14(max:      16):junk: 9999999
gl_id:      58(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      14(max:      16):junk: 9999999
gl_id:      59(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      14(max:      16):junk: 9999999
gl_id:      60(max:      64), work_dim:  1: lid: 0(max: 4): gr_id:      15(max:      16):junk: 9999999
gl_id:      61(max:      64), work_dim:  1: lid: 1(max: 4): gr_id:      15(max:      16):junk: 9999999
gl_id:      62(max:      64), work_dim:  1: lid: 2(max: 4): gr_id:      15(max:      16):junk: 9999999
gl_id:      63(max:      64), work_dim:  1: lid: 3(max: 4): gr_id:      15(max:      16):junk: 9999999
#Global:      64: Local:  4: computed in :      9519 microsec: entries:        64: ns-per-entry:    148744


No comments:

Post a Comment