OpenCL Sorting

Eric Bainville - June 2011

Parallel bitonic, local memory acceleration

We will now use all three levels of the GPU memory system: global memory, local memory, and registers. We operate in global memory while inc is larger than the workgroup size WG; once inc ≤ WG, every compared pair falls inside the 2*WG-element span handled by a single workgroup, and we can terminate the loop on inc inside a single kernel. The following kernel C2_pre does this, each thread processing two values at each iteration (it is based on kernel B2). To sort the entire input vector, we now call a mix of B2, B4, B8,... and then C2_pre to terminate each inc loop as soon as inc ≤ WG.

__kernel void ParallelBitonic_C2_pre(__global data_t * data,int inc,int dir)
{
  int t = get_global_id(0); // thread index

  // Terminate the INC loop inside the workgroup
  for ( ;inc>0;inc>>=1)
  {
    int low = t & (inc - 1); // low order bits (below INC)
    int i = (t<<1) - low; // insert 0 at position INC (e.g. inc=4, t=5=101b gives i=9=1001b)
    bool reverse = ((dir & i) == 0); // asc/desc order (read by the ORDER macro)

    barrier(CLK_GLOBAL_MEM_FENCE); // workgroup barrier is enough: inc <= WG keeps all pairs inside one workgroup

    // Load
    data_t x0 = data[i];
    data_t x1 = data[i+inc];

    // Sort
    ORDER(x0,x1)

    barrier(CLK_GLOBAL_MEM_FENCE);

    // Store
    data[i] = x0;
    data[i+inc] = x1;
  }
}
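
The ORDER macro is the compare-and-exchange primitive shared with the B kernels; note that it reads the reverse flag computed just above it. Its definition is not repeated in this section; a minimal sketch of a compatible definition, assuming the getKey accessor and data_t type used for the other kernels, would be:

// Sorts the pair (a,b) in the direction selected by 'reverse', which
// must be visible at the expansion site (sketch, not the original code)
#define ORDER(a,b) { bool swap = reverse ^ (getKey(a)<getKey(b)); \
  data_t auxa = a; data_t auxb = b; \
  a = (swap)?auxb:auxa; b = (swap)?auxa:auxb; }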

Kernel C2_pre runs at 40 Mkey/s, a little slower than kernel B2 alone (49 Mkey/s for Key+Value). Now let's replace all intermediate storage in global memory with local memory accesses. This is kernel C2:

__kernel void ParallelBitonic_C2(__global data_t * data,int inc0,int dir,__local data_t * aux)
{
  int t = get_global_id(0); // thread index
  int wgBits = 2*get_local_size(0) - 1; // bit mask to get index in local memory AUX (size is 2*WG; assumes WG is a power of two)

  for (int inc=inc0;inc>0;inc>>=1)
  {
    int low = t & (inc - 1); // low order bits (below INC)
    int i = (t<<1) - low; // insert 0 at position INC
    bool reverse = ((dir & i) == 0); // asc/desc order
    data_t x0,x1;

    // Load
    if (inc == inc0)
    {
      // First iteration: load from global memory
      x0 = data[i];
      x1 = data[i+inc];
    }
    else
    {
      // Other iterations: load from local memory
      barrier(CLK_LOCAL_MEM_FENCE);
      x0 = aux[i & wgBits];
      x1 = aux[(i+inc) & wgBits];
    }

    // Sort
    ORDER(x0,x1)

    // Store
    if (inc == 1)
    {
      // Last iteration: store to global memory
      data[i] = x0;
      data[i+inc] = x1;
    }
    else
    {
      // Other iterations: store to local memory
      barrier(CLK_LOCAL_MEM_FENCE);
      aux[i & wgBits] = x0;
      aux[(i+inc) & wgBits] = x1;
    }
  }
}
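
A __local kernel argument like aux has no host-side buffer: the host passes only its size to clSetKernelArg, with a NULL pointer. A possible host-side sketch (the helper runC2 and the Key-only cl_uint element type are illustrative assumptions, not code from this article):

#include <CL/cl.h>

// Enqueue one ParallelBitonic_C2 launch, terminating the loop from inc0
// down to 1 (assumes data_t = uint on the device, and a power-of-two
// workgroup size with inc0 <= WG)
static cl_int runC2(cl_command_queue queue, cl_kernel kernelC2,
                    cl_mem data, cl_int n, cl_int inc0, cl_int dir)
{
  size_t wg = 128;                 // workgroup size WG
  size_t threads = (size_t)n / 2;  // one thread per compared pair
  clSetKernelArg(kernelC2, 0, sizeof(cl_mem), &data);
  clSetKernelArg(kernelC2, 1, sizeof(cl_int), &inc0);
  clSetKernelArg(kernelC2, 2, sizeof(cl_int), &dir);
  // __local argument: size only, NULL pointer (2*WG elements of data_t)
  clSetKernelArg(kernelC2, 3, 2 * wg * sizeof(cl_uint), NULL);
  return clEnqueueNDRangeKernel(queue, kernelC2, 1, NULL,
                                &threads, &wg, 0, NULL, NULL);
}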

The local memory array aux has dimension 2*WG, since each thread of the workgroup handles 2 values. Kernel C2 runs at 92 Mkey/s for Key+Value sorting (combined with B2+B4+B8 for the larger inc values), and at 108 Mkey/s for Key only. Let's extend this to a C4 kernel processing 4 values per thread. Each 4-wide pass consumes two inc levels, so C4 is called only when the number of remaining levels log2(inc)+1 is even, i.e. for inc = 8, 32, 128 here:

__kernel void ParallelBitonic_C4(__global data_t * data,int inc0,int dir,__local data_t * aux)
{
  int t = get_global_id(0); // thread index
  int wgBits = 4*get_local_size(0) - 1; // bit mask to get index in local memory AUX (size is 4*WG)
  int inc,low,i;
  bool reverse;
  data_t x[4];

  // First iteration, global input, local output
  inc = inc0>>1;
  low = t & (inc - 1); // low order bits (below INC)
  i = ((t - low) << 2) + low; // insert 00 at position INC
  reverse = ((dir & i) == 0); // asc/desc order
  for (int k=0;k<4;k++) x[k] = data[i+k*inc];
  B4V(x,0);
  for (int k=0;k<4;k++) aux[(i+k*inc) & wgBits] = x[k];
  barrier(CLK_LOCAL_MEM_FENCE);

  // Internal iterations, local input and output
  for ( ;inc>1;inc>>=2)
  {
    low = t & (inc - 1); // low order bits (below INC)
    i = ((t - low) << 2) + low; // insert 00 at position INC
    reverse = ((dir & i) == 0); // asc/desc order
    for (int k=0;k<4;k++) x[k] = aux[(i+k*inc) & wgBits];
    B4V(x,0);
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int k=0;k<4;k++) aux[(i+k*inc) & wgBits] = x[k];
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  // Final iteration, local input, global output, INC=1
  i = t << 2;
  reverse = ((dir & i) == 0); // asc/desc order
  for (int k=0;k<4;k++) x[k] = aux[(i+k) & wgBits];
  B4V(x,0);
  for (int k=0;k<4;k++) data[i+k] = x[k];
}
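
B4V is the 4-wide counterpart of ORDER: it runs a 4-element bitonic merge on the register array x (one distance-2 stage, then two distance-1 exchanges), again steered by the reverse flag. A sketch of compatible definitions, under the same assumptions as the ORDER sketch above:

// Compare-and-exchange of x[a] and x[b], direction given by 'reverse'
#define ORDERV(x,a,b) { bool swap = reverse ^ (getKey(x[a])<getKey(x[b])); \
  data_t auxa = x[a]; data_t auxb = x[b]; \
  x[a] = (swap)?auxb:auxa; x[b] = (swap)?auxa:auxb; }
#define B2V(x,a) { ORDERV(x,a,a+1) }
// 4-element bitonic merge at offset a: stages (a,a+2),(a+1,a+3),
// then (a,a+1) and (a+2,a+3)
#define B4V(x,a) { for (int i4=0;i4<2;i4++) { ORDERV(x,a+i4,a+i4+2) } \
  B2V(x,a) B2V(x,a+2) }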

We now reach 131 Mkey/s for Key+Value, and 190 Mkey/s for Key only. Writing specialized kernels for inc=128, 32, 8 with the inc loop fully unrolled would probably increase performance even further.
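
For reference, the host-side dispatch behind the C4+B* line could look like the following sketch. It consumes inc levels in global memory with B2/B4/B8 until C4 can terminate the phase in local memory (enqueueB2/B4/B8/C4 are hypothetical wrappers around clSetKernelArg and clEnqueueNDRangeKernel; thresholds assume WG = 128):

// One bitonic merge phase: processes inc = length, length/2, ..., 1.
// The full sort would run: for (length = 1; length < n; length <<= 1)
//   bitonicPhase(length, length << 1); // dir bit as in the B kernels
static void bitonicPhase(int length, int dir)
{
  int inc = length;
  while (inc > 0)
  {
    if (inc == 8 || inc == 32 || inc == 128)
    {
      enqueueC4(inc, dir); // terminates all remaining levels at once
      return;
    }
    if (inc >= 1024)     { enqueueB8(inc, dir); inc >>= 3; } // 3 levels
    else if (inc >= 512) { enqueueB4(inc, dir); inc >>= 2; } // 2 levels
    else                 { enqueueB2(inc, dir); inc >>= 1; } // 1 level
  }
}

Small phases (inc < 8) stay in B2 in this sketch; they could equally be terminated by C2, which handles any inc ≤ WG.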

ParallelBitonic (updated with C* kernels), rates in Mkey/s

Kernels     Key+Value   Key
B2              49        73
B2+B4           78       115
B2+B4+B8       103       160
C2+B*           92       108
C4+B*          131       190