CPU/GPU Multiprecision Mandelbrot Set

Eric Bainville - Dec 2009

Vectorized fp128 for OpenCL

As we have seen in the previous page, the performance of a direct port of C code to OpenCL is poor. In this page we will see how we can implement fp128 128-bit fixed-point reals specifically for OpenCL. In the code, the corresponding file is fp128.cl.

Each number will be represented by a signed 128-bit integer, stored in two's complement on four 32-bit words. In OpenCL, we will store these four words into a vector uint4. The real value corresponding to integer u will be u/2⁹⁶: the integer part of the real is the first word.

For example the real 2.5 will be represented by (uint4)(2,0x80000000,0,0), and its opposite -2.5 by (uint4)(0xFFFFFFFD,0x80000000,0,0).

Writing code specifically for OpenCL makes testing a little harder. Unit tests must be written in OpenCL kernels. In the C code, see UnitTests.cpp, activated with CONFIG_UNIT_TESTS in appMain.cpp. In the OpenCL code, the test kernel is at the end of fp128.cl.

Low level fp128 functions

We show here how to implement the base functions manipulating 128-bit signed integers.

// Increment U
uint4 inc128(uint4 u)
{
  // Compute all carries to add
  int4 h = (u == (uint4)(0xFFFFFFFF)); // Note that == sets ALL bits if true [6.3.d]
  uint4 c = (uint4)(h.y&h.z&h.w&1,h.z&h.w&1,h.w&1,1);
  return u+c;
}

We compute all the carries at the same time before adding them to the result. The negation is implemented using one's complement and increment:

// Return -U
uint4 neg128(uint4 u)
{
  return inc128(u ^ (uint4)(0xFFFFFFFF)); // (1 + ~U) is two's complement
}

The sum is a little tricky, because we have to handle the carry of each sum, and then propagate it if needed:

// Return U+V
uint4 add128(uint4 u,uint4 v)
{
  uint4 s = u+v;
  uint4 h = (uint4)(s < u);
  uint4 c1 = h.yzwx & (uint4)(1,1,1,0); // Carry from U+V
  h = (uint4)(s == (uint4)(0xFFFFFFFF));
  uint4 c2 = (uint4)((c1.y|(c1.z&h.z))&h.y,c1.z&h.z,0,0); // Propagated carry
  return s+c1+c2;
}

Left shift and right shift are symmetric, and illustrate how convenient the vector component selection syntax can be to access a permuted vector:

// Return U<<1
uint4 shl128(uint4 u)
{
  uint4 h = (u>>(uint4)(31)) & (uint4)(0,1,1,1); // Bits to move up
  return (u<<(uint4)(1)) | h.yzwx;
}

// Return U>>1
uint4 shr128(uint4 u)
{
  uint4 h = (u<<(uint4)(31)) & (uint4)(0x80000000,0x80000000,0x80000000,0); // Bits to move down
  return (u>>(uint4)(1)) | h.wxyz;
}

Product by a single word requires 7 32-bit products and 1 add128:

// Return U*K.
// U MUST be positive.
uint4 mul128u(uint4 u,uint k)
{
  uint4 s1 = u * (uint4)(k);
  uint4 s2 = (uint4)(mul_hi(u.y,k),mul_hi(u.z,k),mul_hi(u.w,k),0);
  return add128(s1,s2);
}

fp128 multiplication

The last function we need to implement Mandelbrot Set computations is the real x real product. We should truncate the result to maintain the 128-bit size and the position of the decimal point. The table below shows the destination words (noted xyzw) of the high and low words of the product of each pair of components of u and v:

            U
         x  y  z  w
      ---------------
    x | 'x xy yz zw |     ' = discarded value (overflow)
 V  y | xy yz zw w. |     . = discarded value (underflow)
    z | yz zw w. .. |
    w | zw w. .. .. |
      ---------------

In the result, the x component will be the sum of 3 values, 5 values for y, 7 values for z and w. In total we have to compute 22 32-bit products and 6 add128. When both operands are equal, it can be lowered to 14 products and 6 add128 + 6 shl128. Since in our application a majority of the products are squares it is interesting here to provide a dedicated square function.

// Return U*V truncated to keep the position of the decimal point.
// U and V MUST be positive.
uint4 mulfpu(uint4 u,uint4 v)
{
  // Diagonal coefficients
  uint4 s = (uint4)(u.x*v.x,mul_hi(u.y,v.y),u.y*v.y,mul_hi(u.z,v.z));
  // Off-diagonal
  uint4 t1 = (uint4)(mul_hi(u.x,v.y),u.x*v.y,mul_hi(u.x,v.w),u.x*v.w);
  uint4 t2 = (uint4)(mul_hi(v.x,u.y),v.x*u.y,mul_hi(v.x,u.w),v.x*u.w);
  s = add128(s,add128(t1,t2));
  t1 = (uint4)(0,mul_hi(u.x,v.z),u.x*v.z,mul_hi(u.y,v.w));
  t2 = (uint4)(0,mul_hi(v.x,u.z),v.x*u.z,mul_hi(v.y,u.w));
  s = add128(s,add128(t1,t2));
  t1 = (uint4)(0,0,mul_hi(u.y,v.z),u.y*v.z);
  t2 = (uint4)(0,0,mul_hi(v.y,u.z),v.y*u.z);
  s = add128(s,add128(t1,t2));
  // Add 3 to compensate truncation
  return add128(s,(uint4)(0,0,0,3));
}

// Return U*U truncated to keep the position of the decimal point.
// U MUST be positive.
uint4 sqrfpu(uint4 u)
{
  // Diagonal coefficients
  uint4 s = (uint4)(u.x*u.x,mul_hi(u.y,u.y),u.y*u.y,mul_hi(u.z,u.z));
  // Off-diagonal
  uint4 t = (uint4)(mul_hi(u.x,u.y),u.x*u.y,mul_hi(u.x,u.w),u.x*u.w);
  s = add128(s,shl128(t));
  t = (uint4)(0,mul_hi(u.x,u.z),u.x*u.z,mul_hi(u.y,u.w));
  s = add128(s,shl128(t));
  t = (uint4)(0,0,mul_hi(u.y,u.z),u.y*u.z);
  s = add128(s,shl128(t));
  // Add 3 to compensate truncation
  return add128(s,(uint4)(0,0,0,3));
}

Exactly computing the next word would require 7 additional 32-bit products to sum just to estimate the last 3 bits of the result. For speed reasons, I simply added 3, which is the mean value of the upcoming carry. Handling the signs is done trivially using the absolute values of the operands (there may be more subtle ways, but the cost would probably be comparable, if not higher).

Mandelbrot fp128 kernel

Finally, here is the Mandelbrot Set computation kernel using the new fp128 functions. The corresponding file is mandelbrot_fp128.cl.

__kernel void compute_fp128(__global uint * a,
			    __global uint * colormap,
			    __constant uint * coords,
			    int nx,int ny,
			    int offset,int lda,
			    int leftXSign,int topYSign,
			    int maxIt)
{
  // Convert inputs
  uint4 leftX = vload4(0,coords);
  uint4 topY  = vload4(1,coords);
  uint4 stepX = vload4(2,coords);
  uint4 stepY = vload4(3,coords);
  if (leftXSign < 0) leftX = neg128(leftX);
  if (topYSign < 0) topY = neg128(topY);

  for (int iy=0;iy<ny;iy++) for (int ix=0;ix<nx;ix++)
  {
    int xpix = get_global_id(0)*nx + ix;
    int ypix = get_global_id(1)*ny + iy;
    uint4 xc = add128(leftX,mul128(stepX,xpix)); // xc = leftX + xpix * stepX;
    uint4 yc = add128(topY,neg128(mul128(stepY,ypix))); // yc = topY - ypix * stepY;

    int it = 0;
    uint4 x = set128(0);
    uint4 y = set128(0);
    for (it=0;it<maxIt;it++)
    {
      uint4 x2 = sqrfp(x); // x2 = x^2
      uint4 y2 = sqrfp(y); // y2 = y^2
      uint4 aux = add128(x2,y2); // x^2+y^2
      if (aux.x >= 4) break; // Out!
      uint4 twoxy = shl128(mulfp(x,y)); // 2*x*y
      x = add128(xc,add128(x2,neg128(y2))); // x' = xc+x^2-y^2
      y = add128(yc,twoxy); // y' = yc+2*x*y
    }
    uint color = (it < maxIt)?colormap[it]:0xFF000000;
    a[offset+xpix+lda*ypix] = color;
  }
}

The next page shows some benchmarks of the various engines described here.