Consider the following code that computes the first 1024 Fibonacci numbers, and does so 1024 times over:

#include <iostream>

#include <Windows.h>

#include <CL/cl.h>

int main(int argc, char ** argv) {

cl_platform_id platform;

clGetPlatformIDs(1, &platform, NULL);

cl_device_id device;

clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);

const char *source =

"__kernel void fibonacci(__global double* dst) {\n"

" __local double buff[1026];\n"

" buff[0] = 0, buff[1] = 1;\n"

" for (int i = 0; i < 1024; i++) {\n"

" for (int j = 0; j < 1024; j++)\n"

" buff[j+2] = buff[j+1] + buff[j];\n"

" async_work_group_copy(&dst[i*1024], &buff[2], 1024, 0);\n"

" }\n"

"}\n";

const size_t global_work_size = 1;

cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);

clBuildProgram( program, 1, &device, NULL, NULL, NULL);

cl_kernel kernel = clCreateKernel( program, "fibonacci", NULL);

cl_mem buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 1024 * 1024 * 8, NULL, NULL);

clSetKernelArg(kernel, 0, sizeof(buf), (void*)&buf);

LARGE_INTEGER pcFreq = {}, pcStart = {}, pcEnd = {};

QueryPerformanceFrequency(&pcFreq);

QueryPerformanceCounter(&pcStart);

clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);

clFinish(queue);

QueryPerformanceCounter(&pcEnd);

std::cout << 8.0 * pcFreq.QuadPart / (pcEnd.QuadPart-pcStart.QuadPart) << "MB/sec";

}

Running on an i5-2500, the benchmarks are:

**As-is:** 21 MB/sec

**Memory transfer commented out:** 2814 MB/sec

**Inner for loop commented out:** 38 MB/sec

Clearly the memory transfer is taking the bulk of the time, and the computation of Fibonacci numbers hardly any time at all. The way to speed it up is to speed up the memory write, but what could possibly be faster than async_work_group_copy()? It turns out there is a bit of intelligent cache maintenance going on behind the scenes. If we can write to buff[] from multiple work units, then async_work_group_copy() can pull the data from the memory associated with multiple work units, and it goes much faster.

But how can Fibonacci be parallelized, when it is seemingly a serial, recursive calculation? We can do so with lookahead. Based on the basic calculation x2 = x1 + x0, we have:

x3 = x2 + x1

= x1 + x0 + x1

= 2 * x1 + x0

x4 = x3 + x2

= 2 * x1 + x0 + x1 + x0

= 3 * x1 + 2 * x0

x5 = x4 + x3

= 3 * x1 + 2 * x0 + 2 * x1 + x0

= 5 * x1 + 3 * x0

x6 = x5 + x4

= 5 * x1 + 3 * x0 + 3 * x1 + 2 * x0

= 8 * x1 + 5 * x0

x7 = x6 + x5

= 8 * x1 + 5 * x0 + 5 * x1 + 3 * x0

= 13 * x1 + 8 * x0

x8 = x7 + x6

= 13 * x1 + 8 * x0 + 8 * x1 + 5 * x0

= 21 * x1 + 13 * x0

x9 = x8 + x7

= 21 * x1 + 13 * x0 + 13 * x1 + 8 * x0

= 34 * x1 + 21 * x0

And our new parallel code is:

// Parallel kernel: work-item id computes buff[j+2+id] directly from buff[j]
// and buff[j+1] using the lookahead coefficients coef[id] = {F(id+2), F(id+1)}
// derived above, so 8 consecutive Fibonacci numbers are produced per step.
//
// Two synchronization defects in the original are fixed here:
//  1. Each j-iteration reads buff[j]/buff[j+1], which were written by
//     work-items 6 and 7 of the PREVIOUS iteration.  Without a
//     barrier(CLK_LOCAL_MEM_FENCE) between iterations (and after the
//     unsynchronized init of buff[0]/buff[1]) that is a data race on
//     __local memory.  The barrier requires all 8 work-items to run in a
//     single work-group, so the host should pass local_work_size = 8
//     (with global size 8 that is the only legal grouping anyway).
//  2. async_work_group_copy() returns an event_t that must be waited on
//     with wait_group_events() before buff[] is reused or the kernel exits.
const char *source =
    "__kernel void fibonacci(__global double* dst) {\n"
    "    __local double buff[1026];\n"
    "    __private double coef[8][2] = {{1,1}, {2,1}, {3,2}, {5,3},\n"
    "                                   {8,5}, {13,8}, {21,13}, {34,21}};\n"
    "    buff[0] = 0, buff[1] = 1;\n"
    "    barrier(CLK_LOCAL_MEM_FENCE);\n"
    "    for (int i = 0; i < 1024; i++) {\n"
    "        for (int j = 0; j < 1024; j += 8) {\n"
    "            buff[j+2+get_global_id(0)] =\n"
    "                coef[get_global_id(0)][0] * buff[j+1]\n"
    "                + coef[get_global_id(0)][1] * buff[j];\n"
    "            barrier(CLK_LOCAL_MEM_FENCE);\n"
    "        }\n"
    "        event_t e = async_work_group_copy(&dst[i*1024], &buff[2], 1024, 0);\n"
    "        wait_group_events(1, &e);\n"
    "    }\n"
    "}\n";

// 8 work-items, one per lookahead coefficient pair.
const size_t global_work_size = 8;

This runs at

**8 work units:** 122 MB/sec

That's a 6x speedup for increasing the number of work units by 8x! We could no doubt speed it up even more by increasing the look-ahead to increase the number of work units.

Recall that when we commented out the computation

*completely* it was still only 38 MB/sec, so the speedup comes from parallelizing the memory writes, not from parallelizing the computation.

Thanks once again to the folks at stackoverflow.com in helping me work through this.

## No comments:

Post a Comment