Nav apraksta

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. #include <stdio.h>
  2. #include <stdlib.h>
  3. #include <math.h>
  4. #include <CL/opencl.h>
  /*
   * OpenCL C source text of the vecAdd kernel, handed to the OpenCL runtime
   * for compilation at run time.
   *
   * The kernel enables the cl_khr_fp64 extension and takes three double
   * buffers (a, b, c) plus an element count n. Each work item reads its
   * global id in dimension 0 and, when that id is below n, stores
   * a[id] + b[id] into c[id].
   *
   * Adjacent string literals are concatenated by the compiler, so the pieces
   * below form a single NUL-terminated string.
   */
  const char *kernelSource =
      "\n"
      "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n"
      "__kernel void vecAdd( __global double *a, \n"
      " __global double *b, \n"
      " __global double *c, \n"
      " const unsigned int n) \n"
      "{ \n"
      " //Get our global thread ID \n"
      " int id = get_global_id(0); \n"
      " \n"
      " //Make sure we do not go out of bounds \n"
      " if (id < n) \n"
      " c[id] = a[id] + b[id]; \n"
      "} \n"
      "\n";
  21. int main( int argc, char* argv[] )
  22. {
  23. // Length of vectors
  24. unsigned int n = 100000;
  25. // Host input vectors
  26. double *h_a;
  27. double *h_b;
  28. // Host output vector
  29. double *h_c;
  30. // Device input buffers
  31. cl_mem d_a;
  32. cl_mem d_b;
  33. // Device output buffer
  34. cl_mem d_c;
  35. cl_platform_id cpPlatform; // OpenCL platform
  36. cl_device_id device_id; // device ID
  37. cl_context context; // context
  38. cl_command_queue queue; // command queue
  39. cl_program program; // program
  40. cl_kernel kernel; // kernel
  41. // Size, in bytes, of each vector
  42. size_t bytes = n*sizeof(double);
  43. // Allocate memory for each vector on host
  44. h_a = (double*)malloc(bytes);
  45. h_b = (double*)malloc(bytes);
  46. h_c = (double*)malloc(bytes);
  47. // Initialize vectors on host
  48. int i;
  49. for( i = 0; i < n; i++ )
  50. {
  51. h_a[i] = sinf(i)*sinf(i);
  52. h_b[i] = cosf(i)*cosf(i);
  53. }
  54. size_t globalSize, localSize;
  55. cl_int err;
  56. // Number of work items in each local work group
  57. localSize = 64;
  58. // Number of total work items - localSize must be devisor
  59. globalSize = ceil(n/(float)localSize)*localSize;
  60. // Bind to platform
  61. err = clGetPlatformIDs(1, &cpPlatform, NULL);
  62. // Get ID for the device
  63. err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
  64. // Create a context
  65. context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  66. // Create a command queue
  67. queue = clCreateCommandQueue(context, device_id, 0, &err);
  68. // Create the compute program from the source buffer
  69. program = clCreateProgramWithSource(context, 1,
  70. (const char **) & kernelSource, NULL, &err);
  71. // Build the program executable
  72. clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  73. // Create the compute kernel in the program we wish to run
  74. kernel = clCreateKernel(program, "vecAdd", &err);
  75. // Create the input and output arrays in device memory for our calculation
  76. d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
  77. d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
  78. d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
  79. // Write our data set into the input array in device memory
  80. err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
  81. bytes, h_a, 0, NULL, NULL);
  82. err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
  83. bytes, h_b, 0, NULL, NULL);
  84. // Set the arguments to our compute kernel
  85. err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
  86. err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
  87. err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
  88. err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
  89. // Execute the kernel over the entire range of the data set
  90. err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
  91. 0, NULL, NULL);
  92. // Wait for the command queue to get serviced before reading back results
  93. clFinish(queue);
  94. // Read the results from the device
  95. clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
  96. bytes, h_c, 0, NULL, NULL );
  97. //Sum up vector c and print result divided by n, this should equal 1 within error
  98. double sum = 0;
  99. for(i=0; i<n; i++)
  100. sum += h_c[i];
  101. printf("final result: %f\n", sum/n);
  102. // release OpenCL resources
  103. clReleaseMemObject(d_a);
  104. clReleaseMemObject(d_b);
  105. clReleaseMemObject(d_c);
  106. clReleaseProgram(program);
  107. clReleaseKernel(kernel);
  108. clReleaseCommandQueue(queue);
  109. clReleaseContext(context);
  110. //release host memory
  111. free(h_a);
  112. free(h_b);
  113. free(h_c);
  114. return 0;
  115. }