主机程序如下: #include <stdio.h> #include <stdlib.h> #include <math.h> #include "CL/opencl.h" #include "AOCL_Utils.h" using namespace aocl_utils; // OpenCL runtime configuration cl_platform_id platform = NULL; unsigned num_devices = 0; scoped_array<cl_device_id> device; // num_devices elements cl_context context = NULL; scoped_array<cl_command_queue> queue; // num_devices elements cl_program program = NULL; scoped_array<cl_kernel> kernel; // num_devices elements scoped_array<cl_mem> input_a_buf; // num_devices elements scoped_array<cl_mem> input_b_buf; // num_devices elements scoped_array<cl_mem> output_buf; // num_devices elements // Problem data. const unsigned N = 1000000; // problem size scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements scoped_array<scoped_array<float> > ref_output; // num_devices elements scoped_array<unsigned> n_per_device; // num_devices elements // Function prototypes float rand_float(); bool init_opencl(); void init_problem(); void run(); void cleanup(); // Entry point. int main() { // Initialize OpenCL. if(!init_opencl()) { return -1; } // Initialize the problem data. // Requires the number of devices to be known. init_problem(); // Run the kernel. run(); // Free the resources allocated cleanup(); return 0; } /////// HELPER FUNCTIONS /////// // Randomly generate a floating-point number between -10 and 10. float rand_float() { return float(rand()) / float(RAND_MAX) * 20.0f - 10.0f; } // Initializes the OpenCL objects. bool init_opencl() { cl_int status; printf("Initializing OpenCL\n"); if(!setCwdToExeDir()) { return false; } // Get the OpenCL platform. platform = findPlatform("Altera"); if(platform == NULL) { printf("ERROR: Unable to find Altera OpenCL platform.\n"); return false; } // Query the available OpenCL device. device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices)); printf("Platform: %s\n", getPlatformName(platform).c_str()); printf("Using %d device(s)\n", num_devices); for(unsigned i = 0; i < num_devices; ++i) { printf(" %s\n", getDeviceName(device[i]).c_str()); } // Create the context. context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status); checkError(status, "Failed to create context"); // Create the program for all device. Use the first device as the // representative device (assuming all device are of the same type). std::string binary_file = getBoardBinaryFile("vectorAdd", device[0]); printf("Using AOCX: %s\n", binary_file.c_str()); program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices); // Build the program that was just created. status = clBuildProgram(program, 0, NULL, "", NULL, NULL); checkError(status, "Failed to build program"); // Create per-device objects. queue.reset(num_devices); kernel.reset(num_devices); n_per_device.reset(num_devices); input_a_buf.reset(num_devices); input_b_buf.reset(num_devices); output_buf.reset(num_devices); for(unsigned i = 0; i < num_devices; ++i) { // Command queue. queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status); checkError(status, "Failed to create command queue"); // Kernel. const char *kernel_name = "vectorAdd"; kernel[i] = clCreateKernel(program, kernel_name, &status); checkError(status, "Failed to create kernel"); // Determine the number of elements processed by this device. n_per_device[i] = N / num_devices; // number of elements handled by this device // Spread out the remainder of the elements over the first // N % num_devices. if(i < (N % num_devices)) { n_per_device[i]++; } // Input buffers. input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, n_per_device[i] * sizeof(float), NULL, &status); checkError(status, "Failed to create buffer for input A"); input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY, n_per_device[i] * sizeof(float), NULL, &status); checkError(status, "Failed to create buffer for input B"); // Output buffer. output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY, n_per_device[i] * sizeof(float), NULL, &status); checkError(status, "Failed to create buffer for output"); } return true; } // Initialize the data for the problem. Requires num_devices to be known. void init_problem() { if(num_devices == 0) { checkError(-1, "No devices"); } input_a.reset(num_devices); input_b.reset(num_devices); output.reset(num_devices); ref_output.reset(num_devices); // Generate input vectors A and B and the reference output consisting // of a total of N elements. // We create separate arrays for each device so that each device has an // aligned buffer. for(unsigned i = 0; i < num_devices; ++i) { input_a[i].reset(n_per_device[i]); input_b[i].reset(n_per_device[i]); output[i].reset(n_per_device[i]); ref_output[i].reset(n_per_device[i]); for(unsigned j = 0; j < n_per_device[i]; ++j) { input_a[i][j] = rand_float(); input_b[i][j] = rand_float(); ref_output[i][j] = input_a[i][j] + input_b[i][j]; } } } void run() { cl_int status; const double start_time = getCurrentTimestamp(); // Launch the problem for each device. scoped_array<cl_event> kernel_event(num_devices); scoped_array<cl_event> finish_event(num_devices); for(unsigned i = 0; i < num_devices; ++i) { // Transfer inputs to each device. Each of the host buffers supplied to // clEnqueueWriteBuffer here is already aligned to ensure that DMA is used // for the host-to-device transfer. cl_event write_event[2]; status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE, 0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]); checkError(status, "Failed to transfer input A"); status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE, 0, n_per_device[i] * sizeof(float), input_b[i], 0, NULL, &write_event[1]); checkError(status, "Failed to transfer input B"); // Set kernel arguments. unsigned argi = 0; status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]); checkError(status, "Failed to set argument %d", argi - 1); status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]); checkError(status, "Failed to set argument %d", argi - 1); status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]); checkError(status, "Failed to set argument %d", argi - 1); // Enqueue kernel. // Use a global work size corresponding to the number of elements to add // for this device. // // We don't specify a local work size and let the runtime choose // (it'll choose to use one work-group with the same size as the global // work-size). // // Events are used to ensure that the kernel is not launched until // the writes to the input buffers have completed. const size_t global_work_size = n_per_device[i]; printf("Launching for device %d (%d elements)\n", i, global_work_size); status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL, &global_work_size, NULL, 2, write_event, &kernel_event[i]); checkError(status, "Failed to launch kernel"); // Read the result. This the final operation. status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE, 0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]); // Release local events. clReleaseEvent(write_event[0]); clReleaseEvent(write_event[1]); } // Wait for all devices to finish. clWaitForEvents(num_devices, finish_event); const double end_time = getCurrentTimestamp(); // Wall-clock time taken. printf("\nTime: %0.3f ms\n", (end_time - start_time) * 1e3); // Get kernel times using the OpenCL event profiling API. for(unsigned i = 0; i < num_devices; ++i) { cl_ulong time_ns = getStartEndTime(kernel_event[i]); printf("Kernel time (device %d): %0.3f ms\n", i, double(time_ns) * 1e-6); } // Release all events. for(unsigned i = 0; i < num_devices; ++i) { clReleaseEvent(kernel_event[i]); clReleaseEvent(finish_event[i]); } // Verify results. bool pass = true; for(unsigned i = 0; i < num_devices && pass; ++i) { for(unsigned j = 0; j < n_per_device[i] && pass; ++j) { if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) { printf("Failed verification @ device %d, index %d\nOutput: %f\nReference: %f\n", i, j, output[i][j], ref_output[i][j]); pass = false; } } } printf("\nVerification: %s\n", pass ? "PASS" : "FAIL"); } // Free the resources allocated during initialization void cleanup() { for(unsigned i = 0; i < num_devices; ++i) { if(kernel && kernel[i]) { clReleaseKernel(kernel[i]); } if(queue && queue[i]) { clReleaseCommandQueue(queue[i]); } if(input_a_buf && input_a_buf[i]) { clReleaseMemObject(input_a_buf[i]); } if(input_b_buf && input_b_buf[i]) { clReleaseMemObject(input_b_buf[i]); } if(output_buf && output_buf[i]) { clReleaseMemObject(output_buf[i]); } } if(program) { clReleaseProgram(program); } if(context) { clReleaseContext(context); } } 将100w维度的两个向量相加,用时107.127ms,你可以试试只用ARM计算,看需要多久,对比下性能。 好了,今天到此为止,大家晚安! (完) |
好了,再运行一个例子就睡觉。 进入上一级目录,然后切入vectorAdd,运行一下: root@socfpga:~/helloworld# cd .. root@socfpga:~# ls README helloworld opencl_arm32_rte vector_Add boardtest init_opencl.sh swapper root@socfpga:~# cd vector_Add/ root@socfpga:~/vector_Add# ls vectorAdd vectorAdd.aocx root@socfpga:~/vector_Add# aocl program /dev/acl0 vectorAdd.aocx aocl program: Running reprogram from /home/root/opencl_arm32_rte/board/c5soc/arm32/bin Reprogramming was successful! root@socfpga:~/vector_Add# ./vectorAdd Initializing OpenCL Platform: Altera SDK for OpenCL Using 1 device(s) de1soc_sharedonly : Cyclone V SoC Development Kit Using AOCX: vectorAdd.aocx Launching for device 0 (1000000 elements) Time: 107.127 ms Kernel time (device 0): 6.933 ms Verification: PASS 这是个向量相加的例子,也是很经典的并行计算例子。核函数内容如下: __kernel void vectorAdd(__global const float *x, __global const float *y, __global float *restrict z) { // get index of the work item int index = get_global_id(0); // add the vector elements z[index] = x[index] + y[index]; } (未完,跟帖中) |
接着看下Host Program长什么样。 #include <assert.h> #include <stdio.h> #include <stdlib.h> #include <math.h> #include <cstring> #include "CL/opencl.h" #include "AOCL_Utils.h" using namespace aocl_utils; #define STRING_BUFFER_LEN 1024 // Runtime constants // Used to define the work set over which this kernel will execute. static const size_t work_group_size = 8; // 8 threads in the demo workgroup // Defines kernel argument value, which is the workitem ID that will // execute a printf call static const int thread_id_to_output = 2; // OpenCL runtime configuration static cl_platform_id platform = NULL; static cl_device_id device = NULL; static cl_context context = NULL; static cl_command_queue queue = NULL; static cl_kernel kernel = NULL; static cl_program program = NULL; // Function prototypes bool init(); void cleanup(); static void device_info_ulong( cl_device_id device, cl_device_info param, const char* name); static void device_info_uint( cl_device_id device, cl_device_info param, const char* name); static void device_info_bool( cl_device_id device, cl_device_info param, const char* name); static void device_info_string( cl_device_id device, cl_device_info param, const char* name); static void display_device_info( cl_device_id device ); // Entry point. int main() { cl_int status; if(!init()) { return -1; } // Set the kernel argument (argument 0) status = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&thread_id_to_output); checkError(status, "Failed to set kernel arg 0"); printf("\nKernel initialization is complete.\n"); printf("Launching the kernel...\n\n"); // Configure work set over which the kernel will execute size_t wgSize[3] = {work_group_size, 1, 1}; size_t gSize[3] = {work_group_size, 1, 1}; // Launch the kernel status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gSize, wgSize, 0, NULL, NULL); checkError(status, "Failed to launch kernel"); // Wait for command queue to complete pending events status = clFinish(queue); checkError(status, "Failed to finish"); printf("\nKernel execution is complete.\n"); // Free the resources allocated cleanup(); return 0; } /////// HELPER FUNCTIONS /////// bool init() { cl_int status; if(!setCwdToExeDir()) { return false; } // Get the OpenCL platform. platform = findPlatform("Altera"); if(platform == NULL) { printf("ERROR: Unable to find Altera OpenCL platform.\n"); return false; } // User-visible output - Platform information { char char_buffer[STRING_BUFFER_LEN]; printf("Querying platform for info:\n"); printf("==========================\n"); clGetPlatformInfo(platform, CL_PLATFORM_NAME, STRING_BUFFER_LEN, char_buffer, NULL); printf("%-40s = %s\n", "CL_PLATFORM_NAME", char_buffer); clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, STRING_BUFFER_LEN, char_buffer, NULL); printf("%-40s = %s\n", "CL_PLATFORM_VENDOR ", char_buffer); clGetPlatformInfo(platform, CL_PLATFORM_VERSION, STRING_BUFFER_LEN, char_buffer, NULL); printf("%-40s = %s\n\n", "CL_PLATFORM_VERSION ", char_buffer); } // Query the available OpenCL devices. scoped_array<cl_device_id> devices; cl_uint num_devices; devices.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices)); // We'll just use the first device. device = devices[0]; // Display some device information. display_device_info(device); // Create the context. context = clCreateContext(NULL, 1, &device, NULL, NULL, &status); checkError(status, "Failed to create context"); // Create the command queue. queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status); checkError(status, "Failed to create command queue"); // Create the program. std::string binary_file = getBoardBinaryFile("hello_world", device); printf("Using AOCX: %s\n", binary_file.c_str()); program = createProgramFromBinary(context, binary_file.c_str(), &device, 1); // Build the program that was just created. status = clBuildProgram(program, 0, NULL, "", NULL, NULL); checkError(status, "Failed to build program"); // Create the kernel - name passed in here must match kernel name in the // original CL file, that was compiled into an AOCX file using the AOC tool const char *kernel_name = "hello_world"; // Kernel name, as defined in the CL file kernel = clCreateKernel(program, kernel_name, &status); checkError(status, "Failed to create kernel"); return true; } // Free the resources allocated during initialization void cleanup() { if(kernel) { clReleaseKernel(kernel); } if(program) { clReleaseProgram(program); } if(queue) { clReleaseCommandQueue(queue); } if(context) { clReleaseContext(context); } } // Helper functions to display parameters returned by OpenCL queries static void device_info_ulong( cl_device_id device, cl_device_info param, const char* name) { cl_ulong a; clGetDeviceInfo(device, param, sizeof(cl_ulong), &a, NULL); printf("%-40s = %lu\n", name, a); } static void device_info_uint( cl_device_id device, cl_device_info param, const char* name) { cl_uint a; clGetDeviceInfo(device, param, sizeof(cl_uint), &a, NULL); printf("%-40s = %u\n", name, a); } static void device_info_bool( cl_device_id device, cl_device_info param, const char* name) { cl_bool a; clGetDeviceInfo(device, param, sizeof(cl_bool), &a, NULL); printf("%-40s = %s\n", name, (a?"true":"false")); } static void device_info_string( cl_device_id device, cl_device_info param, const char* name) { char a[STRING_BUFFER_LEN]; clGetDeviceInfo(device, param, STRING_BUFFER_LEN, &a, NULL); printf("%-40s = %s\n", name, a); } // Query and display OpenCL information on device and runtime environment static void display_device_info( cl_device_id device ) { printf("Querying device for info:\n"); printf("========================\n"); device_info_string(device, CL_DEVICE_NAME, "CL_DEVICE_NAME"); device_info_string(device, CL_DEVICE_VENDOR, "CL_DEVICE_VENDOR"); device_info_uint(device, CL_DEVICE_VENDOR_ID, "CL_DEVICE_VENDOR_ID"); device_info_string(device, CL_DEVICE_VERSION, "CL_DEVICE_VERSION"); device_info_string(device, CL_DRIVER_VERSION, "CL_DRIVER_VERSION"); device_info_uint(device, CL_DEVICE_ADDRESS_BITS, "CL_DEVICE_ADDRESS_BITS"); device_info_bool(device, CL_DEVICE_AVAILABLE, "CL_DEVICE_AVAILABLE"); device_info_bool(device, CL_DEVICE_ENDIAN_LITTLE, "CL_DEVICE_ENDIAN_LITTLE"); device_info_ulong(device, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE"); device_info_ulong(device, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE"); device_info_ulong(device, CL_DEVICE_GLOBAL_MEM_SIZE, "CL_DEVICE_GLOBAL_MEM_SIZE"); device_info_bool(device, CL_DEVICE_IMAGE_SUPPORT, "CL_DEVICE_IMAGE_SUPPORT"); device_info_ulong(device, CL_DEVICE_LOCAL_MEM_SIZE, "CL_DEVICE_LOCAL_MEM_SIZE"); device_info_ulong(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, "CL_DEVICE_MAX_CLOCK_FREQUENCY"); device_info_ulong(device, CL_DEVICE_MAX_COMPUTE_UNITS, "CL_DEVICE_MAX_COMPUTE_UNITS"); device_info_ulong(device, CL_DEVICE_MAX_CONSTANT_ARGS, "CL_DEVICE_MAX_CONSTANT_ARGS"); device_info_ulong(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE"); device_info_uint(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS"); device_info_uint(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS"); device_info_uint(device, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, "CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE"); device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR"); device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT"); device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT"); device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG"); device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT"); device_info_uint(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, "CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE"); { cl_command_queue_properties ccp; clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &ccp, NULL); printf("%-40s = %s\n", "Command queue out of order? ", ((ccp & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)?"true":"false")); printf("%-40s = %s\n", "Command queue profiling enabled? ", ((ccp & CL_QUEUE_PROFILING_ENABLE)?"true":"false")); } } 主机程序比较长,主要执行流程为: 初始化平台、寻找设备、打印设备信息、创建设备上下文、在设备上下文中创建指令队列、载入设备代码、编译设备代码、创建核函数对象、设置核函数参数、运行核函数、等待核函数运行结束、清除所有对象。 这是OpenCL的最基本流程,虽然比较繁琐,但熟悉之后几乎每次都是这几步,代码改动很少,真正需要用心设计的是核函数。 (未完,跟帖中) |
Powered by Discuz! X3
© 2001-2013 Comsenz Inc.