More than one person has asked me this, actually. When I first saw the contest, I figured that since I'm no longer a student and don't have much spare time for competitions, I wouldn't sign up. But around the same time I came across the deep neural network accelerator (DNN by FPGA) that Altera developed together with Baidu, shown at a global computer conference, and I happened to have an idea of my own for building a convolutional neural network on an FPGA (related to my work). With all those coincidences lining up, I went ahead and registered.
The steps to run it are as follows:
root@socfpga:~/helloworld# aocl program /dev/acl0 hello_world.aocx
aocl program: Running reprogram from /home/root/opencl_arm32_rte/board/c5soc/arm32/bin
Reprogramming was successful!
root@socfpga:~/helloworld# ./helloworld
Querying platform for info:
==========================
CL_PLATFORM_NAME = Altera SDK for OpenCL
CL_PLATFORM_VENDOR = Altera Corporation
CL_PLATFORM_VERSION = OpenCL 1.0 Altera SDK for OpenCL, Version 14.0
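These three lines are just standard platform queries; a minimal sketch of the same query using clGetPlatformInfo directly (the example itself goes through its own helper functions) looks like this:
// Sketch: print the same platform properties via clGetPlatformInfo.
// Assumes `platform` already holds the Altera platform ID.
char buf[256];
const cl_platform_info props[] = { CL_PLATFORM_NAME, CL_PLATFORM_VENDOR, CL_PLATFORM_VERSION };
const char *labels[] = { "CL_PLATFORM_NAME", "CL_PLATFORM_VENDOR", "CL_PLATFORM_VERSION" };
for(int i = 0; i < 3; ++i) {
  clGetPlatformInfo(platform, props[i], sizeof(buf), buf, NULL);
  printf("%s = %s\n", labels[i], buf);
}
The host-code excerpts that follow are taken from the example's host source: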
// Runtime constants
// Used to define the work set over which this kernel will execute.
static const size_t work_group_size = 8; // 8 threads in the demo workgroup
// Defines kernel argument value, which is the workitem ID that will
// execute a printf call
static const int thread_id_to_output = 2;
// Entry point (abridged): init() below sets up the platform, device,
// context, queue, program, and kernel before the launch code runs.
int main() {
cl_int status;
if(!init()) {
return -1;
}
// Set the kernel argument (argument 0)
status = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&thread_id_to_output);
checkError(status, "Failed to set kernel arg 0");
printf("\nKernel initialization is complete.\n");
printf("Launching the kernel...\n\n");
// Configure work set over which the kernel will execute
size_t wgSize[3] = {work_group_size, 1, 1};
size_t gSize[3] = {work_group_size, 1, 1};
// Launch the kernel
status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gSize, wgSize, 0, NULL, NULL);
checkError(status, "Failed to launch kernel");
// Wait for command queue to complete pending events
status = clFinish(queue);
checkError(status, "Failed to finish");
printf("\nKernel execution is complete.\n");
// Free the resources allocated
cleanup();
return 0;
}
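For reference, the device side of the example is a single kernel: each of the 8 work-items reads its global ID, and only the one matching the thread_id_to_output argument prints a greeting. Reproduced here from the SDK example from memory, so take the parameter name and message text as approximate:
__kernel void hello_world(int thread_id_from_which_to_print_message) {
  // Index of this work-item in the global range.
  unsigned thread_id = get_global_id(0);

  // Only the selected work-item prints.
  if(thread_id == thread_id_from_which_to_print_message) {
    printf("Thread #%u: Hello from Altera's OpenCL Compiler!\n", thread_id);
  }
}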
/////// HELPER FUNCTIONS ///////
bool init() {
cl_int status;
if(!setCwdToExeDir()) {
return false;
}
// Get the OpenCL platform.
platform = findPlatform("Altera");
if(platform == NULL) {
printf("ERROR: Unable to find Altera OpenCL platform.\n");
return false;
}
// Query the available OpenCL devices and use the first one.
scoped_array<cl_device_id> devices;
cl_uint num_devices;
devices.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
device = devices[0];
// Display some device information.
display_device_info(device);
// Create the context.
context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
checkError(status, "Failed to create context");
// Create the command queue.
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
checkError(status, "Failed to create command queue");
// Create the program.
std::string binary_file = getBoardBinaryFile("hello_world", device);
printf("Using AOCX: %s\n", binary_file.c_str());
program = createProgramFromBinary(context, binary_file.c_str(), &device, 1);
// Build the program that was just created.
status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
checkError(status, "Failed to build program");
// Create the kernel - name passed in here must match kernel name in the
// original CL file, that was compiled into an AOCX file using the AOC tool
const char *kernel_name = "hello_world"; // Kernel name, as defined in the CL file
kernel = clCreateKernel(program, kernel_name, &status);
checkError(status, "Failed to create kernel");
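createProgramFromBinary() is a helper from the AOCLUtils library that ships with the examples; internally it loads the precompiled .aocx image and hands it to the standard call clCreateProgramWithBinary. A minimal sketch of that step, assuming a hypothetical load_file() helper that reads the whole file into memory, would look like:
// Sketch only: create a program from a precompiled .aocx binary.
// load_file() is a hypothetical helper returning the file contents and size.
size_t size = 0;
unsigned char *buf = load_file("hello_world.aocx", &size);

cl_int bin_status;
cl_program prog = clCreateProgramWithBinary(context, 1, &device,
    &size, (const unsigned char **)&buf, &bin_status, &status);
checkError(status, "Failed to create program from binary");
checkError(bin_status, "Device rejected the binary");

// Even with a precompiled binary, clBuildProgram must still be called.
status = clBuildProgram(prog, 0, NULL, "", NULL, NULL);
checkError(status, "Failed to build program");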
With hello_world working, the second example, vectorAdd, can be programmed and run the same way:
root@socfpga:~/helloworld# cd ..
root@socfpga:~# ls
README helloworld opencl_arm32_rte vector_Add
boardtest init_opencl.sh swapper
root@socfpga:~# cd vector_Add/
root@socfpga:~/vector_Add# ls
vectorAdd vectorAdd.aocx
root@socfpga:~/vector_Add# aocl program /dev/acl0 vectorAdd.aocx
aocl program: Running reprogram from /home/root/opencl_arm32_rte/board/c5soc/arm32/bin
Reprogramming was successful!
root@socfpga:~/vector_Add# ./vectorAdd
Initializing OpenCL
Platform: Altera SDK for OpenCL
Using 1 device(s)
de1soc_sharedonly : Cyclone V SoC Development Kit
Using AOCX: vectorAdd.aocx
Launching for device 0 (1000000 elements)
Time: 107.127 ms
Kernel time (device 0): 6.933 ms
Verification: PASS
This is a vector-addition example, a classic parallel-computing workload. The kernel is as follows:
__kernel void vectorAdd(__global const float *x,
__global const float *y,
__global float *restrict z)
{
// get index of the work item
int index = get_global_id(0);
// add the vector elements
z[index] = x[index] + y[index];
}
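Each work-item handles exactly one element, so the kernel is just the data-parallel form of the sequential loop below; get_global_id(0) plays the role of the loop index:
// Host-side reference: one loop iteration corresponds to one work-item.
void vector_add_ref(const float *x, const float *y, float *z, unsigned n) {
  for(unsigned i = 0; i < n; ++i) {
    z[i] = x[i] + y[i];
  }
}
The host code declares its OpenCL objects per device: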
// OpenCL runtime configuration
cl_platform_id platform = NULL;
unsigned num_devices = 0;
scoped_array<cl_device_id> device; // num_devices elements
cl_context context = NULL;
scoped_array<cl_command_queue> queue; // num_devices elements
cl_program program = NULL;
scoped_array<cl_kernel> kernel; // num_devices elements
scoped_array<cl_mem> input_a_buf; // num_devices elements
scoped_array<cl_mem> input_b_buf; // num_devices elements
scoped_array<cl_mem> output_buf; // num_devices elements
// Problem data.
const unsigned N = 1000000; // problem size
scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
scoped_array<scoped_array<float> > ref_output; // num_devices elements
scoped_array<unsigned> n_per_device; // num_devices elements
// Entry point (abridged): OpenCL must be initialized before the problem
// data, because the per-device split depends on num_devices.
int main() {
if(!init_opencl()) {
return -1;
}
// Initialize the problem data.
// Requires the number of devices to be known.
init_problem();
// Run the kernel.
run();
// Free the resources allocated
cleanup();
return 0;
}
/////// HELPER FUNCTIONS ///////
// Randomly generate a floating-point number between -10 and 10.
float rand_float() {
return float(rand()) / float(RAND_MAX) * 20.0f - 10.0f;
}
// Initializes the OpenCL objects.
bool init_opencl() {
cl_int status;
printf("Initializing OpenCL\n");
if(!setCwdToExeDir()) {
return false;
}
// Get the OpenCL platform.
platform = findPlatform("Altera");
if(platform == NULL) {
printf("ERROR: Unable to find Altera OpenCL platform.\n");
return false;
}
// Query the available OpenCL devices.
device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
printf("Platform: %s\n", getPlatformName(platform).c_str());
printf("Using %d device(s)\n", num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
printf(" %s\n", getDeviceName(device[i]).c_str());
}
// Create the context.
context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
checkError(status, "Failed to create context");
// Create the program for all devices. Use the first device as the
// representative device (assuming all devices are of the same type).
std::string binary_file = getBoardBinaryFile("vectorAdd", device[0]);
printf("Using AOCX: %s\n", binary_file.c_str());
program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
// Build the program that was just created.
status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
checkError(status, "Failed to build program");
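The listing skips the rest of init_opencl(), which creates a command queue, kernel, and the three cl_mem buffers for every device; a condensed sketch of that omitted step (sizes based on n_per_device, which splits the N elements across the devices) might look like:
// Sketch of the per-device setup that the excerpt omits.
queue.reset(num_devices);
kernel.reset(num_devices);
n_per_device.reset(num_devices);
input_a_buf.reset(num_devices);
input_b_buf.reset(num_devices);
output_buf.reset(num_devices);

for(unsigned i = 0; i < num_devices; ++i) {
  // Each device gets an equal share of N (remainder handling omitted here).
  n_per_device[i] = N / num_devices;

  // Command queue with profiling enabled (needed for the kernel-time report).
  queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
  checkError(status, "Failed to create command queue");

  // Kernel.
  kernel[i] = clCreateKernel(program, "vectorAdd", &status);
  checkError(status, "Failed to create kernel");

  // Input and output buffers.
  input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
      n_per_device[i] * sizeof(float), NULL, &status);
  checkError(status, "Failed to create buffer for input A");
  input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
      n_per_device[i] * sizeof(float), NULL, &status);
  checkError(status, "Failed to create buffer for input B");
  output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
      n_per_device[i] * sizeof(float), NULL, &status);
  checkError(status, "Failed to create buffer for output");
}
The generation of the input vectors and the reference output that follows is done in init_problem():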
// Generate input vectors A and B and the reference output consisting
// of a total of N elements.
// We create separate arrays for each device so that each device has an
// aligned buffer.
for(unsigned i = 0; i < num_devices; ++i) {
input_a[i].reset(n_per_device[i]);
input_b[i].reset(n_per_device[i]);
output[i].reset(n_per_device[i]);
ref_output[i].reset(n_per_device[i]);
for(unsigned j = 0; j < n_per_device[i]; ++j) {
input_a[i][j] = rand_float();
input_b[i][j] = rand_float();
ref_output[i][j] = input_a[i][j] + input_b[i][j];
}
}
// (Inside run()) Launch the problem for each device.
scoped_array<cl_event> kernel_event(num_devices);
scoped_array<cl_event> finish_event(num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
// Transfer inputs to each device. Each of the host buffers supplied to
// clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
// for the host-to-device transfer.
cl_event write_event[2];
status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]);
checkError(status, "Failed to transfer input A");
status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), input_b[i], 0, NULL, &write_event[1]);
checkError(status, "Failed to transfer input B");
// Set kernel arguments.
unsigned argi = 0;
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
// Enqueue kernel.
// Use a global work size corresponding to the number of elements to add
// for this device.
//
// We don't specify a local work size and let the runtime choose
// (it'll choose to use one work-group with the same size as the global
// work-size).
//
// Events are used to ensure that the kernel is not launched until
// the writes to the input buffers have completed.
const size_t global_work_size = n_per_device[i];
printf("Launching for device %d (%d elements)\n", i, global_work_size);
status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
&global_work_size, NULL, 2, write_event, &kernel_event[i]);
checkError(status, "Failed to launch kernel");
// Read the result. This is the final operation.
status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
// Release local events.
clReleaseEvent(write_event[0]);
clReleaseEvent(write_event[1]);
}
// Wait for all devices to finish.
clWaitForEvents(num_devices, finish_event);
// Get kernel times using the OpenCL event profiling API.
for(unsigned i = 0; i < num_devices; ++i) {
cl_ulong time_ns = getStartEndTime(kernel_event[i]);
printf("Kernel time (device %d): %0.3f ms\n", i, double(time_ns) * 1e-6);
}
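getStartEndTime() is another AOCLUtils helper; the value it returns comes from the standard OpenCL event-profiling API, roughly as sketched below (this only works because the command queues were created with CL_QUEUE_PROFILING_ENABLE):
// Sketch of what the helper does: read an event's start/end timestamps.
cl_ulong start = 0, end = 0;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
cl_ulong elapsed_ns = end - start; // device time spent executing the kernel, in ns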
// Release all events.
for(unsigned i = 0; i < num_devices; ++i) {
clReleaseEvent(kernel_event[i]);
clReleaseEvent(finish_event[i]);
}