程式師世界 >> 編程語言 >> C語言 >> C++ >> 關於C++ >> GPGPU OpenCL編程步驟與簡單實例

GPGPU OpenCL編程步驟與簡單實例

編輯：關於C++

1.OpenCL概念

OpenCL是一個為異構平台編寫程序的框架，此異構平台可由CPU、GPU或其他類型的處理器組成。 OpenCL由一門用於編寫kernels （在OpenCL設備上運行的函數）的語言（基於C99）和一組用於定義並控制平台的API組成。

OpenCL提供了兩種層面的並行機制：任務並行與數據並行。

2.OpenCL與CUDA的區別

不同點：OpenCL是通用的異構平台編程語言，為了兼顧不同設備，使用繁瑣。

CUDA是NVIDIA公司發明的、專門用於在其GPU上進行通用計算（GPGPU）編程的框架，使用簡單，好入門。

相同點：都是基於任務並行與數據並行。

3.OpenCL的編程步驟

（1）Discover and initialize the platforms

調用兩次clGetPlatformIDs函數，第一次獲取可用的平台數量，第二次獲取一個可用的平台。

（2）Discover and initialize the devices

調用兩次clGetDeviceIDs函數，第一次獲取可用的設備數量，第二次獲取一個可用的設備。

（3）Create a context(調用clCreateContext函數）

上下文context可能會管理多個設備device。

（4）Create a command queue(調用clCreateCommandQueue函數）

一個設備device對應一個command queue。

上下文context將命令發送到設備對應的command queue，設備就可以執行命令隊列裡的命令。

（5）Create device buffers(調用clCreateBuffer函數）

Buffer中保存的是數據對象，就是設備執行程序需要的數據保存在其中。

Buffer由上下文context創建，這樣上下文管理的多個設備就會共享Buffer中的數據。

（6）Write host data to device buffers(調用clEnqueueWriteBuffer函數）

（7）Create and compile the program

創建程序對象，程序對象就代表你的程序源文件或者二進制代碼數據。

（8）Create the kernel(調用clCreateKernel函數）

根據你的程序對象，生成kernel對象，表示設備程序的入口。

（9）Set the kernel arguments(調用clSetKernelArg函數）

（10）Configure the work-item structure(設置worksize）

配置work-item的組織形式（維數，group組成等）

（11）Enqueue the kernel for execution(調用clEnqueueNDRangeKernel函數）

將kernel對象，以及 work-item參數放入命令隊列中進行執行。

（12）Read the output buffer back to the host(調用clEnqueueReadBuffer函數）

（13）Release OpenCL resources（至此結束整個運行過程）

4.說明

OpenCL中的核函數通常單獨放在一個.cl文件中，運行時由主機程序讀入為字符串後再編譯（也可以直接以字符串形式寫在主機代碼裡）。

OpenCL的編程一般步驟就是上面的13步，太長了，以至於要想做個向量加法都是那麼困難。

不過上面的步驟前3步一般是固定的，可以單獨寫在一個.h/.cpp文件中，其他的一般也不會有什麼大的變化。

5.程序實例，向量運算

5.1通用前3個步驟，生成一個文件

tool.h

#ifndef TOOLH
#define TOOLH
    
#include <CL/cl.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
using namespace std;
    
/**
 * Read the file named by filename into s (used to load the kernel source).
 * Returns -1, after printing an error message, when the file cannot be
 * opened; returns 0 when it could be opened.
 */
int convertToString(const char *filename, std::string& s);
    
/**
 * Step 1: query the available OpenCL platforms and store the first one in
 * platform. platform is reset to NULL on entry and stays NULL when no
 * platform is reported. Returns -1 on failure.
 */
int getPlatform(cl_platform_id &platform);
    
/**
 * Step 2: query the GPU devices of platform.
 * Returns a malloc-allocated array holding the platform's GPU device IDs
 * (the caller releases it with free()), or NULL when no GPU device is found.
 */
cl_device_id *getCl_device_id(cl_platform_id &platform);
    
#endif

tool.cpp

#include <CL/cl.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
#include "tool.h"
using namespace std;
    
/**
 * Read the whole content of a file (typically an OpenCL kernel source file)
 * into a string.
 *
 * The file is opened in binary mode so the bytes are copied unmodified.
 *
 * @param filename  path of the file to read
 * @param s         receives the file content; left untouched on failure
 * @return 0 on success, -1 when the file cannot be opened or read
 *         (an error message is printed to standard output in that case)
 */
int convertToString(const char *filename, std::string& s)
{
    std::ifstream f(filename, std::ios::in | std::ios::binary);
    if (!f.is_open())
    {
        std::cout<<"Error: failed to open file\n:"<<filename<<std::endl;
        return -1;
    }

    // Determine the file size by seeking to the end. tellg() reports -1 on
    // failure, which must not be converted to an (enormous) unsigned size.
    f.seekg(0, std::ios::end);
    const std::streamoff length = f.tellg();
    f.seekg(0, std::ios::beg);
    if (length < 0 || !f)
    {
        std::cout<<"Error: failed to determine size of file\n:"<<filename<<std::endl;
        return -1;
    }

    // Read straight into a std::string: no manual new[]/delete[] and no leak
    // if an exception is thrown.
    std::string contents(static_cast<std::string::size_type>(length), '\0');
    if (length > 0 && !f.read(&contents[0], length))
    {
        std::cout<<"Error: failed to read file\n:"<<filename<<std::endl;
        return -1;
    }

    s.swap(contents);
    return 0;
}
    
/**
 * Step 1: query the available OpenCL platforms and choose the first one.
 *
 * clGetPlatformIDs is called twice: once to learn how many platforms exist,
 * and once to fetch the first of them.
 *
 * @param platform  receives the chosen platform; NULL on failure
 * @return 0 on success, -1 when the query fails or no platform is available
 */
int getPlatform(cl_platform_id &platform)
{
    platform = NULL;//the chosen platform

    cl_uint numPlatforms = 0;//the NO. of platforms
    cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (status != CL_SUCCESS)
    {
        std::cout<<"Error: Getting platforms!"<<std::endl;
        return -1;
    }
    if (numPlatforms == 0)
        return -1;

    /**For clarity, choose the first available platform. Asking for a single
       entry returns exactly that one, so no temporary array is needed. */
    cl_platform_id first = NULL;
    status = clGetPlatformIDs(1, &first, NULL);
    if (status != CL_SUCCESS)
    {
        std::cout<<"Error: Getting platforms!"<<std::endl;
        return -1;
    }

    platform = first;
    return 0;
}
    
/**
 * Step 2: query the platform for its GPU devices.
 *
 * clGetDeviceIDs is called twice: once to learn how many GPU devices the
 * platform has, and once to fill an array with their IDs.
 *
 * @param platform  the platform to query
 * @return a malloc-allocated array with the IDs of all GPU devices of the
 *         platform (the first GPU is element 0; the caller must free() the
 *         array), or NULL when no GPU device is available or a call fails
 */
cl_device_id *getCl_device_id(cl_platform_id &platform)
{
    cl_uint numDevices = 0;
    // The last argument is an output pointer: pass the address of the counter.
    cl_int status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
    if (status != CL_SUCCESS || numDevices == 0) //no GPU available.
        return NULL;

    cl_device_id *devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
    if (devices == NULL)
        return NULL;

    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
    if (status != CL_SUCCESS)
    {
        // Never hand an uninitialized device list back to the caller.
        free(devices);
        return NULL;
    }
    return devices;
}

查看本欄目

5.2核函數文件

HelloWorld_Kernel.cl

/* Each work-item handles the element selected by its global id in
   dimension 0: out[i] = in[i] / 2.4 * (in[i] / 6). */
__kernel void helloworld(__global double* in, __global double* out)
{
    int gid = get_global_id(0);
    double value = in[gid];
    out[gid] = value / 2.4 * (value / 6);
}

5.3主函數文件

HelloWorld.cpp

//For clarity,error checking has been omitted.
#include <CL/cl.h>
#include "tool.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
#include <fstream>
using namespace std;
    
int main(int argc, char* argv[])
{
    cl_int    status;
    /**Step 1: Getting platforms and choose an available one(first).*/
    cl_platform_id platform;
    getPlatform(platform);
    
    /**Step 2:Query the platform and choose the first GPU device if has one.*/
    cl_device_id *devices=getCl_device_id(platform);
    
    /**Step 3: Create context.*/
    cl_context context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);
    
    /**Step 4: Creating command queue associate with the context.*/
    cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
    
    /**Step 5: Create program object */
    const char *filename = "HelloWorld_Kernel.cl";
    string sourceStr;
    status = convertToString(filename, sourceStr);
    const char *source = sourceStr.c_str();
    size_t sourceSize[] = {strlen(source)};
    cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, NULL);
    
    /**Step 6: Build program. */
    status=clBuildProgram(program, 1,devices,NULL,NULL,NULL);
    
    /**Step 7: Initial input,output for the host and create memory objects for the kernel*/
    const int NUM=512000;
    double* input = new double[NUM];
    for(int i=0;i<NUM;i++)
        input[i]=i;
    double* output = new double[NUM];
    
    cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, (NUM) * sizeof(double),(void *) input, NULL);
    cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY , NUM * sizeof(double), NULL, NULL);
    
    /**Step 8: Create kernel object */
    cl_kernel kernel = clCreateKernel(program,"helloworld", NULL);
    
    /**Step 9: Sets Kernel arguments.*/
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer);
    status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer);
    
    /**Step 10: Running the kernel.*/
    size_t global_work_size[1] = {NUM};
    cl_event enentPoint;
    status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &enentPoint);
    clWaitForEvents(1,&enentPoint); ///wait
    clReleaseEvent(enentPoint);
    
    /**Step 11: Read the cout put back to host memory.*/
    status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, NUM * sizeof(double), output, 0, NULL, NULL);
    cout<<output[NUM-1]<<endl;
    
    /**Step 12: Clean the resources.*/
    status = clReleaseKernel(kernel);//*Release kernel.
    status = clReleaseProgram(program);    //Release the program object.
    status = clReleaseMemObject(inputBuffer);//Release mem object.
    status = clReleaseMemObject(outputBuffer);
    status = clReleaseCommandQueue(commandQueue);//Release  Command queue.
    status = clReleaseContext(context);//Release context.
    
    if (output != NULL)
    {
        free(output);
        output = NULL;
    }
    
    if (devices != NULL)
    {
        free(devices);
        devices = NULL;
    }
    return 0;
}

編譯、鏈接、執行：

g++ -I /opt/AMDAPP/include/ -o A *.cpp -lOpenCL ; ./A