Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <cstdint>
- #include <vector>
- #include <iostream>
- #include <string>
- #include <algorithm>
- # define CL_TARGET_OPENCL_VERSION 200
- # define CL_HPP_TARGET_OPENCL_VERSION 200
- #include "CL/cl2.hpp"
- #define BUF_HEAD_PAD 2 // buffer contents is written by host from this offset - this buffer head padding is used to simplify async_copy count
- #define BUF_TAIL_PAD 2 // buffer tail padding for async_copy count simplification
- #define TILE_W 128
- #define TILE_H 2
- #define UNBR 2 // upper neighbouring rows
- #define LNBR 2 // lower neighbouring rows
- #define NB_ROWS UNBR+LNBR
- #define LNBC 2 // left neighbouring columns
- #define RNBC 2 // right neighbouring columns
- #define NB_COLS LNBC+RNBC
- #define DATA_WIDTH TILE_W*2
- #define DATA_HEIGHT 64
- const std::string sProg { R"CLC(
- #define BUF_HEAD_PAD 2
- #define BUF_TAIL_PAD 2
- #define TILE_W 128
- #define TILE_H 2
- #define UNBR 2 // upper neighbouring rows
- #define LNBR 2 // lower neighbouring rows
- #define NB_ROWS UNBR+LNBR
- #define LNBC 2 // left neighbouring columns
- #define RNBC 2 // right neighbouring columns
- #define NB_COLS LNBC+RNBC
- kernel
- void copylocal(global const uchar* restrict ip_main,
- global const uchar* restrict ip_prev, // top neighbouring buffer
- global const uchar* restrict ip_next, // bottom neighbouring buffer
- const long iv_beginRow, // <- begin row number corresponding to the start of ip_main
- const long iv_width,
- const long iv_height) {
- const long gx = get_global_id(0);
- const long gy = get_global_id(1);
- const long gx0 = get_global_offset(0);
- const long gy0 = get_global_offset(1);
- const long lx = get_local_id(0);
- const long ly = get_local_id(1);
- const long wx0 = gx0 + TILE_W * get_group_id(0); // workgroup x origin in the raster
- const long wy0 = gy0 + TILE_H * get_group_id(1); // workgroup y origin in the raster
- const long tileWidth = get_local_size(0);
- const long tileHeight = get_local_size(1);
- const long xOffset = wx0 + BUF_HEAD_PAD - RNBC;
- // copy 2 neighbouring pixels to the left and to the right of this workgroup's area
- const long count = tileWidth + NB_COLS;
- event_t evt = 0;
- // copy this workgroup's corresponding data locally:
- local uchar data[TILE_H + NB_ROWS][TILE_W + NB_COLS];
- long dataY = 0;
- long y = wy0 - UNBR; // where in the raster would the first top neighbour row be
- if (y < iv_beginRow) {
- if (ip_prev != NULL) {
- global const uchar* pSrc = ip_prev + (iv_height + y - iv_beginRow) * iv_width + xOffset;
- while(y < iv_beginRow) {
- evt = async_work_group_copy(&data[dataY][0], pSrc, count, evt);
- ++y;
- ++dataY;
- pSrc += iv_width;
- }
- } else {
- // no neighbouring stripe available, adjust the index into the local cache to the difference
- // and just jump to the first row from the main color stripe:
- dataY = iv_beginRow - y;
- y = iv_beginRow;
- }
- } // else neighbours are available in the main color stripe, next section will copy them
- const long wyE = wy0 + tileHeight + LNBR;
- const long nextY = min(iv_beginRow + iv_height, wyE);
- global const uchar* pSrc = ip_main + (y - iv_beginRow) * iv_width + xOffset;
- while (y < nextY) {
- evt = async_work_group_copy(&data[dataY][0], pSrc, count, evt);
- ++y;
- ++dataY;
- pSrc += iv_width;
- }
- // if we didn't fill neighbours this means we must take them from ip_next if it is available:
- if (y < wyE) {
- if (ip_next != NULL) {
- global const uchar* pSrc = ip_next + xOffset;
- while (y < wyE) {
- evt = async_work_group_copy(&data[dataY][0], pSrc, count, evt);
- ++y;
- ++dataY;
- pSrc += iv_width;
- }
- }
- }
- wait_group_events(1, &evt);
- // verify copied contents (only this work item's pixel, i.e. no checking its neighbours):
- #define traceAsync(txt) printf("group id(%3ld, %3ld) group origin(%3ld, %3ld) size(%3ld x %3ld) WI(%3ld, %3ld) " \
- txt \
- , get_group_id(0), get_group_id(1), wx0, wy0, tileWidth, tileHeight, lx, ly);
- // verify main data only (neighbours might be wrong too - check not added, for simplicity):
- global const uchar* pColor = ip_main + BUF_HEAD_PAD + (wy0 + ly - iv_beginRow)*iv_width + wx0 + lx;
- if (data[ly + UNBR][lx + LNBC] != *pColor) {
- traceAsync("mismatch");
- }
- }
- // )CLC" };
- int main(int argc, char** argv) {
- using std::string;
- using std::cout;
- using std::to_string;
- using std::runtime_error;
- using namespace std::string_literals;
- uint8_t arr1[DATA_HEIGHT][DATA_WIDTH];
- uint8_t arr2[DATA_HEIGHT][DATA_WIDTH];
- uint8_t arr3[DATA_HEIGHT][DATA_WIDTH];
- for (int y = 0; y < DATA_HEIGHT; ++y) {
- for (int x = 0; x < DATA_WIDTH; ++x) {
- arr1[y][x] = (x+y) % 256;
- arr2[y][x] = (x+y+1) % 256;
- arr3[y][x] = (x+y+2) % 256;
- }
- }
- cout << "Init done\n\n";
- auto d = cl::Device::getDefault();
- cl::Context ctx{d};
- cl::Program prg{ctx, sProg, false};
- cl_int err = prg.build(" -cl-std=CL2.0 ");
- cl::CommandQueue cq(ctx, d);
- cl::Buffer buf1(ctx, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, BUF_HEAD_PAD + sizeof(arr1) + BUF_TAIL_PAD, nullptr);
- cl::Buffer buf2(ctx, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, BUF_HEAD_PAD + sizeof(arr2) + BUF_TAIL_PAD, nullptr);
- cl::Buffer buf3(ctx, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, BUF_HEAD_PAD + sizeof(arr3) + BUF_TAIL_PAD, nullptr);
- // upload buffer contents
- cq.enqueueWriteBuffer(buf1, CL_TRUE, 2, sizeof(arr1), arr1);
- cq.enqueueWriteBuffer(buf2, CL_TRUE, 2, sizeof(arr2), arr2);
- cq.enqueueWriteBuffer(buf3, CL_TRUE, 2, sizeof(arr3), arr3);
- auto execKernel = [&cq, &prg]( const cl::Buffer& ir_prev, // top neighbouring buffer
- const cl::Buffer& ir_main, // main buffer to "process"
- const cl::Buffer& ir_next, // bottom neighbouring buffer
- const long& xOffset,
- const long& yOffset,
- const long& rangeX,
- const long& rangeY) {
- cl_int kErr;
- cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, long, long, long> k(prg, "copylocal", &kErr);
- if (kErr != CL_SUCCESS) {
- throw runtime_error("kernel ctor error: "s + to_string(kErr));
- }
- auto const globalOffsets = cl::NDRange(xOffset, yOffset);
- auto const globalRange = cl::NDRange(rangeX, rangeY);
- auto const enqueueArgs = cl::EnqueueArgs(cq, globalOffsets, globalRange, cl::NDRange(TILE_W, TILE_H));
- auto kEvt = k(enqueueArgs, ir_main, ir_prev, ir_next, yOffset, DATA_WIDTH, DATA_HEIGHT, kErr);
- kEvt.wait();
- if (kErr != CL_SUCCESS) {
- throw runtime_error("kernel exec error: "s + to_string(kErr));
- }
- };
- // test uniform workgroups: range DATA_WIDTH x DATA_HEIGHT
- execKernel(cl::Buffer(), buf1, buf2, 0, 0, DATA_WIDTH, DATA_HEIGHT);
- cout << "================ 1st test done\n";
- execKernel(buf1, buf2, buf3, 0, DATA_HEIGHT, DATA_WIDTH, DATA_HEIGHT);
- cout << "================ 2nd test done\n";
- execKernel(buf2, buf3, cl::Buffer(), 0, 2*DATA_HEIGHT, DATA_WIDTH, DATA_HEIGHT);
- cout << "================ 3rd test done\n";
- cout << "Uniform workgroups job completed.\n\n";
- // test nonuniform workgroups: range DATA_WIDTH-38 x DATA_HEIGHT
- execKernel(cl::Buffer(), buf1, buf2, 0, 0, DATA_WIDTH-38, DATA_HEIGHT);
- cout << "================ 4th test done\n";
- execKernel(buf1, buf2, buf3, 0, DATA_HEIGHT, DATA_WIDTH-38, DATA_HEIGHT);
- cout << "================ 5th test done\n";
- execKernel(buf2, buf3, cl::Buffer(), 0, 2*DATA_HEIGHT, DATA_WIDTH-38, DATA_HEIGHT);
- cout << "================ 6th test done\n";
- cout << "Nonuniform workgroups job completed.\n\nEXIT\n";
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement