Advertisement
Guest User

async-test

a guest
Mar 24th, 2023
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 8.38 KB | Source Code | 0 0
  1. #include <cstdint>
  2. #include <vector>
  3. #include <iostream>
  4. #include <string>
  5. #include <algorithm>
  6.  
  7. #   define CL_TARGET_OPENCL_VERSION 200
  8. #   define CL_HPP_TARGET_OPENCL_VERSION 200
  9.  
  10. #include "CL/cl2.hpp"
  11.  
  12. #define BUF_HEAD_PAD 2 // buffer contents is written by host from this offset - this buffer head padding is used to simplify async_copy count
  13. #define BUF_TAIL_PAD 2 // buffer tail padding for async_copy count simplification
  14.  
  15. #define TILE_W 128
  16. #define TILE_H 2
  17.  
  18. #define UNBR    2 // upper neighbouring rows
  19. #define LNBR    2 // lower neighbouring rows
  20. #define NB_ROWS UNBR+LNBR
  21.  
  22. #define LNBC    2 // left neighbouring columns
  23. #define RNBC    2 // right neighbouring columns
  24. #define NB_COLS LNBC+RNBC
  25.  
  26. #define DATA_WIDTH TILE_W*2
  27. #define DATA_HEIGHT 64
  28.  
  29. const std::string sProg { R"CLC(
  30.  
  31. #define BUF_HEAD_PAD 2
  32. #define BUF_TAIL_PAD 2
  33.  
  34. #define TILE_W 128
  35. #define TILE_H 2
  36.  
  37. #define UNBR    2 // upper neighbouring rows
  38. #define LNBR    2 // lower neighbouring rows
  39. #define NB_ROWS UNBR+LNBR
  40.  
  41. #define LNBC    2 // left neighbouring columns
  42. #define RNBC    2 // right neighbouring columns
  43. #define NB_COLS LNBC+RNBC
  44.  
  45. kernel
  46. void copylocal(global const uchar* restrict ip_main,
  47.                global const uchar* restrict ip_prev, // top neighbouring buffer
  48.                global const uchar* restrict ip_next, // bottom neighbouring buffer
  49.                const long iv_beginRow,                 // <- begin row number corresponding to the start of ip_main
  50.                const long iv_width,
  51.                const long iv_height) {
  52.  
  53.    const long gx = get_global_id(0);
  54.    const long gy = get_global_id(1);
  55.  
  56.    const long gx0 = get_global_offset(0);
  57.    const long gy0 = get_global_offset(1);
  58.  
  59.    const long lx = get_local_id(0);
  60.    const long ly = get_local_id(1);
  61.  
  62.    const long wx0 = gx0 + TILE_W * get_group_id(0); // workgroup x origin in the raster
  63.    const long wy0 = gy0 + TILE_H * get_group_id(1); // workgroup y origin in the raster
  64.  
  65.    const long tileWidth = get_local_size(0);
  66.    const long tileHeight = get_local_size(1);
  67.  
  68.    const long xOffset = wx0 + BUF_HEAD_PAD - RNBC;
  69.    // copy 2 neighbouring pixels to the left and to the right of this workgroup's area
  70.    const long count = tileWidth + NB_COLS;
  71.  
  72.    event_t evt = 0;
  73.  
  74.    // copy this workgroup's corresponding data locally:
  75.    local uchar data[TILE_H + NB_ROWS][TILE_W + NB_COLS];
  76.  
  77.    long dataY = 0;
  78.    long y = wy0 - UNBR; // where in the raster would the first top neighbour row be
  79.  
  80.    if (y < iv_beginRow) {
  81.        if (ip_prev != NULL) {
  82.            global const uchar* pSrc = ip_prev + (iv_height + y - iv_beginRow) * iv_width + xOffset;
  83.            while(y < iv_beginRow) {
  84.                evt = async_work_group_copy(&data[dataY][0], pSrc, count, evt);
  85.                ++y;
  86.                ++dataY;
  87.                pSrc += iv_width;
  88.            }
  89.        } else {
  90.            // no neighbouring stripe available, adjust the index into the local cache to the difference
  91.            // and just jump to the first row from the main color stripe:
  92.            dataY = iv_beginRow - y;
  93.            y = iv_beginRow;
  94.        }
  95.    } // else neighbours are available in the main color stripe, next section will copy them
  96.  
  97.    const long wyE = wy0 + tileHeight + LNBR;
  98.    const long nextY = min(iv_beginRow + iv_height, wyE);
  99.    global const uchar* pSrc = ip_main + (y - iv_beginRow) * iv_width + xOffset;
  100.    while (y < nextY) {
  101.        evt = async_work_group_copy(&data[dataY][0], pSrc, count, evt);
  102.        ++y;
  103.        ++dataY;
  104.        pSrc += iv_width;
  105.    }
  106.    // if we didn't fill neighbours this means we must take them from ip_next if it is available:
  107.    if (y < wyE) {
  108.        if (ip_next != NULL) {
  109.            global const uchar* pSrc = ip_next + xOffset;
  110.            while (y < wyE) {
  111.                evt = async_work_group_copy(&data[dataY][0], pSrc, count, evt);
  112.                ++y;
  113.                ++dataY;
  114.                pSrc += iv_width;
  115.            }
  116.        }
  117.    }
  118.  
  119.    wait_group_events(1, &evt);
  120.  
  121.    // verify copied contents (only this work item's pixel, i.e. no checking its neighbours):
  122.  
  123. #define traceAsync(txt) printf("group id(%3ld, %3ld) group origin(%3ld, %3ld) size(%3ld x %3ld) WI(%3ld, %3ld) " \
  124.                                txt \
  125.                                , get_group_id(0), get_group_id(1), wx0, wy0, tileWidth, tileHeight, lx, ly);
  126.  
  127.    // verify main data only (neighbours might be wrong too - check not added, for simplicity):
  128.    global const uchar* pColor = ip_main + BUF_HEAD_PAD + (wy0 + ly - iv_beginRow)*iv_width + wx0 + lx;
  129.    if (data[ly + UNBR][lx + LNBC] != *pColor) {
  130.        traceAsync("mismatch");
  131.    }
  132. }
  133. // )CLC" };
  134.  
  135. int main(int argc, char** argv) {
  136.     using std::string;
  137.     using std::cout;
  138.     using std::to_string;
  139.     using std::runtime_error;
  140.     using namespace std::string_literals;
  141.  
  142.     uint8_t arr1[DATA_HEIGHT][DATA_WIDTH];
  143.     uint8_t arr2[DATA_HEIGHT][DATA_WIDTH];
  144.     uint8_t arr3[DATA_HEIGHT][DATA_WIDTH];
  145.     for (int y = 0; y < DATA_HEIGHT; ++y) {
  146.         for (int x = 0; x < DATA_WIDTH; ++x) {
  147.             arr1[y][x] = (x+y) % 256;
  148.             arr2[y][x] = (x+y+1) % 256;
  149.             arr3[y][x] = (x+y+2) % 256;
  150.         }
  151.     }
  152.     cout << "Init done\n\n";
  153.  
  154.     auto d = cl::Device::getDefault();
  155.     cl::Context ctx{d};
  156.     cl::Program prg{ctx, sProg, false};
  157.     cl_int err = prg.build(" -cl-std=CL2.0 ");
  158.  
  159.     cl::CommandQueue cq(ctx, d);
  160.  
  161.     cl::Buffer buf1(ctx, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, BUF_HEAD_PAD + sizeof(arr1) + BUF_TAIL_PAD, nullptr);
  162.     cl::Buffer buf2(ctx, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, BUF_HEAD_PAD + sizeof(arr2) + BUF_TAIL_PAD, nullptr);
  163.     cl::Buffer buf3(ctx, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY, BUF_HEAD_PAD + sizeof(arr3) + BUF_TAIL_PAD, nullptr);
  164.     // upload buffer contents
  165.     cq.enqueueWriteBuffer(buf1, CL_TRUE, 2, sizeof(arr1), arr1);
  166.     cq.enqueueWriteBuffer(buf2, CL_TRUE, 2, sizeof(arr2), arr2);
  167.     cq.enqueueWriteBuffer(buf3, CL_TRUE, 2, sizeof(arr3), arr3);
  168.  
  169.     auto execKernel = [&cq, &prg](  const cl::Buffer& ir_prev, // top neighbouring buffer
  170.                                     const cl::Buffer& ir_main, // main buffer to "process"
  171.                                     const cl::Buffer& ir_next, // bottom neighbouring buffer
  172.                                     const long& xOffset,
  173.                                     const long& yOffset,
  174.                                     const long& rangeX,
  175.                                     const long& rangeY) {
  176.         cl_int kErr;
  177.         cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, long, long, long> k(prg, "copylocal", &kErr);
  178.         if (kErr != CL_SUCCESS) {
  179.             throw runtime_error("kernel ctor error: "s + to_string(kErr));
  180.         }
  181.         auto const globalOffsets = cl::NDRange(xOffset, yOffset);
  182.         auto const globalRange = cl::NDRange(rangeX, rangeY);
  183.         auto const enqueueArgs = cl::EnqueueArgs(cq, globalOffsets, globalRange, cl::NDRange(TILE_W, TILE_H));
  184.  
  185.         auto kEvt = k(enqueueArgs, ir_main, ir_prev, ir_next, yOffset, DATA_WIDTH, DATA_HEIGHT, kErr);
  186.         kEvt.wait();
  187.         if (kErr != CL_SUCCESS) {
  188.             throw runtime_error("kernel exec error: "s + to_string(kErr));
  189.         }
  190.     };
  191.  
  192.     // test uniform workgroups: range DATA_WIDTH x DATA_HEIGHT
  193.     execKernel(cl::Buffer(), buf1, buf2,         0, 0,             DATA_WIDTH, DATA_HEIGHT);
  194.     cout << "================ 1st test done\n";
  195.     execKernel(buf1,         buf2, buf3,         0, DATA_HEIGHT,   DATA_WIDTH, DATA_HEIGHT);
  196.     cout << "================ 2nd test done\n";
  197.     execKernel(buf2,         buf3, cl::Buffer(), 0, 2*DATA_HEIGHT, DATA_WIDTH, DATA_HEIGHT);
  198.     cout << "================ 3rd test done\n";
  199.  
  200.     cout << "Uniform workgroups job completed.\n\n";
  201.  
  202.     // test nonuniform workgroups: range DATA_WIDTH-38 x DATA_HEIGHT
  203.     execKernel(cl::Buffer(), buf1, buf2,         0, 0,             DATA_WIDTH-38, DATA_HEIGHT);
  204.     cout << "================ 4th test done\n";
  205.     execKernel(buf1,         buf2, buf3,         0, DATA_HEIGHT,   DATA_WIDTH-38, DATA_HEIGHT);
  206.     cout << "================ 5th test done\n";
  207.     execKernel(buf2,         buf3, cl::Buffer(), 0, 2*DATA_HEIGHT, DATA_WIDTH-38, DATA_HEIGHT);
  208.     cout << "================ 6th test done\n";
  209.     cout << "Nonuniform workgroups job completed.\n\nEXIT\n";
  210.  
  211.     return 0;
  212. }
  213.  
Tags: C++ OpenCL
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement