Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #define arrA_GROUP_SIZE 1
- #define arrA_bitwidth 32
- #define arrA_bitwidthLog2 5
- #define arrA_valuesPerCell 1
- #define arrA_valuesPerCellLog2 0
- #define arrA_valueMask 0xffffffff
- #define arrA_indexMask 0x0
- #define arrA_base_offset 0
- uint arrA_cell(const uint index) { return index >> arrA_valuesPerCellLog2; }
- uint arrA_subcell(const uint index) { return (index & arrA_indexMask) << arrA_bitwidthLog2; }
- int arrA_get2(__local const uint* const a, const uint cell, const uint subcell) { return (a[cell] >> subcell) & arrA_valueMask; }
- void arrA_set(__local uint* const a, const uint index, const int v) { atomic_cmpxchg(&a[index], a[index], v); }
- int arrA_get(__local const uint* const a, const uint index) { const uint x = index % 1024; return arrA_get2(a, arrA_cell(x), arrA_subcell(x)); }
- #define LINEAR_LOCAL_ID_arrA (get_local_id(1) * get_local_size(0) + get_local_id(0))
- #define LINEAR_GROUP_ID_arrA (get_group_id(1) * get_num_groups(0) + get_group_id(0))
- #define LINEAR_GLOBAL_ID_arrA (LINEAR_GROUP_ID_arrA * 1 + LINEAR_LOCAL_ID_arrA)
- uint arrA_getSectionID() { return (get_group_id(0) * 1) / get_num_groups(0); }
- uint arrA_getSectionID_2D() { return LINEAR_GROUP_ID_arrA % 1;}
- #define INIT_VAR_arrA(global,local) __local uint (local)[1024]; prefetch_arrA(arrA_getSectionID(), get_local_id(0), (global), (local));
- #define INIT_VAR_2D_arrA(global,local) __local uint (local)[1024]; prefetch_arrA(arrA_getSectionID_2D(), LINEAR_LOCAL_ID_arrA, (global), (local));
- #define INIT_VAR_2D_row_major_arrA(global,local, tileID, M) __local uint (local)[1][1]; prefetch_2D_row_major_arrA(tileID, M, (global), (local));
- #define INIT_VAR_2D_col_major_arrA(global,local, tileID, K) __local uint (local)[1][1]; prefetch_2D_col_major_arrA(tileID, K, (global), (local));
- uint arrA_global_index(uint sectionID, uint local_index) {
- return (sectionID * 1024 + local_index);
- }
- void prefetch_arrA(const uint sectionID, const uint threadID, __global const uint* const ga, __local uint* la) {
- const uint g_offset = sectionID * 1024;
- #pragma unroll 1024
- for (uint loop=0, cell=threadID; loop < 1024; loop++, cell += 1) {
- const uint g_cell = g_offset + cell;
- la[cell] = ga[g_cell];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- void prefetch_2D_row_major_arrA(const uint tileID, const uint M, __global const uint* const ga, __local uint** la) {
- const int row = get_local_id(0);
- const int col = get_local_id(1);
- const int globalRow = 1 * get_group_id(0) + row;
- const int tiledCol = 1 * tileID + col;
- la[col][row] = ga[tiledCol*M + globalRow];
- }
- void prefetch_2D_col_major_arrA(const uint tileID, const uint K, __global const uint* const ga, __local uint** la) {
- const int row = get_local_id(0);
- const int col = get_local_id(1);
- const int globalCol = 1 * get_group_id(1) + col;
- const int tiledRow = 1 * tileID + row;
- la[col][row] = ga[globalCol*K + tiledRow];
- }
- #define arrB_GROUP_SIZE 1
- #define arrB_bitwidth 32
- #define arrB_bitwidthLog2 5
- #define arrB_valuesPerCell 1
- #define arrB_valuesPerCellLog2 0
- #define arrB_valueMask 0xffffffff
- #define arrB_indexMask 0x0
- #define arrB_base_offset 0
- uint arrB_cell(const uint index) { return index >> arrB_valuesPerCellLog2; }
- uint arrB_subcell(const uint index) { return (index & arrB_indexMask) << arrB_bitwidthLog2; }
- int arrB_get2(__local const uint* const a, const uint cell, const uint subcell) { return (a[cell] >> subcell) & arrB_valueMask; }
- void arrB_set(__local uint* const a, const uint index, const int v) { atomic_cmpxchg(&a[index], a[index], v); }
- int arrB_get(__local const uint* const a, const uint index) { const uint x = index % 1024; return arrB_get2(a, arrB_cell(x), arrB_subcell(x)); }
- #define LINEAR_LOCAL_ID_arrB (get_local_id(1) * get_local_size(0) + get_local_id(0))
- #define LINEAR_GROUP_ID_arrB (get_group_id(1) * get_num_groups(0) + get_group_id(0))
- #define LINEAR_GLOBAL_ID_arrB (LINEAR_GROUP_ID_arrB * 1 + LINEAR_LOCAL_ID_arrB)
- uint arrB_getSectionID() { return (get_group_id(0) * 1) / get_num_groups(0); }
- uint arrB_getSectionID_2D() { return LINEAR_GROUP_ID_arrB % 1;}
- #define INIT_VAR_arrB(global,local) __local uint (local)[1024]; prefetch_arrB(arrB_getSectionID(), get_local_id(0), (global), (local));
- #define INIT_VAR_2D_arrB(global,local) __local uint (local)[1024]; prefetch_arrB(arrB_getSectionID_2D(), LINEAR_LOCAL_ID_arrB, (global), (local));
- #define INIT_VAR_2D_row_major_arrB(global,local, tileID, M) __local uint (local)[1][1]; prefetch_2D_row_major_arrB(tileID, M, (global), (local));
- #define INIT_VAR_2D_col_major_arrB(global,local, tileID, K) __local uint (local)[1][1]; prefetch_2D_col_major_arrB(tileID, K, (global), (local));
- uint arrB_global_index(uint sectionID, uint local_index) {
- return (sectionID * 1024 + local_index);
- }
- void prefetch_arrB(const uint sectionID, const uint threadID, __global const uint* const ga, __local uint* la) {
- const uint g_offset = sectionID * 1024;
- #pragma unroll 1024
- for (uint loop=0, cell=threadID; loop < 1024; loop++, cell += 1) {
- const uint g_cell = g_offset + cell;
- la[cell] = ga[g_cell];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- void prefetch_2D_row_major_arrB(const uint tileID, const uint M, __global const uint* const ga, __local uint** la) {
- const int row = get_local_id(0);
- const int col = get_local_id(1);
- const int globalRow = 1 * get_group_id(0) + row;
- const int tiledCol = 1 * tileID + col;
- la[col][row] = ga[tiledCol*M + globalRow];
- }
- void prefetch_2D_col_major_arrB(const uint tileID, const uint K, __global const uint* const ga, __local uint** la) {
- const int row = get_local_id(0);
- const int col = get_local_id(1);
- const int globalCol = 1 * get_group_id(1) + col;
- const int tiledRow = 1 * tileID + row;
- la[col][row] = ga[globalCol*K + tiledRow];
- }
- __kernel void multi_tile(const int M, const int N, const int K,
- const __global int* A,
- const __global int* B,
- const __global int* Target,
- __global int* C) {
- const int row = get_local_id(0); // Local row ID (max: arrA_GROUP_SIZE)
- const int col = get_local_id(1); // Local col ID (max: arrA_GROUP_SIZE)
- const int globalRow = arrA_GROUP_SIZE*get_group_id(0) + row; // Row ID of C (0..M)
- const int globalCol = arrA_GROUP_SIZE*get_group_id(1) + col; // Col ID of C (0..N)
- int acc = 0.0f;
- const int numTiles = K/arrA_GROUP_SIZE;
- for (int t=0; t<numTiles; t++) {
- INIT_VAR_2D_row_major_arrA(A, A_sub, t, M)
- INIT_VAR_2D_col_major_arrB(B, B_sub, t, K)
- barrier(CLK_LOCAL_MEM_FENCE);
- for (int k=0; k<arrA_GROUP_SIZE; k++) {
- acc += A_sub[k][row] * B_sub[col][k];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- // Store the final result in C
- C[globalCol*M + globalRow] = acc;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement