Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #define arrA_GROUP_SIZE 1
- #define arrA_bitwidth 8
- #define arrA_bitwidthLog2 3
- #define arrA_valuesPerCell 4
- #define arrA_valuesPerCellLog2 2
- #define arrA_valueMask 0xff
- #define arrA_indexMask 0x3
- #define arrA_base_offset 0
- uint arrA_cell(const uint index) { return index >> arrA_valuesPerCellLog2; }
- uint arrA_subcell(const uint index) { return (index & arrA_indexMask) << arrA_bitwidthLog2; }
- int arrA_get2(__local const uint* const a, const uint cell, const uint subcell) { return (a[cell] >> subcell) & arrA_valueMask; }
- void arrA_set2(__local uint* const a, const uint cell, const uint subcell, const int v) { uint original = 0; uint current = 0; do { original = a[cell]; const uint cleared = original & ~(arrA_valueMask << subcell); const uint desired = cleared | v << subcell; current = atomic_cmpxchg(&a[cell], original, desired); } while (current != original);}
- void arrA_set(__local uint* const a, const uint index, const uint v) { arrA_set2(a, arrA_cell(index), arrA_subcell(index), v); }
- int arrA_get(__local const uint* const a, const uint index) { const uint x = index % 8; return arrA_get2(a, arrA_cell(x), arrA_subcell(x)); }
- #define LINEAR_LOCAL_ID_arrA (get_local_id(1) * get_local_size(0) + get_local_id(0))
- #define LINEAR_GROUP_ID_arrA (get_group_id(1) * get_num_groups(0) + get_group_id(0))
- #define LINEAR_GLOBAL_ID_arrA (LINEAR_GROUP_ID_arrA * 1 + LINEAR_LOCAL_ID_arrA)
- uint arrA_getSectionID() { return (get_group_id(0) * 1) / get_num_groups(0); }
- uint arrA_getSectionID_2D() { return LINEAR_GROUP_ID_arrA % 1;}
- #define INIT_VAR_arrA(global,local) __local uint (local)[2]; prefetch_arrA(arrA_getSectionID(), get_local_id(0), (global), (local));
- #define INIT_VAR_2D_arrA(global,local) __local uint (local)[2]; prefetch_arrA(arrA_getSectionID_2D(), LINEAR_LOCAL_ID_arrA, (global), (local));
- #define INIT_VAR_2D_row_major_arrA(global,local, tileID, M) __local uint (local)[1][1]; prefetch_2D_row_major_arrA(tileID, M, (global), (local));
- #define INIT_VAR_2D_col_major_arrA(global,local, tileID, K) __local uint (local)[1][1]; prefetch_2D_col_major_arrA(tileID, K, (global), (local));
- uint arrA_global_index(uint sectionID, uint local_index) {
- return (sectionID * 8 + local_index);
- }
- void prefetch_arrA(const uint sectionID, const uint threadID, __global const uint* const ga, __local uint* la) {
- const uint g_offset = sectionID * 2;
- #pragma unroll 2
- for (uint loop=0, cell=threadID; loop < 2; loop++, cell += 1) {
- const uint g_cell = g_offset + cell;
- la[cell] = ga[g_cell];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- void prefetch_2D_row_major_arrA(const uint tileID, const uint M, __global const uint* const ga, __local uint** la) {
- const int row = get_local_id(0);
- const int col = get_local_id(1);
- const int globalRow = 1 * get_group_id(0) + row;
- const int tiledCol = 1 * tileID + col;
- la[col][row] = ga[tiledCol*M + globalRow];
- }
- void prefetch_2D_col_major_arrA(const uint tileID, const uint K, __global const uint* const ga, __local uint** la) {
- const int row = get_local_id(0);
- const int col = get_local_id(1);
- const int globalCol = 1 * get_group_id(1) + col;
- const int tiledRow = 1 * tileID + row;
- la[col][row] = ga[globalCol*K + tiledRow];
- }
- #define arrB_GROUP_SIZE 1
- #define arrB_bitwidth 8
- #define arrB_bitwidthLog2 3
- #define arrB_valuesPerCell 4
- #define arrB_valuesPerCellLog2 2
- #define arrB_valueMask 0xff
- #define arrB_indexMask 0x3
- #define arrB_base_offset 0
- uint arrB_cell(const uint index) { return index >> arrB_valuesPerCellLog2; }
- uint arrB_subcell(const uint index) { return (index & arrB_indexMask) << arrB_bitwidthLog2; }
- int arrB_get2(__local const uint* const a, const uint cell, const uint subcell) { return (a[cell] >> subcell) & arrB_valueMask; }
- void arrB_set2(__local uint* const a, const uint cell, const uint subcell, const int v) { uint original = 0; uint current = 0; do { original = a[cell]; const uint cleared = original & ~(arrB_valueMask << subcell); const uint desired = cleared | v << subcell; current = atomic_cmpxchg(&a[cell], original, desired); } while (current != original);}
- void arrB_set(__local uint* const a, const uint index, const uint v) { arrB_set2(a, arrB_cell(index), arrB_subcell(index), v); }
- int arrB_get(__local const uint* const a, const uint index) { const uint x = index % 8; return arrB_get2(a, arrB_cell(x), arrB_subcell(x)); }
- #define LINEAR_LOCAL_ID_arrB (get_local_id(1) * get_local_size(0) + get_local_id(0))
- #define LINEAR_GROUP_ID_arrB (get_group_id(1) * get_num_groups(0) + get_group_id(0))
- #define LINEAR_GLOBAL_ID_arrB (LINEAR_GROUP_ID_arrB * 1 + LINEAR_LOCAL_ID_arrB)
- uint arrB_getSectionID() { return (get_group_id(0) * 1) / get_num_groups(0); }
- uint arrB_getSectionID_2D() { return LINEAR_GROUP_ID_arrB % 1;}
- #define INIT_VAR_arrB(global,local) __local uint (local)[2]; prefetch_arrB(arrB_getSectionID(), get_local_id(0), (global), (local));
- #define INIT_VAR_2D_arrB(global,local) __local uint (local)[2]; prefetch_arrB(arrB_getSectionID_2D(), LINEAR_LOCAL_ID_arrB, (global), (local));
- #define INIT_VAR_2D_row_major_arrB(global,local, tileID, M) __local uint (local)[1][1]; prefetch_2D_row_major_arrB(tileID, M, (global), (local));
- #define INIT_VAR_2D_col_major_arrB(global,local, tileID, K) __local uint (local)[1][1]; prefetch_2D_col_major_arrB(tileID, K, (global), (local));
- uint arrB_global_index(uint sectionID, uint local_index) {
- return (sectionID * 8 + local_index);
- }
- void prefetch_arrB(const uint sectionID, const uint threadID, __global const uint* const ga, __local uint* la) {
- const uint g_offset = sectionID * 2;
- #pragma unroll 2
- for (uint loop=0, cell=threadID; loop < 2; loop++, cell += 1) {
- const uint g_cell = g_offset + cell;
- la[cell] = ga[g_cell];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- void prefetch_2D_row_major_arrB(const uint tileID, const uint M, __global const uint* const ga, __local uint** la) {
- const int row = get_local_id(0);
- const int col = get_local_id(1);
- const int globalRow = 1 * get_group_id(0) + row;
- const int tiledCol = 1 * tileID + col;
- la[col][row] = ga[tiledCol*M + globalRow];
- }
- void prefetch_2D_col_major_arrB(const uint tileID, const uint K, __global const uint* const ga, __local uint** la) {
- const int row = get_local_id(0);
- const int col = get_local_id(1);
- const int globalCol = 1 * get_group_id(1) + col;
- const int tiledRow = 1 * tileID + row;
- la[col][row] = ga[globalCol*K + tiledRow];
- }
- __kernel void multi_tile(const int M, const int N, const int K,
- const __global int* A,
- const __global int* B,
- const __global int* Target,
- __global int* C) {
- const int row = get_local_id(0); // Local row ID (max: arrA_GROUP_SIZE)
- const int col = get_local_id(1); // Local col ID (max: arrA_GROUP_SIZE)
- const int globalRow = arrA_GROUP_SIZE*get_group_id(0) + row; // Row ID of C (0..M)
- const int globalCol = arrA_GROUP_SIZE*get_group_id(1) + col; // Col ID of C (0..N)
- int acc = 0;
- const int numTiles = K/arrA_GROUP_SIZE;
- for (int t=0; t<numTiles; t++) {
- INIT_VAR_2D_col_major_arrA(A, A_sub, t, M)
- INIT_VAR_2D_row_major_arrB(B, B_sub, t, K)
- barrier(CLK_LOCAL_MEM_FENCE);
- for (int k=0; k<arrA_GROUP_SIZE; k++) {
- acc += A_sub[k][row] * B_sub[col][k];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
- }
- // Store the final result in C
- printf("%d, %d - %d\n", row, col, acc);
- C[globalCol*M + globalRow] = acc;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement