Advertisement
Guest User

Untitled

a guest
Mar 22nd, 2017
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.70 KB | None | 0 0
  1. #define arrA_GROUP_SIZE 1
  2. #define arrA_bitwidth 32
  3. #define arrA_bitwidthLog2 5
  4. #define arrA_valuesPerCell 1
  5. #define arrA_valuesPerCellLog2 0
  6. #define arrA_valueMask 0xffffffff
  7. #define arrA_indexMask 0x0
  8. #define arrA_base_offset 0
  9. uint arrA_cell(const uint index) { return index >> arrA_valuesPerCellLog2; }
  10. uint arrA_subcell(const uint index) { return (index & arrA_indexMask) << arrA_bitwidthLog2; }
  11. int arrA_get2(__local const uint* const a, const uint cell, const uint subcell) { return (a[cell] >> subcell) & arrA_valueMask; }
  12. void arrA_set(__local uint* const a, const uint index, const int v) { atomic_cmpxchg(&a[index], a[index], v); }
  13. int arrA_get(__local const uint* const a, const uint index) { const uint x = index % 1024; return arrA_get2(a, arrA_cell(x), arrA_subcell(x)); }
  14. #define LINEAR_LOCAL_ID_arrA (get_local_id(1) * get_local_size(0) + get_local_id(0))
  15. #define LINEAR_GROUP_ID_arrA (get_group_id(1) * get_num_groups(0) + get_group_id(0))
  16. #define LINEAR_GLOBAL_ID_arrA (LINEAR_GROUP_ID_arrA * 1 + LINEAR_LOCAL_ID_arrA)
  17. uint arrA_getSectionID() { return (get_group_id(0) * 1) / get_num_groups(0); }
  18. uint arrA_getSectionID_2D() { return LINEAR_GROUP_ID_arrA % 1;}
  19. #define INIT_VAR_arrA(global,local) __local uint (local)[1024]; prefetch_arrA(arrA_getSectionID(), get_local_id(0), (global), (local));
  20. #define INIT_VAR_2D_arrA(global,local) __local uint (local)[1024]; prefetch_arrA(arrA_getSectionID_2D(), LINEAR_LOCAL_ID_arrA, (global), (local));
  21. #define INIT_VAR_2D_row_major_arrA(global,local, tileID, M) __local uint (local)[1][1]; prefetch_2D_row_major_arrA(tileID, M, (global), (local));
  22. #define INIT_VAR_2D_col_major_arrA(global,local, tileID, K) __local uint (local)[1][1]; prefetch_2D_col_major_arrA(tileID, K, (global), (local));
  23. uint arrA_global_index(uint sectionID, uint local_index) {
  24. return (sectionID * 1024 + local_index);
  25. }
  26. void prefetch_arrA(const uint sectionID, const uint threadID, __global const uint* const ga, __local uint* la) {
  27. const uint g_offset = sectionID * 1024;
  28. #pragma unroll 1024
  29. for (uint loop=0, cell=threadID; loop < 1024; loop++, cell += 1) {
  30. const uint g_cell = g_offset + cell;
  31. la[cell] = ga[g_cell];
  32. }
  33. barrier(CLK_LOCAL_MEM_FENCE);
  34. }
  35. void prefetch_2D_row_major_arrA(const uint tileID, const uint M, __global const uint* const ga, __local uint** la) {
  36. const int row = get_local_id(0);
  37. const int col = get_local_id(1);
  38. const int globalRow = 1 * get_group_id(0) + row;
  39. const int tiledCol = 1 * tileID + col;
  40. la[col][row] = ga[tiledCol*M + globalRow];
  41. }
  42. void prefetch_2D_col_major_arrA(const uint tileID, const uint K, __global const uint* const ga, __local uint** la) {
  43. const int row = get_local_id(0);
  44. const int col = get_local_id(1);
  45. const int globalCol = 1 * get_group_id(1) + col;
  46. const int tiledRow = 1 * tileID + row;
  47. la[col][row] = ga[globalCol*K + tiledRow];
  48. }
  49. #define arrB_GROUP_SIZE 1
  50. #define arrB_bitwidth 32
  51. #define arrB_bitwidthLog2 5
  52. #define arrB_valuesPerCell 1
  53. #define arrB_valuesPerCellLog2 0
  54. #define arrB_valueMask 0xffffffff
  55. #define arrB_indexMask 0x0
  56. #define arrB_base_offset 0
  57. uint arrB_cell(const uint index) { return index >> arrB_valuesPerCellLog2; }
  58. uint arrB_subcell(const uint index) { return (index & arrB_indexMask) << arrB_bitwidthLog2; }
  59. int arrB_get2(__local const uint* const a, const uint cell, const uint subcell) { return (a[cell] >> subcell) & arrB_valueMask; }
  60. void arrB_set(__local uint* const a, const uint index, const int v) { atomic_cmpxchg(&a[index], a[index], v); }
  61. int arrB_get(__local const uint* const a, const uint index) { const uint x = index % 1024; return arrB_get2(a, arrB_cell(x), arrB_subcell(x)); }
  62. #define LINEAR_LOCAL_ID_arrB (get_local_id(1) * get_local_size(0) + get_local_id(0))
  63. #define LINEAR_GROUP_ID_arrB (get_group_id(1) * get_num_groups(0) + get_group_id(0))
  64. #define LINEAR_GLOBAL_ID_arrB (LINEAR_GROUP_ID_arrB * 1 + LINEAR_LOCAL_ID_arrB)
  65. uint arrB_getSectionID() { return (get_group_id(0) * 1) / get_num_groups(0); }
  66. uint arrB_getSectionID_2D() { return LINEAR_GROUP_ID_arrB % 1;}
  67. #define INIT_VAR_arrB(global,local) __local uint (local)[1024]; prefetch_arrB(arrB_getSectionID(), get_local_id(0), (global), (local));
  68. #define INIT_VAR_2D_arrB(global,local) __local uint (local)[1024]; prefetch_arrB(arrB_getSectionID_2D(), LINEAR_LOCAL_ID_arrB, (global), (local));
  69. #define INIT_VAR_2D_row_major_arrB(global,local, tileID, M) __local uint (local)[1][1]; prefetch_2D_row_major_arrB(tileID, M, (global), (local));
  70. #define INIT_VAR_2D_col_major_arrB(global,local, tileID, K) __local uint (local)[1][1]; prefetch_2D_col_major_arrB(tileID, K, (global), (local));
  71. uint arrB_global_index(uint sectionID, uint local_index) {
  72. return (sectionID * 1024 + local_index);
  73. }
  74. void prefetch_arrB(const uint sectionID, const uint threadID, __global const uint* const ga, __local uint* la) {
  75. const uint g_offset = sectionID * 1024;
  76. #pragma unroll 1024
  77. for (uint loop=0, cell=threadID; loop < 1024; loop++, cell += 1) {
  78. const uint g_cell = g_offset + cell;
  79. la[cell] = ga[g_cell];
  80. }
  81. barrier(CLK_LOCAL_MEM_FENCE);
  82. }
  83. void prefetch_2D_row_major_arrB(const uint tileID, const uint M, __global const uint* const ga, __local uint** la) {
  84. const int row = get_local_id(0);
  85. const int col = get_local_id(1);
  86. const int globalRow = 1 * get_group_id(0) + row;
  87. const int tiledCol = 1 * tileID + col;
  88. la[col][row] = ga[tiledCol*M + globalRow];
  89. }
  90. void prefetch_2D_col_major_arrB(const uint tileID, const uint K, __global const uint* const ga, __local uint** la) {
  91. const int row = get_local_id(0);
  92. const int col = get_local_id(1);
  93. const int globalCol = 1 * get_group_id(1) + col;
  94. const int tiledRow = 1 * tileID + row;
  95. la[col][row] = ga[globalCol*K + tiledRow];
  96. }
  97. __kernel void multi_tile(const int M, const int N, const int K,
  98. const __global int* A,
  99. const __global int* B,
  100. const __global int* Target,
  101. __global int* C) {
  102.  
  103. const int row = get_local_id(0); // Local row ID (max: arrA_GROUP_SIZE)
  104. const int col = get_local_id(1); // Local col ID (max: arrA_GROUP_SIZE)
  105. const int globalRow = arrA_GROUP_SIZE*get_group_id(0) + row; // Row ID of C (0..M)
  106. const int globalCol = arrA_GROUP_SIZE*get_group_id(1) + col; // Col ID of C (0..N)
  107.  
  108. int acc = 0.0f;
  109.  
  110. const int numTiles = K/arrA_GROUP_SIZE;
  111. for (int t=0; t<numTiles; t++) {
  112. INIT_VAR_2D_row_major_arrA(A, A_sub, t, M)
  113. INIT_VAR_2D_col_major_arrB(B, B_sub, t, K)
  114. barrier(CLK_LOCAL_MEM_FENCE);
  115.  
  116. for (int k=0; k<arrA_GROUP_SIZE; k++) {
  117. acc += A_sub[k][row] * B_sub[col][k];
  118. }
  119.  
  120. barrier(CLK_LOCAL_MEM_FENCE);
  121. }
  122.  
  123. // Store the final result in C
  124. C[globalCol*M + globalRow] = acc;
  125. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement