Input format, one value per line:

<n>
<power>
<element>
.....

The first line is the matrix dimension n, the second the exponent, and the remaining n*n lines are the matrix elements in row-major order (see the sample input after the first listing). First attempt:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

#define BLOCK 8
#define SIZE (BLOCK * 64)
#define TILE_SIZE (8)

int n;

/* Allocate a w x h matrix on the host. */
float *
create_matrix_h(unsigned int w, unsigned int h) {
    float *m;
    m = (float *) malloc(w * h * sizeof(float));
    if (m == NULL) {
        fprintf(stderr, "Failed to malloc.\n");
        exit(1);
    }
    return m;
}

/* Tiled multiply: each thread computes one element of m3 = m1 * m2. */
__global__ void
kernel3(const float *m1, const float *m2, float *m3, unsigned int width) {
    const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
    const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int t, i;
    float result = 0, a, b;

    for (t = 0; t < width / TILE_SIZE; ++t) {
        for (i = 0; i != TILE_SIZE; ++i) {
            a = m1[row*width + t*TILE_SIZE + i];
            b = m2[(t*TILE_SIZE + i)*width + col];
            result += a * b;
        }
        __syncthreads();
    }
    m3[row*width + col] = result;
}

/* Allocate a w x h matrix on the device. */
float *
create_matrix_d(int w, int h) {
    float *m;
    if (cudaMalloc(&m, w * h * sizeof(float)) == cudaErrorMemoryAllocation) {
        fprintf(stderr, "Failed to cudaMalloc.\n");
        return NULL;
        //exit(1);
    }
    return m;
}

/* Fill a w x h host matrix by cycling through the nvalues given values. */
void
fill_matrix_h(float *const m, int w, int h, float *const values, int nvalues) {
    int i, j = 0;
    for (i = 0; i != w * h; ++i) {
        m[i] = values[j];
        j = (j + 1) % nvalues;
    }
}

int
main(void) {
    int k;
    if (scanf("%d", &n) != 1 || n < 1) {
        return 0;
    }
    if (scanf(" %d", &k) != 1 || k < 0) {
        return 0;
    }
    float *hm[3], *dm[3];
    dim3 bdim(TILE_SIZE, TILE_SIZE);
    dim3 gdim(SIZE/TILE_SIZE, SIZE/TILE_SIZE);
    int i;
    for (i = 0; i < 3; ++i) {
        hm[i] = create_matrix_h(SIZE, SIZE);
        dm[i] = create_matrix_d(SIZE, SIZE);
    }
    /* Read the n*n input elements. */
    float tem[n*n];
    for (i = 0; i < n*n; ++i) {
        if (scanf(" %f", &tem[i]) != 1) {
            return 0;
        }
    }
    /* Build an n x n identity pattern as the initial right operand. */
    float temid[n*n];
    int j = 0;
    for (i = 0; i != n*n; ++i) {
        if (i == 0 || i == j + (n+1)) {
            temid[i] = 1;
            j = i;
        }
        else {
            temid[i] = 0;
        }
    }
    fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
    fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
    cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
    int w;
    /* Repeatedly multiply: dm[2] = dm[0] * dm[1], feeding the result back as the next dm[1]. */
    for (w = 0; w < k; ++w) {
        cudaMemcpy(dm[1], hm[1], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
        kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
        cudaThreadSynchronize();   /* deprecated alias of cudaDeviceSynchronize() */
        cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
        hm[1] = hm[2];
    }
    printf(" %.3f ", hm[2][0]);
    return 0;
}

Sample input (n = 2, power = 2, matrix {{1, 2}, {3, 4}}):

2
2
1
2
3
4
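
For reference (not from the original paste): squaring that matrix by hand gives {{1,2},{3,4}}^2 = {{7,10},{15,22}}, which is the answer the listings are trying to reproduce for this input. A minimal host-only check of the arithmetic:

#include <stdio.h>

int main(void) {
    const float a[2][2] = { {1, 2}, {3, 4} };
    float r[2][2];
    /* Plain triple-loop multiply of a by itself. */
    for (int i = 0; i < 2; ++i)
        for (int j = 0; j < 2; ++j) {
            r[i][j] = 0;
            for (int k = 0; k < 2; ++k)
                r[i][j] += a[i][k] * a[k][j];
        }
    /* Prints: 7.000 10.000 / 15.000 22.000 */
    printf("%.3f %.3f\n%.3f %.3f\n", r[0][0], r[0][1], r[1][0], r[1][1]);
    return 0;
}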

Revised listing (identity fill fixed, intermediate result kept on the device, full matrix printed):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

#define BLOCK 8
#define SIZE (BLOCK * 64)
#define TILE_SIZE (8)

int n;

/* Allocate a w x h matrix on the host. */
float *
create_matrix_h(unsigned int w, unsigned int h) {
    float *m;
    m = (float *) malloc(w * h * sizeof(float));
    if (m == NULL) {
        fprintf(stderr, "Failed to malloc.\n");
        exit(1);
    }
    return m;
}

/* Print a w x h matrix, one row per line. */
void
print_matrix(const float *m, const int w, const int h) {
    int x, y;
    for (y = 0; y != h; ++y) {
        for (x = 0; x != w; ++x)
            printf("%.03f ", m[y*w + x]);
        printf("\n");
    }
}

/* Reference CPU multiply: m3 = m1 * m2 for width x width matrices. */
void
cpu_mult(const float *m1, const float *m2, float *m3, unsigned int width) {
    unsigned int i, j, k;
    float result;

    for (i = 0; i != width; ++i) {
        for (j = 0; j != width; ++j) {
            result = 0;
            for (k = 0; k != width; ++k)
                result += m1[i*width + k] * m2[k*width + j];
            m3[i*width + j] = result;
        }
    }
}

/* Tiled multiply: each thread computes one element of m3 = m1 * m2. */
__global__ void
kernel3(const float *m1, const float *m2, float *m3, unsigned int width) {
    const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
    const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int t, i;
    float result = 0, a, b;

    for (t = 0; t < width / TILE_SIZE; ++t) {
        for (i = 0; i != TILE_SIZE; ++i) {
            a = m1[row*width + t*TILE_SIZE + i];
            b = m2[(t*TILE_SIZE + i)*width + col];
            result += a * b;
        }
        __syncthreads();
    }
    m3[row*width + col] = result;
}

/* Allocate a w x h matrix on the device. */
float *
create_matrix_d(int w, int h) {
    float *m;
    if (cudaMalloc(&m, w * h * sizeof(float)) == cudaErrorMemoryAllocation) {
        fprintf(stderr, "Failed to cudaMalloc.\n");
        return NULL;
        //exit(1);
    }
    return m;
}

/* Fill a w x h host matrix by cycling through the nvalues given values. */
void
fill_matrix_h(float *const m, int w, int h, float *const values, int nvalues) {
    int i, j = 0;
    for (i = 0; i != w * h; ++i) {
        m[i] = values[j];
        j = (j + 1) % nvalues;
    }
}

int
main(void) {
    int k;
    if (scanf("%d", &n) != 1 || n < 1) {
        return 0;
    }
    if (scanf(" %d", &k) != 1 || k < 0) {
        return 0;
    }
    float *hm[3], *dm[3];
    dim3 bdim(TILE_SIZE, TILE_SIZE);
    dim3 gdim(SIZE/TILE_SIZE, SIZE/TILE_SIZE);
    int i;
    for (i = 0; i < 3; ++i) {
        hm[i] = create_matrix_h(SIZE, SIZE);
        dm[i] = create_matrix_d(SIZE, SIZE);
    }
    /* Read the n*n input elements. */
    float tem[n*n];
    for (i = 0; i < n*n; ++i) {
        if (scanf(" %f", &tem[i]) != 1) {
            return 0;
        }
    }
    /* Build the identity pattern, counting positions instead of comparing indices. */
    float temid[n*n];
    int j = 0;
    for (i = 0; i != n*n; ++i) {
        if (i == 0 || j == n) { // not j + (n+1)
            temid[i] = 1;
            j = 0;
        }
        else {
            temid[i] = 0;
            j++;
        }
    }
    fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
    fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
    cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
    dm[1] = dm[0]; // For the first iteration Result = A * A;
    int w;
    if (k == 0) {
        hm[2] = hm[1];   // A^0: the identity pattern
    }
    else if (k == 1) {
        hm[2] = hm[0];   // A^1: the input matrix itself
    }
    else {
        for (w = 1; w < k; ++w) {
            kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
            cudaThreadSynchronize();
            // No need to copy back to host
            // cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
            // Copy between device pointers
            cudaMemcpy(dm[1], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToDevice);
        }
        cudaMemcpy(hm[2], dm[1], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    }

    print_matrix(hm[2], n, n);

    return 0;
}
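
Not part of the original paste: kernel3 strides over TILE_SIZE-wide strips and calls __syncthreads(), but it still reads every operand straight from global memory, so the barrier buys no data reuse. A hedged sketch of the usual shared-memory tiling, under the same assumptions the listings already make (blockDim is TILE_SIZE x TILE_SIZE and width is a multiple of TILE_SIZE); the name kernel3_shared is made up here:

__global__ void
kernel3_shared(const float *m1, const float *m2, float *m3, unsigned int width) {
    __shared__ float tile1[TILE_SIZE][TILE_SIZE];
    __shared__ float tile2[TILE_SIZE][TILE_SIZE];
    const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
    const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int t, i;
    float result = 0;

    for (t = 0; t < width / TILE_SIZE; ++t) {
        /* Each thread stages one element of each operand tile. */
        tile1[threadIdx.y][threadIdx.x] = m1[row*width + t*TILE_SIZE + threadIdx.x];
        tile2[threadIdx.y][threadIdx.x] = m2[(t*TILE_SIZE + threadIdx.y)*width + col];
        __syncthreads();   /* wait until both tiles are fully loaded */
        for (i = 0; i != TILE_SIZE; ++i)
            result += tile1[threadIdx.y][i] * tile2[i][threadIdx.x];
        __syncthreads();   /* wait before the tiles are overwritten */
    }
    m3[row*width + col] = result;
}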

Loose variants of the identity fill and of the multiply loop, kept for comparison:

/* Identity-fill variant with stride n instead of n+1 (ones at indices 0, n, 2n, ...): */
for (i = 0; i != n*n; ++i) {
    if (i == 0 || i == j + (n)) { // not j + (n+1)
        temid[i] = 1;
        j = i;
    }
    else {
        temid[i] = 0;
    }
}

/* Multiply loop that round-trips the intermediate result through the host: */
fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
int w;
for (w = 0; w < k; ++w) {
    cudaMemcpy(dm[1], hm[1], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
    kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
    cudaThreadSynchronize();
    cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    hm[1] = hm[2];
}

/* Multiply loop that keeps the intermediate result on the device: */
fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
dm[1] = dm[0]; // For the first iteration Result = A * A;
int w;
for (w = 0; w < k; ++w) {
    kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
    cudaThreadSynchronize();
    // No need to copy back to host
    // cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    // Copy between device pointers
    cudaMemcpy(dm[1], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToDevice);
}
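
Also not from the paste: create_matrix_d only compares the cudaMalloc result against cudaErrorMemoryAllocation and returns NULL that no caller checks, and none of the cudaMemcpy calls or kernel launches are checked at all. A minimal sketch of a checking helper; check_cuda is a made-up name, cudaGetErrorString and cudaGetLastError are the standard runtime calls:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* Abort with a readable message when a CUDA runtime call fails, so errors
 * surface at the call site instead of as garbage output later. */
static void
check_cuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

/* Usage, e.g.:
 *   check_cuda(cudaMalloc(&m, w * h * sizeof(float)), "cudaMalloc");
 *   check_cuda(cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float),
 *                         cudaMemcpyHostToDevice), "cudaMemcpy H2D");
 *   kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
 *   check_cuda(cudaGetLastError(), "kernel3 launch");
 */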