#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <omp.h>

int* d;
int* graph;

// Per-device constants, set once from the host before the kernels launch.
__constant__ int cuda_bf;            // blocking factor (tile width)
__constant__ int cuda_total_vertex;  // real number of vertices
__constant__ int cuda_tempVertex;    // padded matrix dimension (multiple of bf)
__constant__ int cuda_device_num;    // number of GPUs used
__constant__ int cuda_FW_block;      // number of tiles per row/column

#define INF 1000000000               // 1e9, kept as an int literal to avoid double->int conversion
#define H2D cudaMemcpyHostToDevice
#define D2H cudaMemcpyDeviceToHost
#define D2D cudaMemcpyDeviceToDevice

using namespace std;
int init_device()
{
    cudaSetDevice(0);
    return 0;
}
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
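// Blocked (tiled) Floyd-Warshall, split across multiple GPUs with OpenMP.
// The padded distance matrix is divided into bf x bf tiles. For each pivot
// tile K the kernels below run three phases:
//   phase 1: relax the pivot tile (K,K) itself,
//   phase 2: relax the tiles in pivot row K and pivot column K,
//   phase 3: relax all remaining tiles, with tile rows split across devices.
// The floyd_warshall_beta_* kernels are the fallback path used when bf > 32.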
//extern __shared__ int D[];

// Phase 1: relax the pivot tile (k,k) entirely in shared memory.
// k = tile index of the pivot block, kbf = k*bf = first row/column of that tile.
__global__ void floyd_warshall_1(int* dist, int k, int kbf){

    int idx = k;
    int idy = k;

    int i = cuda_bf * idx + threadIdx.y;
    int j = cuda_bf * idy + threadIdx.x;
    // Note: threads that return here skip the __syncthreads() below, so this
    // guard is only safe while the whole pivot tile lies inside the real matrix.
    if(i >= cuda_total_vertex || j >= cuda_total_vertex)
        return;

    __shared__ int D[32*32];
    D[threadIdx.y*cuda_bf + threadIdx.x] = dist[i*cuda_tempVertex + j];
    __syncthreads();

    int x = 0;

    //int dij = dist[i*total_vertex + j];
    //int dik = dist[i*total_vertex + k];
    //int dkj = dist[k*total_vertex + j];
    int dij, dik, dkj;
    int a = threadIdx.y * cuda_bf + threadIdx.x;  // this thread's cell within the tile
    int b = threadIdx.y * cuda_bf;                // start of this thread's row within the tile
    while(x < cuda_bf){
        dij = D[a];
        dik = D[b + x];
        dkj = D[x*cuda_bf + threadIdx.x];
        if(dij > dik + dkj){
            D[a] = dik + dkj;
        }
        __syncthreads();
        x++;
    }

    dist[i*cuda_tempVertex + j] = D[threadIdx.y*cuda_bf + threadIdx.x];

    return;
}
// Phase 2: relax the tiles in the pivot row and pivot column.
// Even blockIdx.x handles a tile in the pivot column, odd blockIdx.x one in the
// pivot row; the pivot tile is cached in the second half of shared memory.
__global__ void floyd_warshall_2(int* dist, int k, int kbf){

    int idx, idy;

    if(blockIdx.x % 2 == 0){
        idx = (blockIdx.x/2) >= k ? (blockIdx.x/2 + 1) : (blockIdx.x/2);
        idy = k;
    }
    else {
        idx = k;
        idy = (blockIdx.x/2) >= k ? (blockIdx.x/2 + 1) : (blockIdx.x/2);
    }

    int i = cuda_bf * idx + threadIdx.y;
    int j = cuda_bf * idy + threadIdx.x;
    //bool flag = 0;
    //if(i>=cuda_total_vertex||j>=cuda_total_vertex)
    //    return;

    __shared__ int D2[32*32*2];
    D2[threadIdx.y * cuda_bf + threadIdx.x] = dist[i*cuda_tempVertex + j];      // current tile
    D2[(cuda_bf*cuda_bf) + (threadIdx.y*cuda_bf) + threadIdx.x] =
        dist[(kbf + threadIdx.y)*cuda_tempVertex + (kbf + threadIdx.x)];        // pivot tile
    __syncthreads();

    int x = 0;
    int dij, dik, dkj;
    int a = threadIdx.y * cuda_bf + threadIdx.x;
    int b;
    if(blockIdx.x % 2 == 0){
        b = cuda_bf*cuda_bf + threadIdx.x;          // column threadIdx.x of the pivot tile
    }
    else{
        b = cuda_bf*cuda_bf + cuda_bf*threadIdx.y;  // row threadIdx.y of the pivot tile
    }

    dij = D2[a];

    while(x < cuda_bf){
        if(blockIdx.x % 2 == 0){
            dik = D2[cuda_bf*threadIdx.y + x];
            dkj = D2[b + (x*cuda_bf)];
        }
        else{
            dik = D2[b + x];
            dkj = D2[x*cuda_bf + threadIdx.x];
        }
        if(dij > dik + dkj){
            dij = dik + dkj;
        }
        __syncthreads();
        x++;
    }
    dist[i*cuda_tempVertex + j] = dij;

    return;
}
// Phase 3: relax all remaining tiles using the already-updated pivot-row and
// pivot-column tiles. Tile rows are split across devices: device ID owns
// blockIdx_x = ((cuda_FW_block-1)/cuda_device_num)*ID + blockIdx.x.
__global__ void floyd_warshall_3(int* dist, int k, int kbf, int ID){

    int idx, idy;

    int blockIdx_x = ((cuda_FW_block - 1)/cuda_device_num)*ID + blockIdx.x;

    idy = blockIdx.y  >= k ? blockIdx.y  + 1 : blockIdx.y;
    idx = blockIdx_x  >= k ? blockIdx_x + 1 : blockIdx_x;

    int i = cuda_bf * idx + threadIdx.y;
    int j = cuda_bf * idy + threadIdx.x;
    //if(i>=cuda_total_vertex||j>=cuda_total_vertex)
    //    return;

    __shared__ int D3[32*32*3];
    D3[threadIdx.y*cuda_bf + threadIdx.x] = dist[i*cuda_tempVertex + j];                 // current tile
    D3[(cuda_bf*cuda_bf) + (threadIdx.y*cuda_bf) + threadIdx.x] =
        dist[(cuda_bf*idx + threadIdx.y)*cuda_tempVertex + (kbf + threadIdx.x)];         // pivot-column tile (row idx)
    D3[(2*cuda_bf*cuda_bf) + (threadIdx.y*cuda_bf) + threadIdx.x] =
        dist[(kbf + threadIdx.y)*cuda_tempVertex + (idy*cuda_bf + threadIdx.x)];         // pivot-row tile (column idy)
    __syncthreads();

    int x = 0;
    int dij, dik, dkj;

    int a = threadIdx.y * cuda_bf + threadIdx.x;
    int b = cuda_bf*cuda_bf + threadIdx.y*cuda_bf;
    int c = 2*cuda_bf*cuda_bf + threadIdx.x;

    dij = D3[a];

    while(x < cuda_bf){
        dik = D3[b + x];
        dkj = D3[x*cuda_bf + c];
        if(dij > dik + dkj){
            dij = dik + dkj;
        }
        x++;
    }
    dist[i*cuda_tempVertex + j] = dij;

    return;
}
// Fallback path used when bf > 32 (the tile no longer fits a 32x32 thread block):
// phase 1 on the pivot tile, one k index (kbf) per kernel launch, working
// directly on global memory.
__global__ void floyd_warshall_beta_1(int* dist, int k, int kbf){

    int idx = k;
    int idy = k;
    int i = cuda_bf * idx + (blockIdx.x % cuda_bf);
    int j = cuda_bf * idy + threadIdx.x;
    if(i >= cuda_total_vertex || j >= cuda_total_vertex)
        return;

    int dij = dist[i*cuda_tempVertex + j];
    int dik = dist[i*cuda_tempVertex + kbf];
    int dkj = dist[kbf*cuda_tempVertex + j];

    if(dij > dik + dkj){
        dist[i*cuda_tempVertex + j] = dik + dkj;
    }

    return;
}
// bf > 32 fallback, phase 2: pivot-row and pivot-column tiles, one k index per launch.
__global__ void floyd_warshall_beta_2(int* dist, int k, int kbf){

    int idx, idy;
    int temp = blockIdx.x / cuda_bf;
    if(temp % 2 == 0){
        idx = (temp/2) >= k ? (temp/2 + 1) : (temp/2);
        idy = k;
    }
    else {
        idx = k;
        idy = (temp/2) >= k ? (temp/2 + 1) : (temp/2);
    }

    int i = cuda_bf * idx + (blockIdx.x % cuda_bf);
    int j = cuda_bf * idy + threadIdx.x;
    if(i >= cuda_total_vertex || j >= cuda_total_vertex)
        return;

    int dij = dist[i*cuda_tempVertex + j];
    int dik = dist[i*cuda_tempVertex + kbf];
    int dkj = dist[kbf*cuda_tempVertex + j];

    if(dij > dik + dkj){
        dist[i*cuda_tempVertex + j] = dik + dkj;
    }

    return;
}
// bf > 32 fallback, phase 3: remaining tiles, iterating over the whole pivot
// strip (kbf .. kbf+bf-1) inside the kernel. Tile rows are split across devices
// through the blockIdx_y offset.
__global__ void floyd_warshall_beta_3(int* dist, int k, int kbf, int grid_size, int ID){

    int idx, idy;

    int blockIdx_y = ((cuda_FW_block - 1)/cuda_device_num)*ID + blockIdx.y;
    int temp = ((blockIdx_y*gridDim.x) + blockIdx.x) / cuda_bf;
    idx = temp/grid_size   >= k ? temp/grid_size   + 1 : temp/grid_size;
    idy = temp % grid_size >= k ? temp % grid_size + 1 : temp % grid_size;

    int i = cuda_bf * idx + (blockIdx.x % cuda_bf);
    int j = cuda_bf * idy + threadIdx.x;
    if(i >= cuda_total_vertex || j >= cuda_total_vertex)
        return;

    int x = kbf + cuda_bf;
    int dij, dik, dkj;

    while(kbf < x){
        dij = dist[i*cuda_tempVertex + j];
        dik = dist[i*cuda_tempVertex + kbf];
        dkj = dist[kbf*cuda_tempVertex + j];
        if(dij > dik + dkj){
            dist[i*cuda_tempVertex + j] = dik + dkj;
        }
        //__syncthreads();
        kbf++;
    }
    return;
}
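// Host driver: reads the edge list, pads the matrix to a multiple of bf,
// uploads one copy of the matrix to each device, runs the three-phase sweep
// for every pivot tile (exchanging row slabs between devices after each pivot
// when bf <= 32), then copies each device's slab back and writes the result,
// timing I/O, memory transfers and compute with CUDA events.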
int main(int argc, char* argv[]){

    cudaEvent_t total_start, total_stop;
    cudaEvent_t com_start, com_stop;
    cudaEvent_t mem_start, mem_stop;
    cudaEvent_t io_start, io_stop;

    float total_temp = 0, total_total = 0, io_temp = 0, io_total = 0,
          com_temp = 0, com_total = 0, mem_temp = 0, mem_total = 0;

    cudaEventCreate(&total_start);
    cudaEventCreate(&total_stop);
    cudaEventCreate(&com_start);
    cudaEventCreate(&com_stop);
    cudaEventCreate(&mem_start);
    cudaEventCreate(&mem_stop);
    cudaEventCreate(&io_start);
    cudaEventCreate(&io_stop);

    cudaSetDevice(0);

    cudaEventRecord(total_start);

    struct cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    fprintf(stderr, "clock rate %d\n", prop.clockRate);   // clockRate is an int (kHz)

    int bf = atoi(argv[3]);          // blocking factor (tile width)
    int total_vertex;
    int edge_num;

    const int DEVICE_NUM = 2;        // number of GPUs used

    ifstream input;
    ofstream output;
    input.open(argv[1]);

    input >> total_vertex;
    input >> edge_num;

    // Pad the matrix dimension up to a multiple of bf so it tiles evenly.
    int tempVertex = total_vertex % bf ? (total_vertex + (bf - (total_vertex % bf))) : total_vertex;

    fprintf(stderr, "tempVertex:%d\n", tempVertex);

    d = new int[tempVertex*tempVertex];
    graph = new int[tempVertex*tempVertex];
    // Initialize all distances to INF and the diagonal to 0.
    for(int i = 0; i < tempVertex; i++){
        for(int j = 0; j < tempVertex; j++){
            graph[i*tempVertex + j] = INF;
        }
        graph[i*tempVertex + i] = 0;
    }

    cudaEventRecord(io_start);
    // Edge endpoints are 1-indexed in the input file.
    for(int i = 0; i < edge_num; i++){
        int a, b;
        input >> a;
        input >> b;
        input >> graph[(a-1)*tempVertex + (b-1)];
        fprintf(stderr, "graph %d %d :%d\n", a, b, graph[(a-1)*tempVertex + (b-1)]);
    }
    cudaEventRecord(io_stop);
    cudaEventSynchronize(io_stop);
    cudaEventElapsedTime(&io_temp, io_start, io_stop);
    io_total += io_temp;
    // One full copy of the (padded) distance matrix per device.
    int* cuda_graph[DEVICE_NUM];
    int* cuda_graph2;

    for(int i = 0; i < DEVICE_NUM; i++){
        cudaSetDevice(i);

        cudaMalloc((void**)&cuda_graph[i], sizeof(int)*tempVertex*tempVertex);
        cudaCheckErrors("malloc gpu");
    }
    int FWblockDim = tempVertex / bf;   // number of tiles per row/column

    cudaSetDevice(0);
    cudaEventRecord(mem_start);
    // One host thread per device: upload the matrix and the constants.
    #pragma omp parallel num_threads(DEVICE_NUM)
    {
        int i = omp_get_thread_num();
        cudaSetDevice(i);

        cudaMemcpy(cuda_graph[i], graph, sizeof(int)*tempVertex*tempVertex, H2D);
        cudaCheckErrors("memcpy gpu");

        cudaMemcpyToSymbol(cuda_bf, &bf, sizeof(int));
        cudaMemcpyToSymbol(cuda_total_vertex, &total_vertex, sizeof(int));
        cudaMemcpyToSymbol(cuda_tempVertex, &tempVertex, sizeof(int));
        cudaMemcpyToSymbol(cuda_device_num, &DEVICE_NUM, sizeof(int));
        cudaMemcpyToSymbol(cuda_FW_block, &FWblockDim, sizeof(int));
    }

    cudaSetDevice(0);
    cudaEventRecord(mem_stop);
    cudaEventSynchronize(mem_stop);
    cudaEventElapsedTime(&mem_temp, mem_start, mem_stop);
    mem_total += mem_temp;

    //int FWblockDim = total_vertex%bf ? (total_vertex/bf + 1) : total_vertex/bf;
    //int remainBF = total_vertex%bf ? total_vertex%bf : bf;
    dim3 threadStr(bf, bf);
    dim3 blockStr((FWblockDim-1)/DEVICE_NUM, (FWblockDim-1)/DEVICE_NUM);
    dim3 blockStr_mod((FWblockDim-1)%DEVICE_NUM, (FWblockDim-1)%DEVICE_NUM);
    dim3 blockStr2((FWblockDim-1)*bf, FWblockDim-1);
    cudaEventRecord(com_start);

    int* copy = new int[tempVertex*tempVertex];
    if(bf <= 32){

        //int threadId = omp_get_thread_num();
        //cudaSetDevice(threadId);
        for(int K = 0; K < FWblockDim; K++){

            // Phases 1-3 for pivot tile K, run on every device.
            #pragma omp parallel num_threads(DEVICE_NUM)
            {
                int threadId = omp_get_thread_num();
                cudaSetDevice(threadId);

                // Phase 1: pivot tile
                printf("K=%d phase1 id=%d\n", K, threadId);
                floyd_warshall_1<<<1, threadStr>>>(cuda_graph[threadId], K, K*bf);
                cudaCheckErrors("phase 1");

                //cudaDeviceSynchronize()
                // Phase 2: pivot row and column
                printf("K=%d phase2 id=%d\n", K, threadId);

                if(FWblockDim > 1){

                    floyd_warshall_2<<<(FWblockDim-1)*2, threadStr>>>(cuda_graph[threadId], K, K*bf);
                    cudaCheckErrors("phase 2 col");

                    // Phase 3: remaining tiles; the last device also takes the leftover tile rows.
                    dim3 Str_normal((FWblockDim-1)/DEVICE_NUM, FWblockDim-1);
                    dim3 Str_last((FWblockDim-1)/DEVICE_NUM + ((FWblockDim-1)%DEVICE_NUM), FWblockDim-1);
                    printf("K=%d phase3\n", K);

                    if(threadId == (DEVICE_NUM-1) && (((FWblockDim-1)%DEVICE_NUM) != 0)){
                        floyd_warshall_3<<<Str_last, threadStr>>>(cuda_graph[threadId], K, K*bf, threadId);
                    }
                    else{
                        floyd_warshall_3<<<Str_normal, threadStr>>>(cuda_graph[threadId], K, K*bf, threadId);
                    }
                    cudaCheckErrors("phase 3");
                }
            }

            // Exchange the row slabs each device just updated so every device
            // starts the next pivot iteration with a consistent matrix.
            if(FWblockDim > 1){
                cudaSetDevice(0);
                cudaEventRecord(com_stop);
                cudaEventSynchronize(com_stop);
                cudaEventElapsedTime(&com_temp, com_start, com_stop);
                com_total += com_temp;
                cudaEventRecord(mem_start);
                int offset, count;

                for(int i = 0; i < DEVICE_NUM; i++){
                    for(int j = 0; j < DEVICE_NUM; j++){
                        if(i == j)
                            continue;

                        fprintf(stderr, "%d %d\n", i, j);
                        // offset is in elements (cuda_graph is int*), count in bytes;
                        // the last device owns the remainder rows.
                        offset = tempVertex*(tempVertex/DEVICE_NUM)*j;
                        count = (j == DEVICE_NUM-1)
                              ? tempVertex*sizeof(int)*((tempVertex/DEVICE_NUM) + (tempVertex%DEVICE_NUM))
                              : tempVertex*sizeof(int)*(tempVertex/DEVICE_NUM);
                        cudaMemcpy(cuda_graph[i]+offset, cuda_graph[j]+offset, count, D2D);
                        cudaCheckErrors("memcpy");
                    }
                }

                cudaSetDevice(0);
                cudaEventRecord(mem_stop);
                cudaEventSynchronize(mem_stop);
                cudaEventElapsedTime(&mem_temp, mem_start, mem_stop);
                mem_total += mem_temp;

                // Restart the compute timer so the next iteration is not double-counted.
                cudaEventRecord(com_start);
            }
        }
    }
    else{

        // Fallback path for bf > 32, using the beta kernels (one k index per launch).
        // Note: unlike the bf<=32 path, this branch does not select a device per
        // OpenMP thread or exchange slabs between devices.
        #pragma omp parallel num_threads(DEVICE_NUM)
        {
            int threadId = omp_get_thread_num();

            for(int K = 0; K < FWblockDim; K++){
                // Phase 1
                printf("K=%d phase1\n", K);
                for(int i = 0; i < bf; i++){
                    floyd_warshall_beta_1<<<bf, bf>>>(cuda_graph[threadId], K, K*bf + i);
                    cudaCheckErrors("phase 1");
                }

                // Phase 2
                printf("K=%d phase2\n", K);

                if(FWblockDim > 1){
                    for(int i = 0; i < bf; i++){
                        floyd_warshall_beta_2<<<(FWblockDim-1)*2*bf, bf>>>(cuda_graph[threadId], K, K*bf + i);
                        cudaCheckErrors("phase 2 col");
                    }

                    // Phase 3
                    printf("K=%d phase3\n", K);
                    dim3 Str_normal((FWblockDim-1)*bf, (FWblockDim-1)/DEVICE_NUM);
                    dim3 Str_last((FWblockDim-1)*bf, (FWblockDim-1)/DEVICE_NUM + ((FWblockDim-1)%DEVICE_NUM));
                    if(threadId == (DEVICE_NUM-1) && (FWblockDim%DEVICE_NUM != 0)){
                        floyd_warshall_beta_3<<<Str_last, bf>>>(cuda_graph[threadId], K, K*bf, FWblockDim-1, threadId);
                    }
                    else{
                        floyd_warshall_beta_3<<<Str_normal, bf>>>(cuda_graph[threadId], K, K*bf, FWblockDim-1, threadId);
                    }

                    cudaCheckErrors("phase 3");
                }
            }
        }
    }
    // Wait for all devices to finish before timing and copying back.
    for(int i = 0; i < DEVICE_NUM; i++){
        cudaSetDevice(i);
        cudaDeviceSynchronize();
    }

    cudaSetDevice(0);

    // Should this timing be taken earlier?
    cudaEventRecord(com_stop);
    cudaEventSynchronize(com_stop);
    cudaEventElapsedTime(&com_temp, com_start, com_stop);
    com_total += com_temp;

    cudaEventRecord(mem_start);

    // Each device copies its matrix back; the host keeps only the row slab
    // that device owns (offset/count are in elements here).
    #pragma omp parallel num_threads(DEVICE_NUM)
    {
        int* tempGraph = new int[tempVertex*tempVertex];
        int threadId = omp_get_thread_num();

        cudaSetDevice(threadId);

        int offset = tempVertex*(tempVertex/DEVICE_NUM)*threadId;
        int count = threadId == (DEVICE_NUM-1)
                  ? tempVertex*((tempVertex/DEVICE_NUM) + (tempVertex%DEVICE_NUM))
                  : tempVertex*(tempVertex/DEVICE_NUM);

        cudaMemcpy(tempGraph, cuda_graph[threadId], sizeof(int)*tempVertex*tempVertex, D2H);
        for(int i = offset; i < offset + count; i++){
            graph[i] = tempGraph[i];
        }
        cudaCheckErrors("copy back error");

        delete[] tempGraph;
    }
    cudaSetDevice(0);

    cudaEventRecord(mem_stop);
    cudaEventSynchronize(mem_stop);
    cudaEventElapsedTime(&mem_temp, mem_start, mem_stop);
    mem_total += mem_temp;
    cudaEventRecord(io_start);
    output.open(argv[2]);
    // Should each line end with a trailing space?

    for(int i = 0; i < total_vertex; i++){
        for(int j = 0; j < total_vertex; j++){
            if(graph[i*tempVertex + j] == INF){
                output << "INF";
            }
            else{
                output << graph[i*tempVertex + j];
            }
            output << " ";
        }
        output << endl;
    }

    cudaEventRecord(io_stop);
    cudaEventSynchronize(io_stop);
    cudaEventElapsedTime(&io_temp, io_start, io_stop);
    io_total += io_temp;

    cudaEventRecord(total_stop);
    cudaEventSynchronize(total_stop);
    cudaEventElapsedTime(&total_temp, total_start, total_stop);

    fprintf(stderr, "\n\n");
    fprintf(stderr, "TOTAL = %f\n", total_temp);
    fprintf(stderr, "COMPUTE = %f\n", com_total);
    fprintf(stderr, "MEMORY = %f\n", mem_total);
    fprintf(stderr, "IO = %f\n", io_total);

    // Cleanup.
    for(int i = 0; i < DEVICE_NUM; i++){
        cudaSetDevice(i);
        cudaFree(cuda_graph[i]);
    }
    delete[] graph;
    delete[] d;
    delete[] copy;

    return 0;
}
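// A minimal sketch of how this might be built and run; the file name "apsp.cu"
// and the exact nvcc flags are assumptions, not part of the original paste:
//
//   nvcc -O2 -Xcompiler -fopenmp apsp.cu -o apsp
//   ./apsp input.txt output.txt 32
//
// argv[1] is the input edge list (first line: vertex count and edge count,
// then one "u v w" line per directed edge, 1-indexed), argv[2] the output
// file, and argv[3] the blocking factor bf.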