Advertisement
Guest User

Untitled

a guest
Feb 14th, 2014
230
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.24 KB | None | 0 0
  1. $ cat t3.c
  2. #include <stdio.h>
  3. #include <stdlib.h>
  4. #include <math.h>
  5. #define DSIZE 1000
  6. #define TOL 0.01f
  7. #define DAT 2.0f
  8. int main(){
  9. /* Sums the squares of each row of an ntv x len_tv matrix on the host, repeats the sum under OpenACC directives, and compares the two results. Returns 0 on match, 1 on allocation failure or mismatch. */
  10. float temp;
  11. int i_v, ntv, i_el, len_tv;
  12. float *tva, *tv_sq, *tv_sq_cpu;
  13. tva = (float *)malloc(DSIZE*DSIZE*sizeof(float));
  14. tv_sq = (float *)malloc(DSIZE*sizeof(float));
  15. tv_sq_cpu = (float *)malloc(DSIZE*sizeof(float));
  16. if ((tv_sq == 0) || (tva == 0) || (tv_sq_cpu == 0)) {printf("malloc fail\n"); return 1;}
  17. len_tv = DSIZE; /* dimensions must be assigned before the fill loop below reads them */
  18. ntv = DSIZE;
  19. for (i_v = 0; i_v < ntv; i_v++)
  20. for (i_el = 0; i_el < len_tv; i_el++)
  21. tva[(i_v*len_tv) + i_el] = DAT;
  22. /* Host reference: tv_sq_cpu[i_v] holds the sum of squares of row i_v. */
  23. for( i_v = 0; i_v < ntv; i_v++ )
  24. {
  25. temp = 0;
  26.  
  27. for( i_el = 0; i_el < len_tv; i_el++ )
  28. temp += pow( tva[i_v*len_tv + i_el], (float)2.0 );
  29.  
  30. tv_sq_cpu[i_v]=temp;
  31. }
  32. /* Same computation inside an OpenACC data region: tva is listed in copyin, tv_sq in copyout. */
  33. #pragma acc data copyin(tva[:(len_tv*ntv)]) copyout(tv_sq[:ntv]) create(temp)
  34. {
  35. #pragma acc kernels loop independent
  36. for( i_v = 0; i_v < ntv; i_v++ )
  37. {
  38. temp = 0;
  39.  
  40. #pragma acc loop independent gang vector reduction(+:temp)
  41. for( i_el = 0; i_el < len_tv; i_el++ )
  42. temp += pow( tva[i_v*len_tv + i_el], (float)2.0 );
  43.  
  44. tv_sq[i_v]=temp;
  45. }
  46. }
  47. /* Compare with fabsf: integer abs() would truncate the float difference, so any error below 1.0 would pass the TOL check. */
  48. for (i_v = 0; i_v < ntv; i_v++)
  49. if (fabsf(tv_sq[i_v] - tv_sq_cpu[i_v]) > TOL) {printf("mismatch at idx: %d cpu: %f gpu: %f\n", i_v, tv_sq_cpu[i_v], tv_sq[i_v]); return 1;}
  50. printf("Success\n");
  51. return 0;
  52.  
  53. }
  54.  
  55. $ pgcc -O3 -acc -ta=nvidia,cc20,cuda5.0 -Minfo=accel t3.c -o t3
  56. main:
  57. 32, Generating create(temp)
  58. Generating copyout(tv_sq[0:ntv])
  59. Generating copyin(tva[0:ntv*len_tv])
  60. 34, Generating present_or_copyout(tv_sq[0:ntv])
  61. Generating present_or_copyin(tva[0:ntv*len_tv])
  62. Generating NVIDIA code
  63. Generating compute capability 2.0 binary
  64. 35, Loop is parallelizable
  65. Accelerator kernel generated
  66. 35, #pragma acc loop gang /* blockIdx.x */
  67. 40, #pragma acc loop vector(128) /* threadIdx.x */
  68. Loop is parallelizable
  69. $ ./t3
  70. Success
  71. $ pgcc -V
  72.  
  73. pgcc 13.10-0 64-bit target on x86-64 Linux -tp nehalem
  74. The Portland Group - PGI Compilers and Tools
  75. Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
  76. $
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement