Advertisement
Guest User

Untitled

a guest
Jul 14th, 2014
242
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.59 KB | None | 0 0
  1. nvprof --help
  2.  
  3. $ cat t476.cu
  4. #include <stdio.h>
  5. #define cudaCheckErrors(msg) /* print msg plus the CUDA error string and exit(1) if the runtime reports an error */ \
  6. do { /* do/while(0) so the macro expands to a single statement */ \
  7. cudaError_t __err = cudaGetLastError(); /* reads (and clears) the last runtime error */ \
  8. if (__err != cudaSuccess) { \
  9. fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
  10. msg, cudaGetErrorString(__err), \
  11. __FILE__, __LINE__); \
  12. fprintf(stderr, "*** FAILED - ABORTING\n"); \
  13. exit(1); \
  14. } \
  15. } while (0)
  16.  
  17. __global__ void mykernel(int *d_data){
  18. // Prints the int that d_data points to, then overwrites it with 0. Every launched thread touches the same int.
  19. printf("Data = %d\n", *d_data);
  20. *d_data = 0;  // main prints this value after the launch
  21. }
  22.  
  23. int main(){
  24. // Lists each CUDA device with its unified-addressing / managed-memory flags, then runs mykernel on the last device: first on a cudaMalloc buffer, then on a cudaMallocManaged buffer.
  25. cudaDeviceProp myprop;
  26. int mydevice;
  27. int numdevices;
  28. cudaGetDeviceCount(&numdevices);
  29. cudaCheckErrors("get dev count fail");
  30. for (mydevice = 0; mydevice < numdevices; mydevice++){
  31. cudaGetDeviceProperties(&myprop, mydevice);
  32. printf("device %d: %s\n", mydevice, myprop.name);
  33. printf("device %d supports unified addressing: ", mydevice);
  34. if (myprop.unifiedAddressing) printf(" yes\n");
  35. else printf(" no\n");
  36. printf("device %d supports managed memory: ", mydevice);
  37. if (myprop.managedMemory) printf(" yes\n");
  38. else printf(" no\n");
  39. }
  40. cudaSetDevice(--mydevice);  // the loop exits with mydevice == numdevices, so this selects the last device
  41. printf("using device %d\n", mydevice);
  42. int h_data = 1;
  43. int *d_data;
  44. cudaMalloc(&d_data, sizeof(int));
  45. cudaMemcpy(d_data, &h_data, sizeof(int), cudaMemcpyHostToDevice);
  46. mykernel<<<1,1>>>(d_data);
  47. cudaMemcpy(&h_data, d_data, sizeof(int), cudaMemcpyDeviceToHost);  // copies back the value the kernel stored
  48. printf("data = %d\n", h_data);
  49. printf("now testing managed memory\n");
  50. int *m_data;
  51. cudaMallocManaged(&m_data, sizeof(int));
  52. cudaCheckErrors("managed mem fail");
  53. *m_data = 1;  // host writes the managed allocation directly, no cudaMemcpy
  54. mykernel<<<1,1>>>(m_data);
  55. cudaDeviceSynchronize();  // wait for the kernel before the host touches m_data again
  56. cudaCheckErrors("some error");  // check before reading so a failed launch aborts instead of printing stale data
  57. printf("data = %d\n", *m_data);  // dereference: print the stored value, not the pointer
  58. return 0;
  59. }
  60. $ nvcc -arch=sm_35 -o t476 t476.cu
  61. $ nvprof --print-gpu-trace --unified-memory-profiling per-process-device ./t476
  62. ==5114== NVPROF is profiling process 5114, command: ./t476
  63. device 0: GeForce GT 640
  64. device 0 supports unified addressing: yes
  65. device 0 supports managed memory: yes
  66. using device 0
  67. Data = 1
  68. data = 0
  69. now testing managed memory
  70. Data = 1
  71. data = 0
  72. ==5114== Profiling application: ./t476
  73. ==5114== Profiling result:
  74. Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Unified Memory Name
  75. 1.10622s 1.1200us - - - - - 4B 3.5714MB/s GeForce GT 640 1 7 - [CUDA memcpy HtoD]
  76. 1.10687s 64.481us (1 1 1) (1 1 1) 32 0B 0B - - GeForce GT 640 1 7 - mykernel(int*) [102]
  77. 1.10693s 2.3360us - - - - - 4B 1.7123MB/s GeForce GT 640 1 7 - [CUDA memcpy DtoH]
  78. 1.12579s - - - - - - - - GeForce GT 640 - - 0 [Unified Memory CPU page faults]
  79. 1.12579s - - - - - - - - GeForce GT 640 - - 0 B [Unified Memory Memcpy DtoH]
  80. 1.12579s - - - - - - - - GeForce GT 640 - - 0 B [Unified Memory Memcpy HtoD]
  81. 1.12590s 64.097us (1 1 1) (1 1 1) 32 0B 0B - - GeForce GT 640 1 7 - mykernel(int*) [108]
  82. 1.12603s - - - - - - - - GeForce GT 640 - - 4096 B [Unified Memory Memcpy DtoH]
  83. 1.12603s - - - - - - - - GeForce GT 640 - - 4096 B [Unified Memory Memcpy HtoD]
  84. 1.12603s - - - - - - - - GeForce GT 640 - - 1 [Unified Memory CPU page faults]
  85.  
  86. Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
  87. SSMem: Static shared memory allocated per CUDA block.
  88. DSMem: Dynamic shared memory allocated per CUDA block.
  89. $
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement