Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- nvprof --help
- $ cat t476.cu
- #include <stdio.h>
// Error-check macro: reads (and clears) the last CUDA runtime error and
// aborts with a diagnostic if one is pending. Invoke after every runtime
// API call and kernel launch; `msg` identifies the call site.
// NOTE: the backslash continuations are required — without them only the
// first line belongs to the macro and the rest is bare (non-compiling) code.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Prints the value pointed to by d_data, then zeroes it.
// Launched <<<1,1>>> in this test program, so no index guard is needed;
// d_data must be a valid device (or managed) pointer to one int.
__global__ void mykernel(int *d_data){
    printf("Data = %d\n", *d_data);
    *d_data = 0;
}
// Enumerates CUDA devices, reports unified-addressing / managed-memory
// support, selects the last device, then round-trips one int through the
// device twice: once via explicit cudaMalloc/cudaMemcpy, once via managed
// memory (cudaMallocManaged). Expected output for each pass: the kernel
// prints "Data = 1" and the host then prints "data = 0".
int main(){
    cudaDeviceProp myprop;
    int mydevice;
    int numdevices;

    cudaGetDeviceCount(&numdevices);
    cudaCheckErrors("get dev count fail");
    for (mydevice = 0; mydevice < numdevices; mydevice++){
        cudaGetDeviceProperties(&myprop, mydevice);
        cudaCheckErrors("get dev props fail");
        printf("device %d: %s\n", mydevice, myprop.name);
        printf("device %d supports unified addressing: ", mydevice);
        if (myprop.unifiedAddressing) printf(" yes\n");
        else printf(" no\n");
        printf("device %d supports managed memory: ", mydevice);
        if (myprop.managedMemory) printf(" yes\n");
        else printf(" no\n");
    }
    // Loop exits with mydevice == numdevices; pre-decrement selects the
    // last enumerated device.
    cudaSetDevice(--mydevice);
    printf("using device %d\n", mydevice);

    // Pass 1: explicit device allocation + host<->device copies.
    int h_data = 1;
    int *d_data;
    cudaMalloc(&d_data, sizeof(int));
    cudaCheckErrors("malloc fail");
    cudaMemcpy(d_data, &h_data, sizeof(int), cudaMemcpyHostToDevice);
    cudaCheckErrors("h2d copy fail");
    mykernel<<<1,1>>>(d_data);
    cudaCheckErrors("kernel 1 launch fail");
    // The blocking cudaMemcpy also synchronizes with the kernel above.
    cudaMemcpy(&h_data, d_data, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("d2h copy fail");
    printf("data = %d\n", h_data);

    // Pass 2: managed memory — same pointer usable on host and device,
    // but an explicit sync is required before the host touches it again.
    printf("now testing managed memory\n");
    int *m_data;
    cudaMallocManaged(&m_data, sizeof(int));
    cudaCheckErrors("managed mem fail");
    *m_data = 1;
    mykernel<<<1,1>>>(m_data);
    cudaCheckErrors("kernel 2 launch fail");
    cudaDeviceSynchronize();
    // Dereference the managed pointer; passing m_data itself to %d would
    // print the pointer value, not the data (run output shows "data = 0").
    printf("data = %d\n", *m_data);
    cudaCheckErrors("some error");
    return 0;
}
- $ nvcc -arch=sm_35 -o t476 t476.cu
- $ nvprof --print-gpu-trace --unified-memory-profiling per-process-device ./t476
- ==5114== NVPROF is profiling process 5114, command: ./t476
- device 0: GeForce GT 640
- device 0 supports unified addressing: yes
- device 0 supports managed memory: yes
- using device 0
- Data = 1
- data = 0
- now testing managed memory
- Data = 1
- data = 0
- ==5114== Profiling application: ./t476
- ==5114== Profiling result:
- Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Unified Memory Name
- 1.10622s 1.1200us - - - - - 4B 3.5714MB/s GeForce GT 640 1 7 - [CUDA memcpy HtoD]
- 1.10687s 64.481us (1 1 1) (1 1 1) 32 0B 0B - - GeForce GT 640 1 7 - mykernel(int*) [102]
- 1.10693s 2.3360us - - - - - 4B 1.7123MB/s GeForce GT 640 1 7 - [CUDA memcpy DtoH]
- 1.12579s - - - - - - - - GeForce GT 640 - - 0 [Unified Memory CPU page faults]
- 1.12579s - - - - - - - - GeForce GT 640 - - 0 B [Unified Memory Memcpy DtoH]
- 1.12579s - - - - - - - - GeForce GT 640 - - 0 B [Unified Memory Memcpy HtoD]
- 1.12590s 64.097us (1 1 1) (1 1 1) 32 0B 0B - - GeForce GT 640 1 7 - mykernel(int*) [108]
- 1.12603s - - - - - - - - GeForce GT 640 - - 4096 B [Unified Memory Memcpy DtoH]
- 1.12603s - - - - - - - - GeForce GT 640 - - 4096 B [Unified Memory Memcpy HtoD]
- 1.12603s - - - - - - - - GeForce GT 640 - - 1 [Unified Memory CPU page faults]
- Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
- SSMem: Static shared memory allocated per CUDA block.
- DSMem: Dynamic shared memory allocated per CUDA block.
- $
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement