Advertisement
Guest User

Untitled

a guest
Dec 2nd, 2012
31
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. // vim:foldenable:foldmethod=marker:foldmarker=[[,]]
  2. /**
  3. * @version 0.1.3 (2011)
  4. * @author Johannes Gilger <heipei@hackvalue.de>
  5. *
  6. * Copyright 2011 Johannes Gilger
  7. *
  8. * This file is part of engine-cuda
  9. *
  10. * engine-cuda is free software: you can redistribute it and/or modify
  11. * it under the terms of the GNU General Public License as published by
  12. * the Free Software Foundation, either version 3 of the License, or
  13. * (at your option) any later version.
  14. *
  15. * engine-cuda is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  18. * GNU General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU General Public License
  21. * along with engine-cuda. If not, see <http://www.gnu.org/licenses/>.
  22. *
  23. */
  24. #include <stdint.h>
  25. #include <stdio.h>
  26. #include <stdlib.h>
  27. #include <cuda_runtime_api.h>
  28. #include <assert.h>
  29.  
  30. #include "cuda_common.h"
  31. #include "common.h"
  32.  
  33. #ifndef PAGEABLE
  34. extern "C" void transferHostToDevice_PINNED (const unsigned char *input, uint32_t *deviceMem, uint8_t *hostMem, size_t size) {
  35. cudaError_t cudaerrno;
  36. if(size <= 1048576) {
  37. memcpy(hostMem,input,size);
  38. _CUDA(cudaMemcpyAsync(deviceMem, hostMem, size, cudaMemcpyHostToDevice, 0));
  39. } else {
  40. //fprintf(stdout, "Now trying cudaMemcpy\n");
  41. _CUDA(cudaMemcpyAsync(deviceMem, input, size, cudaMemcpyHostToDevice,0));
  42. }
  43. }
  44. #if CUDART_VERSION >= 2020
/**
 * "Transfer" `size` bytes to the device, zero-copy variant.
 *
 * In zero-copy mode `hostMem` is a mapped, page-locked buffer (allocated with
 * cudaHostAllocMapped in cuda_device_init), so filling it is all that is
 * required: the device reads it through its mapped device pointer and no
 * explicit H2D copy is issued. `deviceMem` is unused here; the parameter is
 * kept only so all transfer functions share one function-pointer signature.
 */
extern "C" void transferHostToDevice_ZEROCOPY (const unsigned char *input, uint32_t *deviceMem, uint8_t *hostMem, size_t size) {
//cudaError_t cudaerrno;
memcpy(hostMem,input,size);
//_CUDA(cudaHostGetDevicePointer(&d_s,h_s, 0));
}
  50. #endif
  51. #else
/**
 * Copy `size` bytes from `input` to device memory, pageable-memory variant.
 *
 * Plain synchronous cudaMemcpy straight from the caller's (pageable) buffer.
 * `hostMem` is unused; the parameter exists only so all transfer functions
 * share one function-pointer signature.
 */
extern "C" void transferHostToDevice_PAGEABLE (const unsigned char *input, uint32_t *deviceMem, uint8_t *hostMem, size_t size) {
cudaError_t cudaerrno;
_CUDA(cudaMemcpy(deviceMem, input, size, cudaMemcpyHostToDevice));
}
  56. #endif
  57.  
  58. #ifndef PAGEABLE
  59. extern "C" void transferDeviceToHost_PINNED (unsigned char *output, uint32_t *deviceMem, uint8_t *hostMemS, uint8_t *hostMemOUT, size_t size) {
  60. cudaError_t cudaerrno;
  61. if(size <= 1048576) {
  62. _CUDA(cudaMemcpyAsync(hostMemS, deviceMem, size, cudaMemcpyDeviceToHost, 0));
  63. _CUDA(cudaThreadSynchronize());
  64. memcpy(output,hostMemS,size);
  65. } else {
  66. _CUDA(cudaMemcpyAsync(output, deviceMem, size, cudaMemcpyDeviceToHost, 0));
  67. }
  68. }
  69. #if CUDART_VERSION >= 2020
/**
 * "Transfer" `size` bytes from the device into `output`, zero-copy variant.
 *
 * In zero-copy mode the kernel writes directly into the mapped, page-locked
 * buffer `hostMemOUT`, so after waiting for the device to finish the result
 * is simply copied out of that buffer. `deviceMem` and `hostMemS` are unused;
 * the parameters exist only so all transfer functions share one
 * function-pointer signature.
 */
extern "C" void transferDeviceToHost_ZEROCOPY (unsigned char *output, uint32_t *deviceMem, uint8_t *hostMemS, uint8_t *hostMemOUT, size_t size) {
cudaError_t cudaerrno;
// Ensure all outstanding device work (the kernel writing hostMemOUT) is done.
_CUDA(cudaThreadSynchronize());
memcpy(output,hostMemOUT,size);
}
  75. #endif
  76. #else
/**
 * Copy `size` bytes from device memory into `output`, pageable-memory variant.
 *
 * Plain synchronous cudaMemcpy into the caller's (pageable) buffer.
 * `hostMemS` and `hostMemOUT` are unused; the parameters exist only so all
 * transfer functions share one function-pointer signature.
 */
extern "C" void transferDeviceToHost_PAGEABLE (unsigned char *output, uint32_t *deviceMem, uint8_t *hostMemS, uint8_t *hostMemOUT, size_t size) {
cudaError_t cudaerrno;
_CUDA(cudaMemcpy(output,deviceMem,size, cudaMemcpyDeviceToHost));
}
  81. #endif
  82.  
  83.  
// Wall-clock timing state shared by cuda_device_init (which creates the events
// and records time_start) and cuda_device_finish (which records time_stop and
// reports the elapsed milliseconds).
float time_elapsed;
cudaEvent_t time_start,time_stop;
  86.  
  87. #ifdef DEBUG
  88. #include <sys/time.h>
  89. int timeval_subtract (struct timeval *result, struct timeval *x, struct timeval *y) {
  90. if (x->tv_usec < y->tv_usec) {
  91. int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
  92. y->tv_usec -= 1000000 * nsec;
  93. y->tv_sec += nsec;
  94. }
  95. if (x->tv_usec - y->tv_usec > 1000000) {
  96. int nsec = (x->tv_usec - y->tv_usec) / 1000000;
  97. y->tv_usec += 1000000 * nsec;
  98. y->tv_sec -= nsec;
  99. }
  100.  
  101. result->tv_sec = x->tv_sec - y->tv_sec;
  102. result->tv_usec = x->tv_usec - y->tv_usec;
  103.  
  104. return x->tv_sec < y->tv_sec;
  105. }
  106. #endif
  107.  
  108. void checkCUDADevice(struct cudaDeviceProp *deviceProp, int output_verbosity) {
  109. int deviceCount;
  110. cudaError_t cudaerrno;
  111.  
  112. _CUDA(cudaGetDeviceCount(&deviceCount));
  113.  
  114. if (!deviceCount) {
  115. if (output_verbosity!=OUTPUT_QUIET)
  116. fprintf(stderr,"There is no device supporting CUDA.\n");
  117. exit(EXIT_FAILURE);
  118. }
  119.  
  120. if (output_verbosity>=OUTPUT_NORMAL)
  121. fprintf(stdout,"Successfully found %d CUDA devices (CUDART_VERSION %d).\n",deviceCount, CUDART_VERSION);
  122.  
  123. _CUDA(cudaSetDevice(6));
  124. _CUDA(cudaGetDeviceProperties(deviceProp, 6));
  125.  
  126. if (output_verbosity==OUTPUT_VERBOSE) {
  127. fprintf(stdout,"\nDevice %d: \"%s\"\n", 6, deviceProp->name);
  128. fprintf(stdout," CUDA Compute Capability: %d.%d\n", deviceProp->major,deviceProp->minor);
  129. #if CUDART_VERSION >= 2000
  130. fprintf(stdout," Number of multiprocessors (SM): %d\n", deviceProp->multiProcessorCount);
  131. #endif
  132. #if CUDART_VERSION >= 2020
  133. fprintf(stdout," Integrated: %s\n", deviceProp->integrated ? "Yes" : "No");
  134. fprintf(stdout," Support host page-locked memory mapping: %s\n", deviceProp->canMapHostMemory ? "Yes" : "No");
  135. #endif
  136. fprintf(stdout,"\n");
  137. }
  138. }
  139.  
  140. extern "C" void cuda_device_init(int *nm, int buffer_size, int output_verbosity, uint8_t **host_data, uint64_t **device_data, uint64_t **device_data_out) {
  141. assert(nm);
  142. cudaError_t cudaerrno;
  143. cudaDeviceProp deviceProp;
  144.  
  145. checkCUDADevice(&deviceProp, output_verbosity);
  146.  
  147. if(buffer_size==0)
  148. buffer_size=MAX_CHUNK_SIZE;
  149.  
  150. //_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleYield));
  151. //_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleSpin));
  152. //_CUDA(cudaSetDeviceFlags(cudaDeviceBlockingSync));
  153. //_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleYield|cudaDeviceBlockingSync));
  154. #if CUDART_VERSION >= 2000
  155. *nm=deviceProp.multiProcessorCount;
  156. #endif
  157.  
  158. #ifndef PAGEABLE
  159. #if CUDART_VERSION >= 2020
  160. isIntegrated=deviceProp.integrated;
  161. if(isIntegrated) {
  162. //zero-copy memory mode - use special function to get OS-pinned memory
  163. _CUDA(cudaSetDeviceFlags(cudaDeviceMapHost));
  164. if (output_verbosity!=OUTPUT_QUIET) fprintf(stdout,"Using zero-copy memory.\n");
  165. _CUDA(cudaHostAlloc((void**)host_data,buffer_size,cudaHostAllocMapped));
  166. transferHostToDevice = transferHostToDevice_ZEROCOPY; // set memory transfer function
  167. transferDeviceToHost = transferDeviceToHost_ZEROCOPY; // set memory transfer function
  168. _CUDA(cudaHostGetDevicePointer(device_data,host_data, 0));
  169. } else {
  170. //pinned memory mode - use special function to get OS-pinned memory
  171. _CUDA(cudaHostAlloc( (void**)host_data, buffer_size, cudaHostAllocDefault));
  172. if (output_verbosity!=OUTPUT_QUIET) fprintf(stdout,"Using pinned memory: cudaHostAllocDefault.\n");
  173. transferHostToDevice = transferHostToDevice_PINNED; // set memory transfer function
  174. transferDeviceToHost = transferDeviceToHost_PINNED; // set memory transfer function
  175. _CUDA(cudaMalloc((void **)device_data,buffer_size));
  176. _CUDA(cudaMalloc((void **)device_data_out,buffer_size));
  177. }
  178. #else
  179. //pinned memory mode - use special function to get OS-pinned memory
  180. _CUDA(cudaMallocHost((void**)&h_s, buffer_size));
  181. if (output_verbosity!=OUTPUT_QUIET) fprintf(stdout,"Using pinned memory: cudaHostAllocDefault.\n");
  182. transferHostToDevice = transferHostToDevice_PINNED; // set memory transfer function
  183. transferDeviceToHost = transferDeviceToHost_PINNED; // set memory transfer function
  184. _CUDA(cudaMalloc((void **)device_data,buffer_size));
  185. _CUDA(cudaMalloc((void **)device_data_out,buffer_size));
  186. #endif
  187. #else
  188. if (output_verbosity!=OUTPUT_QUIET) fprintf(stdout,"Using pageable memory.\n");
  189. transferHostToDevice = transferHostToDevice_PAGEABLE; // set memory transfer function
  190. transferDeviceToHost = transferDeviceToHost_PAGEABLE; // set memory transfer function
  191. _CUDA(cudaMalloc((void **)device_data,buffer_size));
  192. _CUDA(cudaMalloc((void **)device_data_out,buffer_size));
  193. #endif
  194.  
  195. if (output_verbosity!=OUTPUT_QUIET) fprintf(stdout,"The current buffer size is %d.\n\n", buffer_size);
  196.  
  197. if(output_verbosity>=OUTPUT_NORMAL) {
  198. _CUDA(cudaEventCreate(&time_start));
  199. _CUDA(cudaEventCreate(&time_stop));
  200. _CUDA(cudaEventRecord(time_start,0));
  201. }
  202.  
  203. }
  204.  
  205. extern "C" void cuda_device_finish(uint8_t *host_data, uint64_t *device_data) {
  206. cudaError_t cudaerrno;
  207.  
  208. if (output_verbosity>=OUTPUT_NORMAL) fprintf(stdout, "\nDone. Finishing up...\n");
  209.  
  210. #ifndef PAGEABLE
  211. #if CUDART_VERSION >= 2020
  212. if(isIntegrated) {
  213. _CUDA(cudaFreeHost(host_data));
  214. } else {
  215. _CUDA(cudaFree(device_data));
  216. }
  217. #else
  218. _CUDA(cudaFree(device_data));
  219. #endif
  220. #else
  221. _CUDA(cudaFree(device_data));
  222. #endif
  223.  
  224. if(output_verbosity>=OUTPUT_NORMAL) {
  225. _CUDA(cudaEventRecord(time_stop,0));
  226. _CUDA(cudaEventSynchronize(time_stop));
  227. _CUDA(cudaEventElapsedTime(&time_elapsed,time_start,time_stop));
  228. fprintf(stdout,"\nTotal time: %f milliseconds\n",time_elapsed);
  229. }
  230. }
Advertisement
RAW Paste Data Copied
Advertisement