Advertisement
Guest User

Problem(-G)flag

a guest
Jul 12th, 2013
185
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 11.11 KB | None | 0 0
  1. #include <algorithm>
  2. #include <iostream>
  3. #include <utility>
  4. #include <cstdlib>
  5. #include <cstdio>
  6. #include <cstring>
  7. #include <string>
  8. #include <cmath>
  9. #include <ctime>
  10. #include <cuda.h>
  11. #include <math_functions.h>
  12. #include "cuda_runtime.h"
  13. #include "device_launch_parameters.h"
  14. #include "device_functions.h"
  15. //#define WIN32_LEAN_AND_MEAN
  16. #include <Windows.h>
  17. #include <MMSystem.h>
  18. #pragma comment(lib, "winmm.lib")
  19. #define _CRTDBG_MAP_ALLOC
  20. #include <crtdbg.h>//to detect host memory leaks, so far no leaks
  21. using namespace std;
  22.  
  23. #define _DTH cudaMemcpyDeviceToHost
  24. #define _DTD cudaMemcpyDeviceToDevice
  25. #define _HTD cudaMemcpyHostToDevice
  26. #define THREADS 64
  27. #define NUM_ELEMENTS (1<<24)
  28. #define inf 99999999.9f
  29. #define DO_TEST 1
  30.  
  31. const int blockSize1 = 2048;
  32. const int threads = 64;
  33.  
  34. typedef pair<float,int> Pfi;
  35. typedef pair<Pfi,Pfi> Pffii;
  36.  
  37. bool InitMMTimer(UINT wTimerRes);
  38. void DestroyMMTimer(UINT wTimerRes, bool init);
  39. Pffii cpu_max_min(const float *Arr,const int sz);
  40.  
  41. void findBlockSize(int *whichSize, int *num_el);
  42.  
  43. __device__ void warp_reduce_max( float smem[64]){
  44. smem[threadIdx.x] = smem[threadIdx.x+32] > smem[threadIdx.x] ? smem[threadIdx.x+32] : smem[threadIdx.x];
  45. smem[threadIdx.x] = smem[threadIdx.x+16] > smem[threadIdx.x] ? smem[threadIdx.x+16] : smem[threadIdx.x];
  46. smem[threadIdx.x] = smem[threadIdx.x+8] > smem[threadIdx.x] ? smem[threadIdx.x+8] : smem[threadIdx.x];
  47. smem[threadIdx.x] = smem[threadIdx.x+4] > smem[threadIdx.x] ? smem[threadIdx.x+4] : smem[threadIdx.x];
  48. smem[threadIdx.x] = smem[threadIdx.x+2] > smem[threadIdx.x] ? smem[threadIdx.x+2] : smem[threadIdx.x];
  49. smem[threadIdx.x] = smem[threadIdx.x+1] > smem[threadIdx.x] ? smem[threadIdx.x+1] : smem[threadIdx.x];
  50. }
  51. __device__ void warp_reduce_min( float smem[64]){
  52. smem[threadIdx.x] = smem[threadIdx.x+32] < smem[threadIdx.x] ? smem[threadIdx.x+32] : smem[threadIdx.x];
  53. smem[threadIdx.x] = smem[threadIdx.x+16] < smem[threadIdx.x] ? smem[threadIdx.x+16] : smem[threadIdx.x];
  54. smem[threadIdx.x] = smem[threadIdx.x+8] < smem[threadIdx.x] ? smem[threadIdx.x+8] : smem[threadIdx.x];
  55. smem[threadIdx.x] = smem[threadIdx.x+4] < smem[threadIdx.x] ? smem[threadIdx.x+4] : smem[threadIdx.x];
  56. smem[threadIdx.x] = smem[threadIdx.x+2] < smem[threadIdx.x] ? smem[threadIdx.x+2] : smem[threadIdx.x];
  57. smem[threadIdx.x] = smem[threadIdx.x+1] < smem[threadIdx.x] ? smem[threadIdx.x+1] : smem[threadIdx.x];
  58. }
  59.  
  60. template<int threads>
  61. __global__ void find_min_max_dynamic(float* in, float* out, int* indices, int n, int start_adr, int num_blocks){//this is the last step which handles the tail and brings together the other results
  62.  
  63. __shared__ float smem_min[64];
  64. __shared__ float smem_max[64];
  65.  
  66. int tid = threadIdx.x + start_adr;
  67.  
  68. float max = -inf;
  69. float min = inf;
  70. float val;
  71.  
  72. int min_index = -1;
  73. int max_index = -1;
  74.  
  75. // tail part
  76. int mult = 0;
  77. for(int i = 1; mult + tid < n; i++){
  78. val = in[tid + mult];
  79.  
  80. if( val < min){
  81. min = val;
  82. min_index = tid+mult;
  83. }
  84.  
  85. if(val > max){
  86. max = val;
  87. max_index = tid+mult;
  88. }
  89.  
  90. mult = i*threads;
  91. }
  92.  
  93. // previously reduced MIN part
  94. mult = 0;
  95. int i;
  96. for(i = 1; mult+threadIdx.x < num_blocks; i++){
  97. val = out[threadIdx.x + mult];
  98. if( val < min){
  99. min = val;
  100. min_index = threadIdx.x+mult;
  101. }
  102.  
  103. mult = i*threads;
  104. }
  105.  
  106. // MAX part
  107. for(; mult+threadIdx.x < num_blocks*2; i++){
  108. val = out[threadIdx.x + mult];
  109.  
  110. if(val > max){
  111. max = val;
  112. max_index = threadIdx.x+mult;
  113. }
  114. mult = i*threads;
  115. }
  116.  
  117.  
  118. if(threads == 32){
  119. smem_min[threadIdx.x+32] = 0.0f;
  120. smem_max[threadIdx.x+32] = 0.0f;
  121. }
  122.  
  123. smem_min[threadIdx.x] = min;
  124. smem_max[threadIdx.x] = max;
  125.  
  126. __syncthreads();
  127.  
  128. if(threadIdx.x < 32){
  129. warp_reduce_min(smem_min);
  130. warp_reduce_max(smem_max);
  131. }
  132.  
  133. __syncthreads();
  134.  
  135. if(threadIdx.x == 0){
  136. out[blockIdx.x] = smem_min[threadIdx.x];
  137. out[blockIdx.x + gridDim.x] = smem_max[threadIdx.x];
  138. }
  139.  
  140. if(smem_min[0] == min){
  141.  
  142. if(min_index >= num_blocks){ // from tail part
  143. indices[0] = min_index;
  144. }
  145. else{
  146. int index = indices[min_index];
  147. indices[0] = index;
  148. }
  149. }
  150.  
  151. if(smem_max[0] == max){
  152.  
  153. if(max_index >= 2*num_blocks){
  154. indices[1] = max_index;
  155. }else{
  156. int index = indices[max_index];
  157. indices[1] = index;
  158. }
  159. }
  160. }
  161. template<int els_per_block, int threads>
  162. __global__ void find_min_max(float *in, float *out, int *indices){
  163.  
  164. __shared__ float smem_min[64];
  165. __shared__ float smem_max[64];
  166.  
  167. int tid = threadIdx.x + blockIdx.x*els_per_block;//this sets up the starting point at which this thread will collect other answers
  168.  
  169. float max = -inf;
  170. float min = inf;
  171. float val;
  172.  
  173. int min_index = -1;
  174. int max_index = -1;
  175.  
  176. const int iters = els_per_block/threads;
  177.  
  178. #pragma unroll
  179. for(int i = 0; i < iters; i++){
  180.  
  181. val = in[tid + i*threads];
  182.  
  183. if(val < min){
  184. min = val;
  185. min_index = tid+i*threads;
  186. }
  187.  
  188. if(val > max){
  189. max = val;
  190. max_index = tid+i*threads;
  191. }
  192. }
  193.  
  194. if(threads==32){
  195. smem_min[threadIdx.x+32] = 0.0f;
  196. smem_max[threadIdx.x+32] = 0.0f;
  197. }
  198.  
  199. smem_min[threadIdx.x] = min;
  200. smem_max[threadIdx.x] = max;
  201.  
  202.  
  203. __syncthreads();
  204.  
  205. if(threadIdx.x < 32){
  206. warp_reduce_min(smem_min);
  207. warp_reduce_max(smem_max);
  208. }
  209.  
  210. __syncthreads();
  211.  
  212. if(threadIdx.x == 0){
  213. out[blockIdx.x] = smem_min[threadIdx.x]; // out[0] == ans
  214. out[blockIdx.x + gridDim.x] = smem_max[threadIdx.x];
  215. }
  216.  
  217. // fix indices
  218.  
  219. if( smem_min[0] == min){
  220. // write min index
  221. indices[blockIdx.x] = min_index; // MIN
  222. }
  223. if(smem_max[0] == max){
  224. indices[blockIdx.x + gridDim.x] = max_index; // MAX
  225. }
  226. }
  227. void compute_reduction(float *d_in, float *d_out, int *d_indices, int num_els){
  228.  
  229. int whichSize = -1;
  230.  
  231. findBlockSize(&whichSize,&num_els);
  232.  
  233. int block_size = int(powf(2.0f,float(whichSize-1))*float(blockSize1));
  234. int num_blocks = num_els/block_size;
  235. int tail = num_els - num_blocks*block_size;
  236. int start_adr = num_els - tail;
  237.  
  238.  
  239. if(whichSize == 1)
  240. find_min_max<blockSize1,threads><<< num_blocks, threads>>>(d_in, d_out, d_indices);
  241. else if(whichSize == 2)
  242. find_min_max<blockSize1*2,threads><<< num_blocks, threads>>>(d_in, d_out, d_indices);
  243. else if(whichSize == 3)
  244. find_min_max<blockSize1*4,threads><<< num_blocks, threads>>>(d_in, d_out, d_indices);
  245. else if(whichSize == 4)
  246. find_min_max<blockSize1*8,threads><<< num_blocks, threads>>>(d_in, d_out, d_indices);
  247. else
  248. find_min_max<blockSize1*16,threads><<< num_blocks, threads>>>(d_in, d_out, d_indices);
  249.  
  250. find_min_max_dynamic<threads><<<1,threads>>>(d_in, d_out, d_indices, num_els, start_adr, num_blocks);
  251.  
  252. }
  253.  
  254.  
  255. int main(){
  256. char ch;
  257. srand(time(NULL));
  258.  
  259. float *Arr=(float *)malloc(NUM_ELEMENTS*sizeof(float));
  260. for(int i=0;i<NUM_ELEMENTS;i++){
  261. Arr[i]=float(rand()+rand()+3)/float(rand()+1);
  262. if(rand()%7==0)Arr[i]*=(float(rand())/float(rand()+7));
  263. if(rand()%11==0)Arr[i]*=-1.0f;
  264. }
  265.  
  266. UINT wTimerRes = 0;
  267. bool init = InitMMTimer(wTimerRes);
  268. cout<<"Starting CPU testing:\n";
  269.  
  270. Pffii cpu_ans=make_pair(make_pair(0.0f,0),make_pair(0.0f,0)),gpu_ans=make_pair(make_pair(0.0f,0),make_pair(0.0f,0));
  271. DWORD startTime = timeGetTime(),GPUtime=0,CPUtime=0;
  272.  
  273. cpu_ans=cpu_max_min(Arr,NUM_ELEMENTS);
  274.  
  275. DWORD endTime = timeGetTime();
  276. CPUtime=endTime-startTime;
  277. cout<<"CPU time: "<<float(CPUtime)/1000.0f<< " seconds.\n";
  278. DestroyMMTimer(wTimerRes, init);
  279. cout<<"The cpu min is:"<<cpu_ans.first.first<<" at index "<<cpu_ans.first.second<<'\n';
  280. cout<<"The cpu max is:"<<cpu_ans.second.first<<" at index "<<cpu_ans.second.second<<'\n';
  281.  
  282. if(DO_TEST){
  283.  
  284. int index_size=4096,num_els=NUM_ELEMENTS;
  285. cout<<"Starting GPU testing:\n";
  286. float *d_in,*d_out;
  287. int *idx_val;
  288. cudaError_t err=cudaDeviceReset();
  289. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  290.  
  291. err=cudaMalloc((void **)&d_in,NUM_ELEMENTS*sizeof(float));
  292. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  293.  
  294. err=cudaMalloc((void **)&d_out,NUM_ELEMENTS*sizeof(float));
  295. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  296.  
  297. err=cudaMalloc((void **)&idx_val,index_size*sizeof(int));
  298. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  299.  
  300. err=cudaMemcpy(d_in,Arr,NUM_ELEMENTS*sizeof(float),_HTD);
  301. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  302.  
  303. init = InitMMTimer(wTimerRes);
  304. startTime = timeGetTime();
  305.  
  306. compute_reduction(d_in,d_out,idx_val,num_els);
  307. err=cudaThreadSynchronize();
  308. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  309.  
  310. err=cudaMemcpy(&gpu_ans.first.first,d_out,sizeof(float),_DTH);
  311. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  312.  
  313. err=cudaMemcpy(&gpu_ans.second.first,d_out+1,sizeof(float),_DTH);
  314. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  315.  
  316. err=cudaMemcpy(&gpu_ans.first.second,idx_val,sizeof(int),_DTH);
  317. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  318.  
  319. err=cudaMemcpy(&gpu_ans.second.second,idx_val+1,sizeof(int),_DTH);
  320. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  321.  
  322. endTime = timeGetTime();
  323. GPUtime=endTime-startTime;
  324. DestroyMMTimer(wTimerRes, init);
  325.  
  326. err=cudaFree(d_in);
  327. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  328. err=cudaFree(d_out);
  329. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  330. err=cudaFree(idx_val);
  331. if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
  332.  
  333. cout<<"GPU timing: "<<float(GPUtime)/1000.0f<<" seconds.\n";
  334. cout<<"The gpu min is:"<<gpu_ans.first.first<<" at index "<<gpu_ans.first.second<<'\n';
  335. cout<<"The gpu max is:"<<gpu_ans.second.first<<" at index "<<gpu_ans.second.second<<'\n';
  336. }
  337.  
  338. free(Arr);
  339. cin>>ch;
  340. return 0;
  341. }
  342.  
  343. bool InitMMTimer(UINT wTimerRes){
  344. TIMECAPS tc;
  345. if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) {return false;}
  346. wTimerRes = min(max(tc.wPeriodMin, 1), tc.wPeriodMax);
  347. timeBeginPeriod(wTimerRes);
  348. return true;
  349. }
  350.  
  351. void DestroyMMTimer(UINT wTimerRes, bool init){
  352. if(init)
  353. timeEndPeriod(wTimerRes);
  354. }
  355. Pffii cpu_max_min(const float *Arr,const int sz){
  356. float amax=-9999999.9f,amin=9999999.9f;
  357. int idx_max=-1,idx_min=-1;
  358. for(int i=0;i<sz;i++){
  359. if(Arr[i]>amax){
  360. amax=Arr[i];
  361. idx_max=i;
  362. }
  363. if(Arr[i]<amin){
  364. amin=Arr[i];
  365. idx_min=i;
  366. }
  367. }
  368. return make_pair(make_pair(amin,idx_min),make_pair(amax,idx_max));
  369. }
  370.  
  371. void findBlockSize(int *whichSize, int *num_el){
  372.  
  373. const float pretty_big_number = 24.0f*1024.0f*1024.0f;
  374.  
  375. float ratio = float((*num_el))/pretty_big_number;
  376.  
  377. if(ratio > 0.8f)
  378. (*whichSize) = 5;
  379. else if(ratio > 0.6f)
  380. (*whichSize) = 4;
  381. else if(ratio > 0.4f)
  382. (*whichSize) = 3;
  383. else if(ratio > 0.2f)
  384. (*whichSize) = 2;
  385. else
  386. (*whichSize) = 1;
  387. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement