Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <algorithm>
- #include <iostream>
- #include <utility>
- #include <cstdlib>
- #include <cstdio>
- #include <cstring>
- #include <string>
- #include <cmath>
- #include <ctime>
- #include <cuda.h>
- #include <math_functions.h>
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include "device_functions.h"
- //#define WIN32_LEAN_AND_MEAN
- #include <Windows.h>
- #include <MMSystem.h>
- #pragma comment(lib, "winmm.lib")
- #define _CRTDBG_MAP_ALLOC
- #include <crtdbg.h>//to detect host memory leaks, so far no leaks
- using namespace std;
- #define _DTH cudaMemcpyDeviceToHost
- #define _DTD cudaMemcpyDeviceToDevice
- #define _HTD cudaMemcpyHostToDevice
- #define THREADS 64
- #define NUM_ELEMENTS (1<<24)
- #define inf 99999999.9f
- #define DO_TEST 1
- const int blockSize1 = 2048;
- const int threads = 64;
- typedef pair<float,int> Pfi;
- typedef pair<Pfi,Pfi> Pffii;
- bool InitMMTimer(UINT wTimerRes);
- void DestroyMMTimer(UINT wTimerRes, bool init);
- Pffii cpu_max_min(const float *Arr,const int sz);
- void findBlockSize(int *whichSize, int *num_el);
- __device__ void warp_reduce_max( float smem[64]){
- smem[threadIdx.x] = smem[threadIdx.x+32] > smem[threadIdx.x] ? smem[threadIdx.x+32] : smem[threadIdx.x];
- smem[threadIdx.x] = smem[threadIdx.x+16] > smem[threadIdx.x] ? smem[threadIdx.x+16] : smem[threadIdx.x];
- smem[threadIdx.x] = smem[threadIdx.x+8] > smem[threadIdx.x] ? smem[threadIdx.x+8] : smem[threadIdx.x];
- smem[threadIdx.x] = smem[threadIdx.x+4] > smem[threadIdx.x] ? smem[threadIdx.x+4] : smem[threadIdx.x];
- smem[threadIdx.x] = smem[threadIdx.x+2] > smem[threadIdx.x] ? smem[threadIdx.x+2] : smem[threadIdx.x];
- smem[threadIdx.x] = smem[threadIdx.x+1] > smem[threadIdx.x] ? smem[threadIdx.x+1] : smem[threadIdx.x];
- }
- __device__ void warp_reduce_min( float smem[64]){
- smem[threadIdx.x] = smem[threadIdx.x+32] < smem[threadIdx.x] ? smem[threadIdx.x+32] : smem[threadIdx.x];
- smem[threadIdx.x] = smem[threadIdx.x+16] < smem[threadIdx.x] ? smem[threadIdx.x+16] : smem[threadIdx.x];
- smem[threadIdx.x] = smem[threadIdx.x+8] < smem[threadIdx.x] ? smem[threadIdx.x+8] : smem[threadIdx.x];
- smem[threadIdx.x] = smem[threadIdx.x+4] < smem[threadIdx.x] ? smem[threadIdx.x+4] : smem[threadIdx.x];
- smem[threadIdx.x] = smem[threadIdx.x+2] < smem[threadIdx.x] ? smem[threadIdx.x+2] : smem[threadIdx.x];
- smem[threadIdx.x] = smem[threadIdx.x+1] < smem[threadIdx.x] ? smem[threadIdx.x+1] : smem[threadIdx.x];
- }
- template<int threads>
- __global__ void find_min_max_dynamic(float* in, float* out, int* indices, int n, int start_adr, int num_blocks){//this is the last step which handles the tail and brings together the other results
- __shared__ float smem_min[64];
- __shared__ float smem_max[64];
- int tid = threadIdx.x + start_adr;
- float max = -inf;
- float min = inf;
- float val;
- int min_index = -1;
- int max_index = -1;
- // tail part
- int mult = 0;
- for(int i = 1; mult + tid < n; i++){
- val = in[tid + mult];
- if( val < min){
- min = val;
- min_index = tid+mult;
- }
- if(val > max){
- max = val;
- max_index = tid+mult;
- }
- mult = i*threads;
- }
- // previously reduced MIN part
- mult = 0;
- int i;
- for(i = 1; mult+threadIdx.x < num_blocks; i++){
- val = out[threadIdx.x + mult];
- if( val < min){
- min = val;
- min_index = threadIdx.x+mult;
- }
- mult = i*threads;
- }
- // MAX part
- for(; mult+threadIdx.x < num_blocks*2; i++){
- val = out[threadIdx.x + mult];
- if(val > max){
- max = val;
- max_index = threadIdx.x+mult;
- }
- mult = i*threads;
- }
- if(threads == 32){
- smem_min[threadIdx.x+32] = 0.0f;
- smem_max[threadIdx.x+32] = 0.0f;
- }
- smem_min[threadIdx.x] = min;
- smem_max[threadIdx.x] = max;
- __syncthreads();
- if(threadIdx.x < 32){
- warp_reduce_min(smem_min);
- warp_reduce_max(smem_max);
- }
- __syncthreads();
- if(threadIdx.x == 0){
- out[blockIdx.x] = smem_min[threadIdx.x];
- out[blockIdx.x + gridDim.x] = smem_max[threadIdx.x];
- }
- if(smem_min[0] == min){
- if(min_index >= num_blocks){ // from tail part
- indices[0] = min_index;
- }
- else{
- int index = indices[min_index];
- indices[0] = index;
- }
- }
- if(smem_max[0] == max){
- if(max_index >= 2*num_blocks){
- indices[1] = max_index;
- }else{
- int index = indices[max_index];
- indices[1] = index;
- }
- }
- }
- template<int els_per_block, int threads>
- __global__ void find_min_max(float *in, float *out, int *indices){
- __shared__ float smem_min[64];
- __shared__ float smem_max[64];
- int tid = threadIdx.x + blockIdx.x*els_per_block;//this sets up the starting point at which this thread will collect other answers
- float max = -inf;
- float min = inf;
- float val;
- int min_index = -1;
- int max_index = -1;
- const int iters = els_per_block/threads;
- #pragma unroll
- for(int i = 0; i < iters; i++){
- val = in[tid + i*threads];
- if(val < min){
- min = val;
- min_index = tid+i*threads;
- }
- if(val > max){
- max = val;
- max_index = tid+i*threads;
- }
- }
- if(threads==32){
- smem_min[threadIdx.x+32] = 0.0f;
- smem_max[threadIdx.x+32] = 0.0f;
- }
- smem_min[threadIdx.x] = min;
- smem_max[threadIdx.x] = max;
- __syncthreads();
- if(threadIdx.x < 32){
- warp_reduce_min(smem_min);
- warp_reduce_max(smem_max);
- }
- __syncthreads();
- if(threadIdx.x == 0){
- out[blockIdx.x] = smem_min[threadIdx.x]; // out[0] == ans
- out[blockIdx.x + gridDim.x] = smem_max[threadIdx.x];
- }
- // fix indices
- if( smem_min[0] == min){
- // write min index
- indices[blockIdx.x] = min_index; // MIN
- }
- if(smem_max[0] == max){
- indices[blockIdx.x + gridDim.x] = max_index; // MAX
- }
- }
- void compute_reduction(float *d_in, float *d_out, int *d_indices, int num_els){
- int whichSize = -1;
- findBlockSize(&whichSize,&num_els);
- int block_size = int(powf(2.0f,float(whichSize-1))*float(blockSize1));
- int num_blocks = num_els/block_size;
- int tail = num_els - num_blocks*block_size;
- int start_adr = num_els - tail;
- if(whichSize == 1)
- find_min_max<blockSize1,threads><<< num_blocks, threads>>>(d_in, d_out, d_indices);
- else if(whichSize == 2)
- find_min_max<blockSize1*2,threads><<< num_blocks, threads>>>(d_in, d_out, d_indices);
- else if(whichSize == 3)
- find_min_max<blockSize1*4,threads><<< num_blocks, threads>>>(d_in, d_out, d_indices);
- else if(whichSize == 4)
- find_min_max<blockSize1*8,threads><<< num_blocks, threads>>>(d_in, d_out, d_indices);
- else
- find_min_max<blockSize1*16,threads><<< num_blocks, threads>>>(d_in, d_out, d_indices);
- find_min_max_dynamic<threads><<<1,threads>>>(d_in, d_out, d_indices, num_els, start_adr, num_blocks);
- }
- int main(){
- char ch;
- srand(time(NULL));
- float *Arr=(float *)malloc(NUM_ELEMENTS*sizeof(float));
- for(int i=0;i<NUM_ELEMENTS;i++){
- Arr[i]=float(rand()+rand()+3)/float(rand()+1);
- if(rand()%7==0)Arr[i]*=(float(rand())/float(rand()+7));
- if(rand()%11==0)Arr[i]*=-1.0f;
- }
- UINT wTimerRes = 0;
- bool init = InitMMTimer(wTimerRes);
- cout<<"Starting CPU testing:\n";
- Pffii cpu_ans=make_pair(make_pair(0.0f,0),make_pair(0.0f,0)),gpu_ans=make_pair(make_pair(0.0f,0),make_pair(0.0f,0));
- DWORD startTime = timeGetTime(),GPUtime=0,CPUtime=0;
- cpu_ans=cpu_max_min(Arr,NUM_ELEMENTS);
- DWORD endTime = timeGetTime();
- CPUtime=endTime-startTime;
- cout<<"CPU time: "<<float(CPUtime)/1000.0f<< " seconds.\n";
- DestroyMMTimer(wTimerRes, init);
- cout<<"The cpu min is:"<<cpu_ans.first.first<<" at index "<<cpu_ans.first.second<<'\n';
- cout<<"The cpu max is:"<<cpu_ans.second.first<<" at index "<<cpu_ans.second.second<<'\n';
- if(DO_TEST){
- int index_size=4096,num_els=NUM_ELEMENTS;
- cout<<"Starting GPU testing:\n";
- float *d_in,*d_out;
- int *idx_val;
- cudaError_t err=cudaDeviceReset();
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- err=cudaMalloc((void **)&d_in,NUM_ELEMENTS*sizeof(float));
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- err=cudaMalloc((void **)&d_out,NUM_ELEMENTS*sizeof(float));
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- err=cudaMalloc((void **)&idx_val,index_size*sizeof(int));
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- err=cudaMemcpy(d_in,Arr,NUM_ELEMENTS*sizeof(float),_HTD);
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- init = InitMMTimer(wTimerRes);
- startTime = timeGetTime();
- compute_reduction(d_in,d_out,idx_val,num_els);
- err=cudaThreadSynchronize();
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- err=cudaMemcpy(&gpu_ans.first.first,d_out,sizeof(float),_DTH);
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- err=cudaMemcpy(&gpu_ans.second.first,d_out+1,sizeof(float),_DTH);
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- err=cudaMemcpy(&gpu_ans.first.second,idx_val,sizeof(int),_DTH);
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- err=cudaMemcpy(&gpu_ans.second.second,idx_val+1,sizeof(int),_DTH);
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- endTime = timeGetTime();
- GPUtime=endTime-startTime;
- DestroyMMTimer(wTimerRes, init);
- err=cudaFree(d_in);
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- err=cudaFree(d_out);
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- err=cudaFree(idx_val);
- if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
- cout<<"GPU timing: "<<float(GPUtime)/1000.0f<<" seconds.\n";
- cout<<"The gpu min is:"<<gpu_ans.first.first<<" at index "<<gpu_ans.first.second<<'\n';
- cout<<"The gpu max is:"<<gpu_ans.second.first<<" at index "<<gpu_ans.second.second<<'\n';
- }
- free(Arr);
- cin>>ch;
- return 0;
- }
- bool InitMMTimer(UINT wTimerRes){
- TIMECAPS tc;
- if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) {return false;}
- wTimerRes = min(max(tc.wPeriodMin, 1), tc.wPeriodMax);
- timeBeginPeriod(wTimerRes);
- return true;
- }
- void DestroyMMTimer(UINT wTimerRes, bool init){
- if(init)
- timeEndPeriod(wTimerRes);
- }
- Pffii cpu_max_min(const float *Arr,const int sz){
- float amax=-9999999.9f,amin=9999999.9f;
- int idx_max=-1,idx_min=-1;
- for(int i=0;i<sz;i++){
- if(Arr[i]>amax){
- amax=Arr[i];
- idx_max=i;
- }
- if(Arr[i]<amin){
- amin=Arr[i];
- idx_min=i;
- }
- }
- return make_pair(make_pair(amin,idx_min),make_pair(amax,idx_max));
- }
- void findBlockSize(int *whichSize, int *num_el){
- const float pretty_big_number = 24.0f*1024.0f*1024.0f;
- float ratio = float((*num_el))/pretty_big_number;
- if(ratio > 0.8f)
- (*whichSize) = 5;
- else if(ratio > 0.6f)
- (*whichSize) = 4;
- else if(ratio > 0.4f)
- (*whichSize) = 3;
- else if(ratio > 0.2f)
- (*whichSize) = 2;
- else
- (*whichSize) = 1;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement