Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>

#include <sys/time.h>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
/**
 * Fills the first N entries of a buffer with a random non-decreasing sequence.
 *
 * Each entry equals the previous one plus rand() % 2 (the first entry is
 * 0 + rand() % 2), so equal values always form contiguous runs.
 * The generator is re-seeded from the current time on every call.
 *
 * @param dev_data buffer that is written directly by host code; it must hold
 *                 at least N ints
 * @param N        number of entries to generate
 */
__host__ void initialize(int*& dev_data, int N){
    const int stepRange = 2;    // each step is rand() % stepRange, i.e. 0 or 1
    srand(time(NULL));

    int running = 0;            // value written to the previous slot
    for(int slot = 0; slot < N; ++slot){
        running += rand() % stepRange;
        dev_data[slot] = running;
    }
}
/**
 * Marks the start of every run of equal values in device_data.
 *
 * For each element i in [0, N): pos_ptr[i] is set to i when element i starts
 * a run (i == 0, or device_data[i-1] != device_data[i]); otherwise it is set
 * to a sentinel of INT_MAX, which is larger than any valid index so that an
 * ascending sort moves all sentinels to the end.
 *
 * Launch layout: 1D grid of 1D blocks. Global thread t handles the
 * contiguous elements [t * totalAmountOfValuesPerThread,
 * (t + 1) * totalAmountOfValuesPerThread), clipped to N. The launch must
 * provide at least ceil(N / totalAmountOfValuesPerThread) threads; surplus
 * threads do nothing. No shared memory is used.
 *
 * @param device_data                  input values (read only), N ints
 * @param totalAmountOfValuesPerThread elements handled per thread, must be >= 1
 * @param pos_ptr                      output buffer, N ints
 * @param N                            number of elements
 */
__global__ void findPositions(int *device_data, int totalAmountOfValuesPerThread, int* pos_ptr, int N){
    // INT_MAX: can never collide with a valid index, unlike a small magic number.
    const int kNoBoundary = 0x7fffffff;

    // 64-bit arithmetic so the product cannot overflow for large grids.
    const long long globalThread = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    const long long first = globalThread * totalAmountOfValuesPerThread;

    // Honour the per-thread element count instead of assuming exactly two.
    for(int k = 0; k < totalAmountOfValuesPerThread; ++k){
        const long long i = first + k;
        if(i >= N) break;   // past the end of the data: nothing left for this thread

        // Element 0 always starts a run; any other element starts one when it
        // differs from its left neighbour.
        const bool startsRun = (i == 0) || (device_data[i-1] != device_data[i]);
        pos_ptr[i] = startsRun ? (int)i : kNoBoundary;
    }
}
// Terminates the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char* what){
    if(status != cudaSuccess){
        std::cerr<<"CUDA error during "<<what<<": "<<cudaGetErrorString(status)<<std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Returns the current wall-clock time in seconds (microsecond resolution).
static double wallClockSeconds(){
    timeval tim;
    gettimeofday(&tim, NULL);
    return tim.tv_sec + (tim.tv_usec/1000000.0);
}

/**
 * Benchmark driver: generates a non-decreasing sequence, finds the first
 * index of every run of equal values on the CPU and on the GPU, prints the
 * time taken by each, and verifies that both agree.
 *
 * @return EXIT_SUCCESS when the GPU result matches the CPU reference,
 *         EXIT_FAILURE otherwise (or when a CUDA call fails).
 */
__host__ int main(void){
    const int N = 100;
    const int threadsPerBlock = 1024;
    const int totalAmountOfValuesPerThread = 2;

    // Each thread covers totalAmountOfValuesPerThread elements, so size the
    // grid from the number of threads actually needed (ceil-div twice).
    const int threadsNeeded = (N + totalAmountOfValuesPerThread - 1)/totalAmountOfValuesPerThread;
    int totalBlocks = (threadsNeeded + threadsPerBlock - 1)/threadsPerBlock;
    if(totalBlocks < 1) totalBlocks = 1;    // a zero-sized grid is an invalid launch

    const size_t sizeOfDataInBytes = N*sizeof(int);

    // Host input lives in a vector so it is released automatically.
    std::vector<int> host_storage(N);
    int *host_data = &host_storage[0];      // initialize() takes an int*& lvalue
    initialize(host_data, N);

    // ---- CPU reference: record the first index of every run ----
    double before = wallClockSeconds();
    std::vector<int> results;
    int index = 0;
    while(index<N){
        results.push_back(index);
        int nextIndex = index+1;
        while(nextIndex<N && host_data[index] == host_data[nextIndex])
            nextIndex++;
        index = nextIndex;
    }
    double after = wallClockSeconds();
    std::cout<<"CPU time: "<<after-before<<" seconds"<<std::endl;

    // ---- GPU version: timed region covers allocation, transfers, kernel, sort and free ----
    before = wallClockSeconds();
    int *device_data = NULL;
    checkCuda(cudaMalloc((void**)&device_data, sizeOfDataInBytes), "cudaMalloc(device_data)");
    checkCuda(cudaMemcpy(device_data, host_data, sizeOfDataInBytes, cudaMemcpyHostToDevice),
              "host-to-device copy");

    // One output slot per input element: run starts hold their own index,
    // every other slot holds a sentinel that is larger than any index.
    thrust::device_vector<int> dev_pos(N);
    int* pos_ptr = thrust::raw_pointer_cast(dev_pos.data());

    dim3 dimGrid(totalBlocks,1,1);
    dim3 dimBlock(threadsPerBlock, 1, 1);
    findPositions<<<dimGrid, dimBlock>>>(device_data, totalAmountOfValuesPerThread, pos_ptr, N);
    checkCuda(cudaGetLastError(), "findPositions launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "findPositions execution");  // faults inside the kernel

    // Ascending sort compacts the real positions to the front.
    thrust::sort(dev_pos.begin(), dev_pos.end());
    thrust::host_vector<int> host_pos(N);
    thrust::copy(dev_pos.begin(), dev_pos.end(), host_pos.begin());

    // Free device memory inside the timed region, as the measurement always did.
    checkCuda(cudaFree(device_data), "cudaFree(device_data)");
    dev_pos.clear();
    dev_pos.shrink_to_fit();
    after = wallClockSeconds();
    std::cout<<"GPU time: "<<after-before<<" seconds"<<std::endl;

    // ---- Check correctness against the CPU reference ----
    bool correct = true;
    for(size_t i=0; i<results.size(); i++){
        if(results[i] != host_pos[i]){
            correct = false;
            std::cout<<"CPU: "<<results[i]<<" GPU: "<<host_pos[i]<<" index: "<<i<<std::endl;
            break;
        }
    }
    // The slot right after the last expected position must not be a valid
    // index (< N); otherwise the GPU reported a boundary the CPU did not.
    if(correct && results.size() < host_pos.size() && host_pos[results.size()] < N){
        correct = false;
        std::cout<<"GPU reported an extra position: "<<host_pos[results.size()]<<std::endl;
    }

    if(correct) std::cout<<"GPU implementation is correct"<<std::endl;
    else std::cout<<"GPU implementation is wrong"<<std::endl;
    return correct ? EXIT_SUCCESS : EXIT_FAILURE;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement