Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <cuda.h>
- #include <device_functions.h>
- #include <stdio.h>
- #include <iostream>
- #include <conio.h>
- #include <fstream>
- #include <sstream>
- #include <string>
- #include <iomanip>
- #define ARRAY_SIZE 12000
- #define BLOCK_NUMBER 24
- #define THREAD_NUMBER 512
/**
 * Block-level sum reduction using interleaved addressing with a modulo test.
 *
 * Each block loads one element per thread into shared memory, reduces the
 * block's slice in place, and thread 0 writes the block's partial sum to
 * outputData[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid, 1D block, blockDim.x <= THREAD_NUMBER (size of the shared array).
 *   - inputData holds ARRAY_SIZE ints; threads whose global index falls past
 *     the end contribute 0 instead of reading out of bounds.
 *   - outputData holds at least gridDim.x ints.
 *
 * @param inputData  device pointer to the ARRAY_SIZE input values
 * @param outputData device pointer receiving one partial sum per block
 */
__global__ void kernel1(int *inputData, int *outputData) {
    // Only blockDim.x slots are ever touched, so size the array by the block,
    // not by the whole input.
    __shared__ int data[THREAD_NUMBER];

    unsigned int tid = threadIdx.x;
    unsigned int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid may cover more threads than there are elements: pad with 0.
    data[tid] = (globalIndex < ARRAY_SIZE) ? inputData[globalIndex] : 0;
    __syncthreads();

    // Reduction: at each step every (2*stride)-th thread absorbs its neighbour
    // located `stride` slots to the right.
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        // The second condition keeps the access in range for block sizes that
        // are not a power of two.
        if (tid % (2 * stride) == 0 && tid + stride < blockDim.x) {
            data[tid] += data[tid + stride];
        }
        __syncthreads();  // reached by every thread: outside the branch
    }

    if (tid == 0) {
        outputData[blockIdx.x] = data[0];
    }
}
/**
 * Block-level sum reduction using interleaved addressing with a strided index
 * (no modulo, so the active threads are the contiguous low thread ids).
 *
 * Each block loads one element per thread into shared memory, reduces the
 * block's slice in place, and thread 0 writes the block's partial sum to
 * outputData[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid, 1D block, blockDim.x <= THREAD_NUMBER (size of the shared array).
 *   - inputData holds ARRAY_SIZE ints; threads whose global index falls past
 *     the end contribute 0 instead of reading out of bounds.
 *   - outputData holds at least gridDim.x ints.
 *
 * @param inputData  device pointer to the ARRAY_SIZE input values
 * @param outputData device pointer receiving one partial sum per block
 */
__global__ void kernel2(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];

    unsigned int tid = threadIdx.x;
    unsigned int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid may cover more threads than there are elements: pad with 0.
    data[tid] = (globalIndex < ARRAY_SIZE) ? inputData[globalIndex] : 0;
    __syncthreads();

    // Reduction: thread `tid` owns slot 2*stride*tid and absorbs the slot
    // `stride` to its right. The strided slot (not tid itself) must be the one
    // that is read and written, otherwise neighbouring threads race on the
    // same elements and the sum is wrong.
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        unsigned int slot = 2 * stride * tid;
        // slot + stride check keeps non-power-of-two block sizes in range.
        if (slot + stride < blockDim.x) {
            data[slot] += data[slot + stride];
        }
        __syncthreads();  // reached by every thread: outside the branch
    }

    if (tid == 0) {
        outputData[blockIdx.x] = data[0];
    }
}
/**
 * Block-level sum reduction using sequential addressing: at every step the
 * lower half of the active range absorbs the upper half.
 *
 * Each block loads one element per thread into shared memory, reduces the
 * block's slice in place, and thread 0 writes the block's partial sum to
 * outputData[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid, 1D block, blockDim.x a power of two and <= THREAD_NUMBER
 *     (the halving loop drops elements for other block sizes).
 *   - inputData holds ARRAY_SIZE ints; threads whose global index falls past
 *     the end contribute 0 instead of reading out of bounds.
 *   - outputData holds at least gridDim.x ints.
 *
 * @param inputData  device pointer to the ARRAY_SIZE input values
 * @param outputData device pointer receiving one partial sum per block
 */
__global__ void kernel3(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];

    unsigned int tid = threadIdx.x;
    unsigned int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid may cover more threads than there are elements: pad with 0.
    data[tid] = (globalIndex < ARRAY_SIZE) ? inputData[globalIndex] : 0;
    __syncthreads();

    // Reduction: halve the active range until only slot 0 remains.
    for (unsigned int half = blockDim.x / 2; half > 0; half >>= 1) {
        if (tid < half) {
            data[tid] += data[tid + half];
        }
        __syncthreads();  // reached by every thread: outside the branch
    }

    if (tid == 0) {
        outputData[blockIdx.x] = data[0];
    }
}
/**
 * Block-level sum reduction with "first add during load": every thread sums
 * two input elements while filling shared memory, so one block consumes
 * 2 * blockDim.x elements, followed by a sequential-addressing reduction.
 *
 * Thread 0 writes the block's partial sum to outputData[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid, 1D block, blockDim.x a power of two and <= THREAD_NUMBER.
 *   - inputData holds ARRAY_SIZE ints. ceil(ARRAY_SIZE / (2*blockDim.x))
 *     blocks are enough; extra blocks are harmless and write 0.
 *   - outputData holds at least gridDim.x ints.
 *
 * @param inputData  device pointer to the ARRAY_SIZE input values
 * @param outputData device pointer receiving one partial sum per block
 */
__global__ void kernel4(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];

    unsigned int tid = threadIdx.x;
    // Each block covers a window of 2*blockDim.x elements, so the block base
    // must advance by that amount; otherwise windows overlap and elements are
    // counted twice.
    unsigned int first = blockIdx.x * (blockDim.x * 2) + tid;
    unsigned int second = first + blockDim.x;

    // Guard both loads: the last window may be partial or entirely empty.
    int loaded = 0;
    if (first < ARRAY_SIZE) {
        loaded += inputData[first];
    }
    if (second < ARRAY_SIZE) {
        loaded += inputData[second];
    }
    data[tid] = loaded;
    __syncthreads();

    // Reduction: halve the active range until only slot 0 remains.
    for (unsigned int half = blockDim.x / 2; half > 0; half >>= 1) {
        if (tid < half) {
            data[tid] += data[tid + half];
        }
        __syncthreads();  // reached by every thread: outside the branch
    }

    if (tid == 0) {
        outputData[blockIdx.x] = data[0];
    }
}
/**
 * Block-level sum reduction with "first add during load" and a dedicated
 * final-warp stage: the shared-memory tree stops once 64 partial sums remain,
 * and the first warp finishes with register shuffles.
 *
 * Thread 0 writes the block's partial sum to outputData[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid, 1D block, blockDim.x a power of two with
 *     32 <= blockDim.x <= THREAD_NUMBER (the full-mask shuffle needs a
 *     complete first warp).
 *   - inputData holds ARRAY_SIZE ints. ceil(ARRAY_SIZE / (2*blockDim.x))
 *     blocks are enough; extra blocks are harmless and write 0.
 *   - outputData holds at least gridDim.x ints.
 *   - Uses __shfl_down_sync: requires CUDA 9+ and compute capability 3.0+.
 *
 * @param inputData  device pointer to the ARRAY_SIZE input values
 * @param outputData device pointer receiving one partial sum per block
 */
__global__ void kernel5(int *inputData, int *outputData) {
    __shared__ int data[THREAD_NUMBER];

    unsigned int tid = threadIdx.x;
    // Each block covers a window of 2*blockDim.x elements, so the block base
    // must advance by that amount; otherwise windows overlap and elements are
    // counted twice.
    unsigned int first = blockIdx.x * (blockDim.x * 2) + tid;
    unsigned int second = first + blockDim.x;

    // Guard both loads: the last window may be partial or entirely empty.
    int loaded = 0;
    if (first < ARRAY_SIZE) {
        loaded += inputData[first];
    }
    if (second < ARRAY_SIZE) {
        loaded += inputData[second];
    }
    data[tid] = loaded;
    __syncthreads();

    // Shared-memory tree down to 64 remaining partial sums.
    for (unsigned int half = blockDim.x / 2; half > 32; half >>= 1) {
        if (tid < half) {
            data[tid] += data[tid + half];
        }
        __syncthreads();  // reached by every thread: outside the branch
    }

    // Final warp. Plain shared-memory "data[t] += data[t + k]" chains are a
    // data race without volatile/__syncwarp (threads of a warp are not
    // guaranteed to run in lockstep), so the last steps exchange values
    // through registers with synchronising shuffles instead.
    if (tid < 32) {
        int partial = data[tid];
        if (blockDim.x >= 64) {
            partial += data[tid + 32];
        }
        for (int offset = 16; offset > 0; offset >>= 1) {
            partial += __shfl_down_sync(0xffffffffu, partial, offset);
        }
        if (tid == 0) {
            outputData[blockIdx.x] = partial;
        }
    }
}
/**
 * Block-level sum reduction, fully unrolled on the compile-time block size,
 * with "first add during load" and a shuffle-based final warp.
 *
 * Thread 0 writes the block's partial sum to outputData[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid, 1D block with blockDim.x == blockSize.
 *   - blockSize is a power of two with 32 <= blockSize <= 1024.
 *   - inputData holds ARRAY_SIZE ints. ceil(ARRAY_SIZE / (2*blockSize))
 *     blocks are enough; extra blocks are harmless and write 0.
 *   - outputData holds at least gridDim.x ints.
 *   - Uses __shfl_down_sync: requires CUDA 9+ and compute capability 3.0+.
 *
 * @tparam blockSize number of threads per block used for the launch
 * @param inputData  device pointer to the ARRAY_SIZE input values
 * @param outputData device pointer receiving one partial sum per block
 */
template<unsigned int blockSize> __global__ void kernel6(int *inputData, int *outputData) {
    __shared__ int data[blockSize];

    unsigned int tid = threadIdx.x;
    // Each block covers a window of 2*blockSize elements, so the block base
    // must advance by that amount; otherwise windows overlap and elements are
    // counted twice.
    unsigned int first = blockIdx.x * (blockSize * 2) + tid;
    unsigned int second = first + blockSize;

    // Guard both loads: the last window may be partial or entirely empty.
    int loaded = 0;
    if (first < ARRAY_SIZE) {
        loaded += inputData[first];
    }
    if (second < ARRAY_SIZE) {
        loaded += inputData[second];
    }
    data[tid] = loaded;
    __syncthreads();

    // Unrolled tree. The blockSize tests are resolved at compile time.
    // __syncthreads() must sit OUTSIDE the tid-dependent branch: a barrier
    // that only part of the block reaches is undefined behaviour.
    if (blockSize >= 1024) {
        if (tid < 512) {
            data[tid] += data[tid + 512];
        }
        __syncthreads();
    }
    if (blockSize >= 512) {
        if (tid < 256) {
            data[tid] += data[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) {
            data[tid] += data[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) {
            data[tid] += data[tid + 64];
        }
        __syncthreads();
    }

    // Final warp: exchange through registers with synchronising shuffles;
    // unsynchronised shared-memory chains are a data race within the warp.
    if (tid < 32) {
        int partial = data[tid];
        if (blockSize >= 64) {
            partial += data[tid + 32];
        }
        for (int offset = 16; offset > 0; offset >>= 1) {
            partial += __shfl_down_sync(0xffffffffu, partial, offset);
        }
        if (tid == 0) {
            outputData[blockIdx.x] = partial;
        }
    }
}
/**
 * Block-level sum reduction where every thread first accumulates several
 * input elements (stepping through the input by 2 * blockSize * gridDim.x),
 * followed by a tree unrolled on the compile-time block size and a
 * shuffle-based final warp.
 *
 * Thread 0 writes the block's partial sum to outputData[blockIdx.x]. Any
 * grid size yields a correct total; blocks with no element in range write 0.
 *
 * Launch requirements:
 *   - 1D grid, 1D block with blockDim.x == blockSize.
 *   - blockSize is a power of two with 32 <= blockSize <= 1024.
 *   - inputData holds at least n ints, outputData at least gridDim.x ints.
 *   - Uses __shfl_down_sync: requires CUDA 9+ and compute capability 3.0+.
 *
 * @tparam blockSize number of threads per block used for the launch
 * @param inputData  device pointer to the input values
 * @param outputData device pointer receiving one partial sum per block
 * @param n          number of valid elements in inputData
 */
template<unsigned int blockSize>__global__ void kernel7(int *inputData, int *outputData, unsigned int n) {
    __shared__ int data[blockSize];

    unsigned int tid = threadIdx.x;
    // Unsigned indices: n is unsigned, so signed/unsigned comparison is avoided.
    unsigned int index = blockIdx.x * (blockSize * 2) + tid;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    // Accumulate in a register; the second element of each pair needs its own
    // bounds check because index < n does not imply index + blockSize < n.
    int loaded = 0;
    while (index < n) {
        loaded += inputData[index];
        if (index + blockSize < n) {
            loaded += inputData[index + blockSize];
        }
        index += gridSize;
    }
    data[tid] = loaded;
    __syncthreads();

    // Unrolled tree. The blockSize tests are resolved at compile time.
    // __syncthreads() must sit OUTSIDE the tid-dependent branch: a barrier
    // that only part of the block reaches is undefined behaviour.
    if (blockSize >= 1024) {
        if (tid < 512) {
            data[tid] += data[tid + 512];
        }
        __syncthreads();
    }
    if (blockSize >= 512) {
        if (tid < 256) {
            data[tid] += data[tid + 256];
        }
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128) {
            data[tid] += data[tid + 128];
        }
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64) {
            data[tid] += data[tid + 64];
        }
        __syncthreads();
    }

    // Final warp: exchange through registers with synchronising shuffles;
    // unsynchronised shared-memory chains are a data race within the warp.
    if (tid < 32) {
        int partial = data[tid];
        if (blockSize >= 64) {
            partial += data[tid + 32];
        }
        for (int offset = 16; offset > 0; offset >>= 1) {
            partial += __shfl_down_sync(0xffffffffu, partial, offset);
        }
        if (tid == 0) {
            outputData[blockIdx.x] = partial;
        }
    }
}
/**
 * Finishes one benchmark step: waits for the kernel launched between `start`
 * and `stop`, copies its BLOCK_NUMBER per-block partial sums to the host,
 * adds them up and prints the title, the elapsed time and the total.
 *
 * Errors are reported on stdout and execution continues (best effort).
 *
 * @param title          first output line, e.g. "Kernel 1"
 * @param resultLabel    text printed between the time and the total
 * @param start          event recorded immediately before the launch
 * @param stop           event recorded immediately after the launch
 * @param devicePartials device buffer of BLOCK_NUMBER per-block sums
 */
static void reportReduction(const char *title, const char *resultLabel,
                            cudaEvent_t start, cudaEvent_t stop,
                            const int *devicePartials) {
    int partials[BLOCK_NUMBER] = { 0 };

    // Launch-configuration errors are only visible through cudaGetLastError.
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        std::cout << "Kernel launch failed: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cout << "Tutaj panie5 " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaStatus = cudaMemcpy(partials, devicePartials, BLOCK_NUMBER * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        std::cout << "Tutaj panie6 " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    // Always wait on the stop event that belongs to this measurement.
    cudaEventSynchronize(stop);
    float millis = 0;
    cudaEventElapsedTime(&millis, start, stop);

    // Second reduction stage: add the per-block partial sums on the host.
    int total = 0;
    for (int i = 0; i < BLOCK_NUMBER; i++) {
        total += partials[i];
    }

    std::cout << title << std::endl;
    std::cout << std::fixed << std::setprecision(7) << "\t Execution time: \t" << millis << resultLabel << total << std::endl;
}

/**
 * Reads up to ARRAY_SIZE integers from "Text.txt", runs the seven reduction
 * kernels on them (BLOCK_NUMBER blocks of THREAD_NUMBER threads each), prints
 * each kernel's time and result, then prints a host-side reference sum and
 * waits for a key press.
 *
 * @return 0 in all cases; CUDA errors are printed but do not abort the run
 */
int main() {
    const int kernelCount = 7;

    int a[ARRAY_SIZE] = { 0 };
    std::ifstream infile("Text.txt");
    if (!infile) {
        std::cerr << "Warning: could not open Text.txt, input is all zeros" << std::endl;
    }
    int singleIn;
    int count = 0;
    // Stop at ARRAY_SIZE values: a longer file must not overflow the array.
    while (count < ARRAY_SIZE && (infile >> singleIn)) {
        a[count] = singleIn;
        count++;
    }

    int *device_a = 0;
    int *device_c[kernelCount] = { 0 };  // one partial-sum buffer per kernel
    cudaEvent_t start[kernelCount];
    cudaEvent_t stop[kernelCount];
    for (int k = 0; k < kernelCount; k++) {
        cudaEventCreate(&start[k]);
        cudaEventCreate(&stop[k]);
    }

    cudaError_t cudaStatus;
    cudaStatus = cudaMalloc((void**)&device_a, ARRAY_SIZE * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        std::cout << "Tutaj panie1 " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    for (int k = 0; k < kernelCount; k++) {
        cudaStatus = cudaMalloc((void**)&device_c[k], BLOCK_NUMBER * sizeof(int));
        if (cudaStatus != cudaSuccess) {
            std::cout << "Tutaj panie2 " << cudaGetErrorString(cudaStatus) << std::endl;
        }
    }
    cudaStatus = cudaMemcpy(device_a, a, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        std::cout << "Tutaj panie3 " << cudaGetErrorString(cudaStatus) << std::endl;
    }

    cudaEventRecord(start[0]);
    kernel1<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c[0]);
    cudaEventRecord(stop[0]);
    reportReduction("Kernel 1", "\t Result: \t", start[0], stop[0], device_c[0]);

    cudaEventRecord(start[1]);
    kernel2<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c[1]);
    cudaEventRecord(stop[1]);
    reportReduction("Kernel 2", "\t Result:\t", start[1], stop[1], device_c[1]);

    cudaEventRecord(start[2]);
    kernel3<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c[2]);
    cudaEventRecord(stop[2]);
    reportReduction("Kernel 3", "\t Result: \t", start[2], stop[2], device_c[2]);

    cudaEventRecord(start[3]);
    kernel4<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c[3]);
    cudaEventRecord(stop[3]);
    reportReduction("Kernel 4", "\t Result: \t", start[3], stop[3], device_c[3]);

    cudaEventRecord(start[4]);
    kernel5<<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c[4]);
    cudaEventRecord(stop[4]);
    reportReduction("Kernel5", "\t Result: \t", start[4], stop[4], device_c[4]);

    cudaEventRecord(start[5]);
    kernel6<THREAD_NUMBER><<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c[5]);
    cudaEventRecord(stop[5]);
    reportReduction("Kernel 6", "\t Result: \t", start[5], stop[5], device_c[5]);

    cudaEventRecord(start[6]);
    kernel7<THREAD_NUMBER><<<BLOCK_NUMBER, THREAD_NUMBER>>>(device_a, device_c[6], ARRAY_SIZE);
    cudaEventRecord(stop[6]);
    reportReduction("Kernel 7", "\t Result: \t", start[6], stop[6], device_c[6]);

    // Reference value computed on the CPU for comparison with the kernels.
    int hostReductionResult = 0;
    for (int i = 0; i < ARRAY_SIZE; i++) {
        hostReductionResult += a[i];
    }
    std::cout << "Host reduction result: " << hostReductionResult << std::endl;

    // Release device memory and events before exiting.
    for (int k = 0; k < kernelCount; k++) {
        cudaFree(device_c[k]);
        cudaEventDestroy(start[k]);
        cudaEventDestroy(stop[k]);
    }
    cudaFree(device_a);

    getch();
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement