SHARE
TWEET

Untitled




Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include <stdio.h>
- #include <ctime>
- #include <iostream>
- #include <cmath>
- #include <algorithm>
- #include <cooperative_groups.h>
- #include <assert.h>
- #include <memory>
- #pragma once
- #ifdef __INTELLISENSE__
- void __syncthreads();
- #endif
- namespace cg = cooperative_groups;
- cudaError_t addWithCuda(int *c, const int *a, unsigned int size);
// Per-block sum reduction.
//
// Each block adds up the blockDim.x consecutive ints of g_idata that start at
// blockIdx.x * blockDim.x and stores the total in g_odata[blockIdx.x].
//
// Launch requirements:
//   - 1D grid and 1D block; any block size works (a power of two is not required).
//   - Dynamic shared memory (third launch argument) of blockDim.x * sizeof(int) bytes.
//   - g_idata must hold at least gridDim.x * blockDim.x elements: there is no
//     bounds check because the element count is not a kernel parameter.
//   - g_odata must hold at least gridDim.x elements.
//
// g_idata is only read; the partial sums live in shared memory.
__global__ void reduction(int *g_idata, int *g_odata) {
    // Block-local scratch buffer, typed int to match the data being summed.
    extern __shared__ int sdata[];

    const unsigned int tid = threadIdx.x;
    const size_t myId = static_cast<size_t>(blockIdx.x) * blockDim.x + tid;

    // Stage this block's slice of the input in shared memory.
    sdata[tid] = g_idata[myId];
    __syncthreads();  // the whole slice must be loaded before anyone reads it

    // Fold the upper part of the active range onto the lower part until one
    // value is left. Rounding `half` up keeps the middle element of an
    // odd-sized range in place instead of dropping it.
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            sdata[tid] += sdata[tid + half];
        }
        // Reached by every thread of the block: the branch above is closed.
        __syncthreads();
        active = half;
    }

    // Thread 0 publishes the block's total.
    if (tid == 0) {
        g_odata[blockIdx.x] = sdata[0];
    }
}
// Sets the first `length` entries of `arr` to `arrayval`.
// Nothing is written when `length` is zero or negative.
void remplir(int *arr, int length, int arrayval)
{
    std::fill_n(arr, length, arrayval);
}
// Prints the first `length` entries of `arr` to stdout, one per line.
// Each value is followed by a single space and then the newline.
void afficherTab(int *arr, int length)
{
    const int *cursor = arr;
    for (int remaining = length; remaining > 0; --remaining, ++cursor) {
        printf("%d \n", *cursor);
    }
}
// Entry point: fills a 128 Ki-element array with ones, has addWithCuda reduce
// it block by block on the GPU, and prints the per-block partial sums.
//
// Returns 0 on success and 1 if the GPU work or the device reset fails.
int main()
{
    const int arraySize = 128 * 1024;
    // Block size used by addWithCuda for the kernel launch; it determines how
    // many partial sums come back. Keep the two values in sync.
    const int threadsPerBlock = 1024;
    const int blockCount = (arraySize + threadsPerBlock - 1) / threadsPerBlock;

    // Static storage: two 512 KiB arrays are too large to place on the stack safely.
    static int a[arraySize];
    static int c[arraySize] = { 0 };
    remplir(a, arraySize, 1);

    // Reduce the array on the GPU; c receives one partial sum per block.
    cudaError_t cudaStatus = addWithCuda(c, a, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    // Print only the entries the kernel produced: one per launched block.
    afficherTab(c, blockCount);
    system("PAUSE");

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
// Helper that sums the input array block by block on the GPU.
//
// Parameters:
//   c    - host output buffer; receives one partial sum per launched block,
//          i.e. ceil(size / 1024) values starting at c[0]. Entries beyond that
//          are left untouched.
//   a    - host input buffer holding `size` ints.
//   size - number of elements in `a`. Any value is accepted: the device copy
//          of the input is zero-padded up to a whole number of blocks, so a
//          partial last block still yields the correct sum.
//
// Returns cudaSuccess on success (including size == 0, where nothing is done),
// otherwise the status of the first CUDA call that failed. The kernel time
// measured with CUDA events is printed to stdout.
cudaError_t addWithCuda(int *c, const int *a, unsigned int size)
{
    // All locals are declared before the first `goto Error` so that no jump
    // skips over an initialization.
    const unsigned int threads = 1024;
    // Ceiling division written so it cannot overflow for large `size`.
    const unsigned int nbreblock = size / threads + (size % threads != 0 ? 1u : 0u);
    const size_t inputBytes = static_cast<size_t>(size) * sizeof(int);
    const size_t paddedBytes = static_cast<size_t>(nbreblock) * threads * sizeof(int);
    const size_t outputBytes = static_cast<size_t>(nbreblock) * sizeof(int);

    int *dev_a = 0;
    int *dev_c = 0;
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    float milliseconds = 0;
    cudaError_t cudaStatus = cudaSuccess;

    // Nothing to reduce; a zero-block launch would be an invalid configuration.
    if (size == 0) {
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Events used to time the kernel.
    cudaStatus = cudaEventCreate(&start);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!");
        goto Error;
    }
    cudaStatus = cudaEventCreate(&stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventCreate failed!");
        goto Error;
    }

    // Allocate GPU buffers: one partial sum per block for the output, and the
    // input rounded up to a whole number of blocks.
    cudaStatus = cudaMalloc((void**)&dev_c, outputBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, paddedBytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Zero the padding behind the real data so the kernel, which has no bounds
    // check, adds zeros for the missing elements of a partial last block.
    if (paddedBytes > inputBytes) {
        cudaStatus = cudaMemset(dev_a + size, 0, paddedBytes - inputBytes);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemset failed!");
            goto Error;
        }
    }

    // Copy the input vector from host memory to the GPU buffer.
    cudaStatus = cudaMemcpy(dev_a, a, inputBytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Start the timer.
    cudaStatus = cudaEventRecord(start);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventRecord failed!");
        goto Error;
    }

    // One thread per (padded) element; shared memory holds one int per thread.
    reduction<<<nbreblock, threads, threads * sizeof(int)>>>(dev_a, dev_c);

    // Check for any errors launching the kernel.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "reduction launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Stop the timer.
    cudaStatus = cudaEventRecord(stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventRecord failed!");
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during execution.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching reduction!\n", cudaStatus);
        goto Error;
    }

    // Copy back only the partial sums the kernel wrote (one per block).
    cudaStatus = cudaMemcpy(c, dev_c, outputBytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Make sure the stop event has completed before reading the elapsed time.
    cudaStatus = cudaEventSynchronize(stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventSynchronize failed!");
        goto Error;
    }
    cudaStatus = cudaEventElapsedTime(&milliseconds, start, stop);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaEventElapsedTime failed!");
        goto Error;
    }
    std::cout << " temp = " << milliseconds << std::endl;

Error:
    // Release everything that was acquired, on success and on failure alike.
    if (start != 0) {
        cudaEventDestroy(start);
    }
    if (stop != 0) {
        cudaEventDestroy(stop);
    }
    cudaFree(dev_c);
    cudaFree(dev_a);
    return cudaStatus;
}
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy.