Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
/**
 * CUDA Kernel Device code
 *
 * Block-level sum reduction: each thread block adds up the elements of A that
 * its threads cover and stores that partial sum in B[blockIdx.x]. The caller
 * is responsible for combining the per-block partial sums.
 *
 * Launch requirements:
 *   - 1D grid of 1D blocks.
 *   - Dynamic shared memory of blockDim.x * sizeof(int) bytes must be passed
 *     as the third launch-configuration argument (S is an extern array).
 *   - Any block size is accepted; it does not have to be a power of two.
 *
 * @param A            device pointer to numElements input values
 * @param B            device pointer with one slot per block that covers at
 *                     least one element of A
 * @param numElements  number of valid elements in A
 *
 * Partial sums are accumulated in int, so large inputs can overflow.
 */
__global__ void vectorSum(int *A, int *B, int numElements) {
    extern __shared__ int S[];

    const unsigned int tid = threadIdx.x;
    // Wide index: blockDim.x * blockIdx.x can exceed INT_MAX at the tail of a
    // grid that covers close to INT_MAX elements.
    const size_t i = (size_t)blockDim.x * blockIdx.x + tid;
    const bool inRange = numElements > 0 && i < (size_t)numElements;

    // Every thread fills its slot; threads past the end contribute 0 so the
    // reduction below never reads uninitialized shared memory.
    S[tid] = inRange ? A[i] : 0;
    __syncthreads();

    // Reduce in shared memory. All barriers are outside data-dependent
    // branches so that every thread of the block reaches them. Each pass
    // folds the upper part of the active range onto the lower part; rounding
    // the half up keeps the middle element when the active count is odd.
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active) {
            S[tid] += S[tid + half];
        }
        __syncthreads();
        active = half;
    }

    // Only blocks that cover at least one input element publish a result
    // (for thread 0, inRange means the block's first index is valid).
    if (tid == 0 && inRange) {
        B[blockIdx.x] = S[0];
    }
}
/**
 * Aborts the program with a descriptive message when a CUDA runtime call
 * did not return cudaSuccess.
 *
 * @param err   status returned by the CUDA runtime call
 * @param what  short description of the attempted action, used in the message
 */
static void checkCuda(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to %s (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host main routine
 *
 * Usage: <program> <numElements>
 *
 * Fills a host vector with 0..numElements-1, reduces it on the device with
 * vectorSum (one partial sum per block), adds the per-block partial sums on
 * the host and prints the total.
 *
 * Exits with EXIT_FAILURE on bad arguments, allocation failure or any CUDA
 * error; returns 0 on success.
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "ERROR: invalid number of arguments\n");
        fprintf(stderr, "Usage: %s <numElements>\n", argc > 0 ? argv[0] : "vectorSum");
        exit(EXIT_FAILURE);
    }

    // Parse with strtol so that garbage, overflow and non-positive counts are
    // rejected instead of producing a bogus size or an empty grid.
    errno = 0;
    char *end = NULL;
    const long parsed = strtol(argv[1], &end, 10);
    if (errno != 0 || end == argv[1] || *end != '\0' || parsed <= 0 || parsed > INT_MAX) {
        fprintf(stderr, "ERROR: number of elements must be a positive integer, got \"%s\"\n", argv[1]);
        exit(EXIT_FAILURE);
    }
    const int numElements = (int)parsed;

    // Launch configuration. The ceil-division is written so that it cannot
    // overflow int when numElements is close to INT_MAX.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (numElements - 1) / threadsPerBlock + 1;

    // Buffer sizes: the input vector, one partial sum per block, and the
    // per-block shared-memory scratch area the kernel reduces in.
    const size_t size = (size_t)numElements * sizeof(int);
    const size_t partialSize = (size_t)blocksPerGrid * sizeof(int);
    const size_t sharedSize = (size_t)threadsPerBlock * sizeof(int);
    printf("[Vector summation of %d elements]\n", numElements);

    // Allocate the host input vector A and the host vector B of partial sums
    int *h_A = (int *)malloc(size);
    int *h_B = (int *)malloc(partialSize);
    if (h_A == NULL || h_B == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vector
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = i;
        printf("%d ", h_A[i]);
    }
    printf("\n");

    // Allocate the device input vector A and the device output vector B
    int *d_A = NULL;
    checkCuda(cudaMalloc((void **)&d_A, size), "allocate device vector A");
    int *d_B = NULL;
    checkCuda(cudaMalloc((void **)&d_B, partialSize), "allocate device vector B");

    // Only A needs to travel to the device; B is written entirely by the kernel.
    printf("Copy input data from the host memory to the CUDA device\n");
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
              "copy vector A from host to device");

    // Launch the Vector Summation CUDA Kernel. The third launch argument is
    // the dynamic shared-memory size backing the kernel's extern array.
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorSum<<<blocksPerGrid, threadsPerBlock, sharedSize>>>(d_A, d_B, numElements);
    checkCuda(cudaGetLastError(), "launch vectorSum kernel");
    // Surface faults raised while the kernel was running at this point rather
    // than at the following copy.
    checkCuda(cudaDeviceSynchronize(), "execute vectorSum kernel");

    // Copy the per-block partial sums back to the host
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCuda(cudaMemcpy(h_B, d_B, partialSize, cudaMemcpyDeviceToHost),
              "copy vector B from device to host");

    // Combine the partial sums. The result is read from the host copy: d_B is
    // a device pointer and must not be dereferenced on the host.
    long long total = 0;
    for (int b = 0; b < blocksPerGrid; ++b) {
        total += h_B[b];
    }
    printf("Summation result: %lld\n", total);

    // Free device global memory
    checkCuda(cudaFree(d_A), "free device vector A");
    checkCuda(cudaFree(d_B), "free device vector B");

    // Free host memory
    free(h_A);
    free(h_B);

    // Reset the device and exit
    cudaError_t err = cudaDeviceReset();
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement