// Untitled

#include <stdio.h>
#include <stdlib.h>
// For the CUDA runtime routines (prefixed with "cuda")
#include <cuda_runtime.h>
/**
 * CUDA Kernel Device code
 *
 * Block-wise sum reduction: every block adds up its slice of A and stores
 * that partial sum in B[blockIdx.x].
 *
 * Launch requirements:
 *  - 1D grid and 1D blocks; thread (blockIdx.x, threadIdx.x) handles element
 *    blockDim.x * blockIdx.x + threadIdx.x of A.
 *  - Dynamic shared memory of at least blockDim.x * sizeof(int) bytes must be
 *    passed as the third launch parameter.
 *  - B must have room for one int per block that covers at least one element.
 *
 * A           input vector (device pointer), numElements ints
 * B           output vector (device pointer), one partial sum per block
 * numElements number of valid entries in A
 */
__global__ void vectorSum(int *A, int *B, int numElements) {
  extern __shared__ int S[];
  int tid = threadIdx.x;
  int i = blockDim.x * blockIdx.x + threadIdx.x;

  // Each thread loads one element from global to shared memory. Threads past
  // the end of A store 0 (the additive identity) so that the reduction below
  // never reads an uninitialised shared slot in a partially filled block.
  S[tid] = (i < numElements) ? A[i] : 0;
  // Barriers are kept outside any per-thread condition: every thread of the
  // block must reach each __syncthreads().
  __syncthreads();

  // Reduce in shared memory. `active` is the number of slots still holding
  // live partial sums; each pass folds the upper part onto the lower part.
  // Rounding the half up keeps the middle slot alive when `active` is odd,
  // so the result is correct for any block size, not only powers of two.
  int active = blockDim.x;
  while (active > 1) {
    int half = (active + 1) >> 1;
    if (tid + half < active) {
      S[tid] += S[tid + half];
    }
    __syncthreads();
    active = half;
  }

  // One thread publishes the block's partial sum. Blocks that lie entirely
  // past the end of A write nothing; the condition is uniform per block.
  if (tid == 0 && blockDim.x * blockIdx.x < numElements) {
    B[blockIdx.x] = S[0];
  }
}
/**
 * Terminate the program with a diagnostic if a CUDA runtime call failed.
 *
 * err    status returned by the CUDA runtime call
 * action what was being attempted; inserted into "Failed to <action> ..."
 */
static void exitOnCudaError(cudaError_t err, const char *action) {
  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to %s (error code %s)!\n", action, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

/**
 * Host main routine
 *
 * Usage: <program> <numElements>
 *
 * Fills a vector with 0 .. numElements-1, lets the vectorSum kernel reduce it
 * to one partial sum per block, adds those partial sums on the host and
 * prints the total. Exits with EXIT_FAILURE on bad arguments, on a failed
 * host allocation or on any CUDA error.
 */
int main(int argc, char *argv[]) {
  if (argc != 2) {
    fprintf(stderr, "ERROR: invalid number of arguments\n");
    exit(EXIT_FAILURE);
  }
  int numElements = atoi(argv[1]);
  // atoi yields 0 for non-numeric input; a non-positive count would lead to
  // a zero-block launch and zero-byte allocations, so reject it up front.
  if (numElements <= 0) {
    fprintf(stderr, "ERROR: number of elements must be a positive integer\n");
    exit(EXIT_FAILURE);
  }

  // Launch geometry. The kernel emits one partial sum per block, so the
  // output buffers are sized by the block count rather than the input length.
  const int threadsPerBlock = 256;
  // Ceil-division written so that it cannot overflow for large counts.
  const int blocksPerGrid = (numElements - 1) / threadsPerBlock + 1;
  const size_t inputBytes = (size_t)numElements * sizeof(int);
  const size_t partialBytes = (size_t)blocksPerGrid * sizeof(int);
  // The kernel's extern __shared__ array needs one int per thread.
  const size_t sharedBytes = (size_t)threadsPerBlock * sizeof(int);

  printf("[Vector summation of %d elements]\n", numElements);

  // Allocate the host input vector A and the host partial-sum vector B
  int *h_A = (int *)malloc(inputBytes);
  int *h_B = (int *)malloc(partialBytes);
  // Verify that allocations succeeded
  if (h_A == NULL || h_B == NULL) {
    fprintf(stderr, "Failed to allocate host vectors!\n");
    exit(EXIT_FAILURE);
  }

  // Initialize the host input vector
  for (int i = 0; i < numElements; ++i) {
    h_A[i] = i;
    printf("%d ", h_A[i]);
  }
  printf("\n");

  // Allocate the device input vector A
  int *d_A = NULL;
  exitOnCudaError(cudaMalloc((void **)&d_A, inputBytes), "allocate device vector A");
  // Allocate the device output vector B
  int *d_B = NULL;
  exitOnCudaError(cudaMalloc((void **)&d_B, partialBytes), "allocate device vector B");

  // Copy the host input vector A to the device. B is output-only, so it is
  // not uploaded (the host copy holds no meaningful data at this point).
  printf("Copy input data from the host memory to the CUDA device\n");
  exitOnCudaError(cudaMemcpy(d_A, h_A, inputBytes, cudaMemcpyHostToDevice),
                  "copy vector A from host to device");

  // Launch the Vector Summation CUDA Kernel
  printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
  vectorSum<<<blocksPerGrid, threadsPerBlock, sharedBytes>>>(d_A, d_B, numElements);
  // Launch-configuration errors show up here ...
  exitOnCudaError(cudaGetLastError(), "launch vectorSum kernel");
  // ... while faults during execution only surface once the kernel finished.
  exitOnCudaError(cudaDeviceSynchronize(), "execute vectorSum kernel");

  // Copy the per-block partial sums from device memory to host memory
  printf("Copy output data from the CUDA device to the host memory\n");
  exitOnCudaError(cudaMemcpy(h_B, d_B, partialBytes, cudaMemcpyDeviceToHost),
                  "copy vector B from device to host");

  // Combine the partial sums on the host. Only the host copy may be
  // dereferenced here; d_B points to device memory. A 64-bit accumulator is
  // used because the total outgrows int long before the input size does.
  long long total = 0;
  for (int b = 0; b < blocksPerGrid; ++b) {
    total += h_B[b];
  }
  // Print the result
  printf("Summation result: %lld\n", total);

  // Free device global memory
  exitOnCudaError(cudaFree(d_A), "free device vector A");
  exitOnCudaError(cudaFree(d_B), "free device vector B");
  // Free host memory
  free(h_A);
  free(h_B);

  // Reset the device and exit
  cudaError_t err = cudaDeviceReset();
  if (err != cudaSuccess) {
    fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
  printf("Done\n");
  return 0;
}