Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /**
- * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
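/**
 * Sum reduction: C[b] = partial sum of A computed by block b.
 *
 * This sample is derived from the basic vectorAdd sample (Chapter 2 of the
 * programming guide, with error checking added). Instead of an element by
 * element addition, it sums all elements of a single input vector A on the
 * GPU with a shared-memory tree reduction and compares the result against
 * a sum computed on the host.
 */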
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <device_functions.h>
- #define PARAM 1
/**
 * CUDA Kernel Device code
 *
 * Block-level sum reduction of the first numElements entries of A.
 * Despite its (historical) name, this kernel does not add two vectors.
 *
 * Each thread first accumulates a private sum over the elements it owns,
 * walking A with a stride of gridDim.x * blockDim.x so that any grid size
 * covers every element exactly once. The per-thread sums are then combined
 * in shared memory and thread 0 stores the block's total in C[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid and 1D block, with blockDim.x == BLOCK_SIZE (the shared
 *     buffer holds exactly BLOCK_SIZE floats). BLOCK_SIZE does not need to
 *     be a power of two.
 *   - C must have room for gridDim.x floats.
 *
 * @param A            device pointer to the input values
 * @param C            device pointer receiving one partial sum per block
 * @param numElements  number of valid entries in A; values <= 0 yield 0 sums
 */
template <int BLOCK_SIZE>
__global__ void
vectorAdd(float *A, float *C, int numElements)
{
    __shared__ float As[BLOCK_SIZE];

    // Clamp once so the signed count is never compared against an unsigned index.
    const size_t count = numElements > 0 ? (size_t)numElements : 0;
    // stride = number of blocks * block size
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    float threadSum = 0.0f;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += stride)
    {
        threadSum += A[idx];
    }

    As[threadIdx.x] = threadSum;
    __syncthreads();

    // Tree reduction over the first `active` slots. In every step the slots
    // that are written ([0, active - half)) never overlap the slots that are
    // read ([half, active)), so no thread observes a half-updated value.
    // Rounding `half` up keeps the middle element when `active` is odd.
    for (unsigned int active = blockDim.x; active > 1;)
    {
        const unsigned int half = (active + 1) / 2;

        if (threadIdx.x + half < active)
        {
            As[threadIdx.x] += As[threadIdx.x + half];
        }

        // Outside the branch: every thread of the block must reach the barrier.
        __syncthreads();
        active = half;
    }

    if (threadIdx.x == 0)
    {
        C[blockIdx.x] = As[0];
    }
}
/**
 * Terminates the program with the sample's standard diagnostic if a CUDA
 * runtime call failed.
 *
 * @param err     status returned by the CUDA runtime call
 * @param action  description of the attempted operation, inserted into
 *                "Failed to <action> (error code ...)!"
 */
static void
exitOnCudaError(cudaError_t err, const char *action)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to %s (error code %s)!\n", action, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host main routine
 *
 * Fills a host vector with random values, sums it on the GPU with the
 * vectorAdd reduction kernel (one partial sum per block), adds up the
 * partial sums on the host and compares the total with a host-side sum.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE on any
 *         allocation, CUDA or verification error.
 */
int
main(void)
{
    const int threadsPerBlock = 256;

    // Print the vector length to be used, and compute its size
    int numElements = 500; //TODO
    size_t size = numElements * sizeof(float);
    printf("[Addition of %d elements]\n", numElements);

    // Each block consumes up to threadsPerBlock * (PARAM + 1) input elements,
    // so the grid size is the ceiling of numElements over that amount.
    const int elementsPerBlock = threadsPerBlock * (PARAM + 1);
    const int blocksPerGrid = (numElements + elementsPerBlock - 1) / elementsPerBlock;

    // The kernel writes exactly one float per block, so the output buffers
    // are sized from the block count (not from a truncating byte division).
    const size_t partialsSize = blocksPerGrid * sizeof(float);

    // Allocate the host input vector A
    float *h_A = (float *)malloc(size);

    // Allocate the host output vector C (one partial sum per block)
    float *h_C = (float *)malloc(partialsSize);

    // Verify that allocations succeeded
    if (h_A == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vector and compute the reference sum
    float sum = 0;
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        sum += h_A[i];
    }

    // Allocate the device input vector A
    float *d_A = NULL;
    exitOnCudaError(cudaMalloc((void **)&d_A, size), "allocate device vector A");

    // Allocate the device output vector C
    float *d_C = NULL;
    exitOnCudaError(cudaMalloc((void **)&d_C, partialsSize), "allocate device vector C");

    // Copy the host input vector A in host memory to the device input vector
    // in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    exitOnCudaError(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                    "copy vector A from host to device");

    // Launch the reduction kernel
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<threadsPerBlock><<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);
    exitOnCudaError(cudaGetLastError(), "launch vectorAdd kernel");

    // Copy the per-block partial sums in device memory to the host result
    // vector in host memory. This blocking copy also waits for the kernel.
    printf("Copy output data from the CUDA device to the host memory\n");
    exitOnCudaError(cudaMemcpy(h_C, d_C, partialsSize, cudaMemcpyDeviceToHost),
                    "copy vector C from device to host");

    // Combine the partial sums of all blocks into the final GPU result
    float deviceSum = 0;
    for (int block = 0; block < blocksPerGrid; ++block)
    {
        deviceSum += h_C[block];
    }

    // fabsf keeps the comparison in floating point; an integer abs() would
    // truncate any error below 1.0 to zero and hide it.
    if (fabsf(sum - deviceSum) > 0.01f)
    {
        fprintf(stderr, "Result verification failed!\nh_C:%f\nsum:%f\n", deviceSum, sum);
        exit(EXIT_FAILURE);
    }

    printf("h_C:%f\nsum:%f\nTest PASSED\n", deviceSum, sum);

    // Free device global memory
    exitOnCudaError(cudaFree(d_A), "free device vector A");
    exitOnCudaError(cudaFree(d_C), "free device vector C");

    // Free host memory
    free(h_A);
    free(h_C);

    printf("Done\n");
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement