// Untitled

/**
 * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

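/**
 * Vector sum reduction: C[b] = partial sum of A produced by block b.
 *
 * This sample sums the elements of a single vector on the GPU. Each thread
 * block reduces its share of the input in shared memory and writes one
 * partial sum to the output vector; the host then checks the GPU result
 * against a sum computed on the CPU. The host scaffolding (allocation,
 * copies, error checking) follows the basic vector addition sample
 * illustrating Chapter 2 of the programming guide.
 */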

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <helper_cuda.h>
#include <helper_functions.h>
#include <device_functions.h>

// PARAM + 1 is the number of input elements the host launch configuration
// assigns to each thread (the grid is sized as
// ceil(numElements / (threadsPerBlock * (PARAM + 1)))).
#define PARAM 1
/**
 * CUDA Kernel Device code
 *
 * Block-wise sum reduction. Every thread first accumulates the elements of A
 * it owns (starting at its global index and advancing by the total number of
 * launched threads), then the block combines the per-thread values in shared
 * memory and thread 0 stores the block's partial sum in C[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid and 1D blocks;
 *   - blockDim.x must not exceed BLOCK_SIZE (the static shared buffer holds
 *     BLOCK_SIZE floats); any block size up to that limit is supported,
 *     including sizes that are not a power of two;
 *   - C must provide room for gridDim.x floats.
 *
 * @tparam BLOCK_SIZE   capacity of the shared-memory scratch buffer
 * @param A             device pointer to the input vector
 * @param C             device pointer receiving one partial sum per block
 * @param numElements   number of elements in A; values <= 0 yield zero sums
 */
template <int BLOCK_SIZE>
__global__ void
vectorAdd(float *A, float *C, int numElements)
{
	const unsigned int tid = threadIdx.x;

	// Stride = number of blocks * block size, taken from the actual launch so
	// that every element is visited exactly once for any grid size.
	const size_t stride = (size_t)gridDim.x * blockDim.x;

	// Clamp a negative length to zero instead of letting it wrap to a huge
	// unsigned bound.
	const size_t count = numElements > 0 ? (size_t)numElements : 0;

	__shared__ float As[BLOCK_SIZE];

	// Accumulate in a register; shared memory is written once per thread.
	float partial = 0.0f;
	for (size_t idx = (size_t)blockIdx.x * blockDim.x + tid; idx < count; idx += stride)
		partial += A[idx];
	As[tid] = partial;
	__syncthreads();

	// Tree reduction over the first `active` slots. In each step the slots
	// written, [0, active - half), and the slots read, [half, active), do not
	// overlap, so no thread reads a value another thread is updating.
	unsigned int active = blockDim.x;
	while (active > 1) {
		const unsigned int half = (active + 1) / 2; // rounds up for odd counts
		if (tid + half < active)
			As[tid] += As[tid + half];
		// `active` is uniform across the block, so every thread reaches
		// this barrier the same number of times.
		__syncthreads();
		active = half;
	}

	if (tid == 0)
		C[blockIdx.x] = As[0];
}


/**
 * Terminate the program with a diagnostic if a CUDA runtime call failed.
 *
 * @param err   status returned by the CUDA runtime call
 * @param what  description of the attempted action; it is inserted into the
 *              message "Failed to <what> (error code <reason>)!"
 */
static void
exitOnCudaError(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to %s (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host main routine
 *
 * Fills a host vector with random values, sums it on the CPU, runs the
 * reduction kernel on the GPU, adds up the per-block partial sums and
 * compares both totals with an absolute tolerance of 0.01.
 *
 * @return 0 on success; the process exits with EXIT_FAILURE on any
 *         allocation, CUDA or verification failure.
 */
int
main(void)
{
    const int threadsPerBlock = 256;

    // Print the vector length to be used, and compute its size
    int numElements = 500; //TODO
    size_t size = numElements * sizeof(float);
    printf("[Addition of %d elements]\n", numElements);

    // Each block consumes threadsPerBlock * (PARAM + 1) input elements, so
    // the number of blocks (and of partial sums) is the ceiling of the ratio.
    const int elementsPerBlock = threadsPerBlock * (PARAM + 1);
    const int blocksPerGrid = (numElements + elementsPerBlock - 1) / elementsPerBlock;

    // One float per block. The previous byte-count division rounded down and
    // produced a buffer smaller than a single float.
    const size_t partialSize = blocksPerGrid * sizeof(float);

    // Allocate the host input vector A and the host output vector C
    float *h_A = (float *)malloc(size);
    float *h_C = (float *)malloc(partialSize);

    // Verify that allocations succeeded
    if (h_A == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vector and compute the reference sum
    float sum = 0;
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = rand()/(float)RAND_MAX;
        sum += h_A[i];
    }

    // Allocate the device input vector A
    float *d_A = NULL;
    exitOnCudaError(cudaMalloc((void **)&d_A, size), "allocate device vector A");

    // Allocate the device output vector C
    float *d_C = NULL;
    exitOnCudaError(cudaMalloc((void **)&d_C, partialSize), "allocate device vector C");

    // Copy the host input vector A in host memory to the device input vector
    // in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    exitOnCudaError(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
                    "copy vector A from host to device");

    // Launch the Vector Add CUDA Kernel
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<threadsPerBlock><<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, numElements);

    // cudaGetLastError reports launch-configuration problems; the explicit
    // synchronize reports faults raised while the kernel was running.
    exitOnCudaError(cudaGetLastError(), "launch vectorAdd kernel");
    exitOnCudaError(cudaDeviceSynchronize(), "execute vectorAdd kernel");

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    exitOnCudaError(cudaMemcpy(h_C, d_C, partialSize, cudaMemcpyDeviceToHost),
                    "copy vector C from device to host");

    // Combine the per-block partial sums; with a single block this is h_C[0].
    float gpuSum = 0.0f;
    for (int b = 0; b < blocksPerGrid; ++b)
    {
        gpuSum += h_C[b];
    }

    // fabsf keeps the comparison in floating point regardless of which abs()
    // overloads the included headers expose.
    if (fabsf(sum - gpuSum) > 0.01f)
    {
        fprintf(stderr, "Result verification failed!\nh_C:%f\nsum:%f\n", gpuSum, sum);
        exit(EXIT_FAILURE);
    }
    printf("h_C:%f\nsum:%f\nTest PASSED\n", gpuSum, sum);

    // Free device global memory
    exitOnCudaError(cudaFree(d_A), "free device vector A");
    exitOnCudaError(cudaFree(d_C), "free device vector C");

    // Free host memory
    free(h_A);
    free(h_C);

    printf("Done\n");
    return 0;
}