#include "com2039_lab5_reduce_gpu.h"

int main(void) {

    // Allocate unified memory for the input array, the per-block partial
    // sums, and the final result.
    float *arr, *intermediate, *final;
    cudaMallocManaged(&arr, N * sizeof(float));
    cudaMallocManaged(&intermediate, (N / BLOCK_SIZE) * sizeof(float));
    cudaMallocManaged(&final, sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Initialise the array: every element is 1.0f, so the sum should equal N
    for (int i = 0; i < N; i++) {
        arr[i] = 1.0f;
    }

    int grid_size = N / BLOCK_SIZE; // assumes N is an exact multiple of BLOCK_SIZE
    printf("Grid Size is: %d\n", grid_size);
    printf("Block Size is: %d\n", BLOCK_SIZE);

    cudaEventRecord(start); // Start timing

    // First pass: each block reduces BLOCK_SIZE elements to one partial sum
    reduceKernel<<<grid_size, BLOCK_SIZE>>>(intermediate, arr);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Second pass: a single block reduces the partial sums to the final value.
    // This only works while grid_size is a power of two no larger than BLOCK_SIZE.
    reduceKernel<<<1, grid_size>>>(final, intermediate);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    cudaEventRecord(stop); // Stop timing

  37.     //printf("Input Array: \n");
  38.     //for (int n = 0; n < N; n++) {
  39.         //printf("%f ", arr[n]);
  40.     //}
  41.  
  42.     //printf("\n");
  43.  
  44.     //printf("Intermediate Array: \n");
  45.     //for (int n = 0; n < grid_size; n++) {
  46.         //printf("%f ", intermediate[n]);
  47.     //}
  48.  
  49.     //printf("\n");
  50.  
  51.     printf("And the final reduction is: %f\n", final[0]);
  52.  
  53.     // Calculate the difference in times of the two events.
  54.     cudaEventSynchronize(stop);
  55.     float milliseconds = 0;
  56.     cudaEventElapsedTime(&milliseconds, start, stop);
  57.     printf("Elapsed time was: %f milliseconds.\n", milliseconds);
  58.  
  59.     cudaFree(arr);
  60.     cudaFree(intermediate);
  61.     cudaFree(final);
  62. }

__global__ void reduceKernel(float* out, float* in) {
    int myId = threadIdx.x + blockDim.x * blockIdx.x; // ID relative to the whole array
    int tid = threadIdx.x;                            // local ID within the current block

    // Stage this block's slice of the input in shared memory
    __shared__ float temp[BLOCK_SIZE];
    temp[tid] = in[myId];
    __syncthreads(); // make sure the entire block is loaded!

    // Do the reduction in shared memory, halving the number of active
    // threads at each step (requires blockDim.x to be a power of two)
    for (unsigned int s = blockDim.x / 2; s >= 1; s >>= 1) {
        if (tid < s) {
            temp[tid] += temp[tid + s];
        }
        __syncthreads(); // make sure all adds at this stage are done!
    }

    // Only thread 0 writes this block's result back to global memory
    if (tid == 0) {
        out[blockIdx.x] = temp[0];
    }
}
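
The lab header com2039_lab5_reduce_gpu.h is not part of the paste, so its exact contents are unknown. The program relies on it for N, BLOCK_SIZE and the reduceKernel prototype, with N a power-of-two multiple of BLOCK_SIZE and N / BLOCK_SIZE no larger than BLOCK_SIZE. A minimal sketch of what such a header could look like, with purely illustrative values, is:

// com2039_lab5_reduce_gpu.h -- hypothetical sketch; the real lab header is
// not shown in the paste. N and BLOCK_SIZE are illustrative values chosen so
// that N is a power-of-two multiple of BLOCK_SIZE and N / BLOCK_SIZE <= BLOCK_SIZE.
#ifndef COM2039_LAB5_REDUCE_GPU_H
#define COM2039_LAB5_REDUCE_GPU_H

#include <cstdio>
#include <cuda_runtime.h>

#define N (1 << 20)       // 1,048,576 input elements (illustrative)
#define BLOCK_SIZE 1024   // threads per block (illustrative)

__global__ void reduceKernel(float* out, float* in);

#endif

Assuming the source above lives in a file such as com2039_lab5_reduce_gpu.cu (the filename is a guess), it would typically be built with nvcc, e.g. nvcc com2039_lab5_reduce_gpu.cu -o reduce.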