Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include <cstdio>
#include <cstdlib>

#include "com2039_lab5_reduce_gpu.h"
// Aborts the program with a readable message when a CUDA runtime call fails.
// `what` names the operation so the failing step is identifiable in the output.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills an N-element array with 1.0f and sums it on the GPU with two passes
 * of reduceKernel:
 *   pass 1: grid_size blocks of BLOCK_SIZE threads -> one partial sum per block
 *   pass 2: a single block of grid_size threads    -> the final sum
 * Prints the launch configuration, the final sum and the elapsed GPU time.
 *
 * Returns 0 on success; exits with a failure status if the configuration
 * cannot be handled by this two-pass scheme or if any CUDA call fails.
 */
int main(void) {
    const int n = N;
    const int block_size = BLOCK_SIZE;

    if (block_size <= 0 || n <= 0 || n % block_size != 0) {
        // reduceKernel reads one element per launched thread without a bounds
        // guard, so the launch must cover the array exactly.
        fprintf(stderr, "N (%d) must be a positive multiple of BLOCK_SIZE (%d)\n",
                n, block_size);
        return EXIT_FAILURE;
    }

    const int grid_size = n / block_size;
    if (grid_size > block_size) {
        // The second pass runs as ONE block of grid_size threads, and the
        // kernel's shared buffer only holds BLOCK_SIZE floats.
        fprintf(stderr,
                "N / BLOCK_SIZE (%d) must not exceed BLOCK_SIZE (%d) for the "
                "two-pass reduction\n",
                grid_size, block_size);
        return EXIT_FAILURE;
    }

    // Allocate Unified memory: input, per-block partial sums, final result.
    float *arr = NULL, *intermediate = NULL, *total = NULL;
    checkCuda(cudaMallocManaged(&arr, n * sizeof(float)), "allocate input");
    checkCuda(cudaMallocManaged(&intermediate, grid_size * sizeof(float)),
              "allocate intermediate");
    checkCuda(cudaMallocManaged(&total, sizeof(float)), "allocate result");

    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "create start event");
    checkCuda(cudaEventCreate(&stop), "create stop event");

    // Initialising the array
    for (int i = 0; i < n; i++) {
        arr[i] = 1.0f;
    }

    printf("Grid Size is: %d\n", grid_size);
    printf("Block Size is: %d\n", BLOCK_SIZE);

    checkCuda(cudaEventRecord(start), "record start event");  // Start timing

    // Both launches go to the same (default) stream, so the second pass
    // cannot start before the first has finished; no host-side sync is
    // needed between them, which keeps host latency out of the timing.
    reduceKernel<<<grid_size, block_size>>>(intermediate, arr);
    checkCuda(cudaGetLastError(), "launch first reduction pass");

    reduceKernel<<<1, grid_size>>>(total, intermediate);
    checkCuda(cudaGetLastError(), "launch second reduction pass");

    checkCuda(cudaEventRecord(stop), "record stop event");  // Stop timing

    // Wait for GPU to finish before accessing managed memory on the host;
    // this is also where asynchronous kernel failures are reported.
    checkCuda(cudaDeviceSynchronize(), "run reduction kernels");

    // Debug dumps (enable when needed):
    // printf("Input Array: \n");
    // for (int k = 0; k < n; k++) printf("%f ", arr[k]);
    // printf("\n");
    // printf("Intermediate Array: \n");
    // for (int k = 0; k < grid_size; k++) printf("%f ", intermediate[k]);
    // printf("\n");

    printf("And the final reduction is: %f\n", total[0]);

    // Calculate the difference in times of the two events.
    checkCuda(cudaEventSynchronize(stop), "wait for stop event");
    float milliseconds = 0;
    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop),
              "compute elapsed time");
    printf("Elapsed time was: %f milliseconds.\n", milliseconds);

    // Release events and buffers.
    checkCuda(cudaEventDestroy(start), "destroy start event");
    checkCuda(cudaEventDestroy(stop), "destroy stop event");
    checkCuda(cudaFree(arr), "free input");
    checkCuda(cudaFree(intermediate), "free intermediate");
    checkCuda(cudaFree(total), "free result");

    return 0;
}
/*
 * Per-block sum reduction in shared memory.
 *
 * Each thread loads one element of `in`; each block writes the sum of its
 * elements to out[blockIdx.x].
 *
 * Launch requirements (1D grid, 1D blocks):
 *   - blockDim.x <= BLOCK_SIZE (size of the static shared buffer);
 *   - `in` holds at least gridDim.x * blockDim.x floats (there is no bounds
 *     guard on the load);
 *   - `out` holds at least gridDim.x floats.
 * blockDim.x does not have to be a power of two.
 */
__global__ void reduceKernel(float* out, float* in) {
    // ID relative to whole array
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    // Local ID within the current block
    unsigned int tid = threadIdx.x;

    __shared__ float temp[BLOCK_SIZE];
    temp[tid] = in[myId];
    // Make sure entire block is loaded!
    __syncthreads();

    // Do the reduction in shared memory. `active` is the number of live
    // partial sums. Each step folds the upper part onto the lower part using
    // a ceil-half offset, so with an odd count the middle element is simply
    // carried over to the next step instead of being dropped. For
    // power-of-two sizes this performs exactly the classic halving sequence.
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (tid < active / 2) {
            temp[tid] += temp[tid + half];
        }
        // Reached by every thread of the block: make sure all adds at one
        // stage are done before the next stage reads them.
        __syncthreads();
        active = half;
    }

    // only thread 0 writes result for this block back to global memory
    if (tid == 0) {
        out[blockIdx.x] = temp[0];
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement