// task_5_reverse

#include <stdio.h>
#include <stdlib.h>

// Number of ints reversed by the kernels below; also sizes the static
// shared-memory array in staticReverse and the host buffers in main.
// 12288 * sizeof(int) is 48 KiB when int is 4 bytes, i.e. the per-block
// shared-memory budget that is available without an explicit opt-in.
const int n = 12288;

// Aborts with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Reverses the first `n` (file-scope constant) ints of `d` in place, staging
 * them through a statically sized shared-memory array.
 *
 * Launch requirements:
 *   - exactly ONE 1D block (every block would reverse the same data);
 *   - any block size: each thread walks the array with a stride of
 *     blockDim.x, so the block does not need one thread per element;
 *   - n * sizeof(int) bytes of static shared memory.
 *
 * d: device pointer to at least n ints.
 */
__global__ void staticReverse(int *d)
{
    __shared__ int shmem[n];

    // Stage the whole array in shared memory.
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        shmem[i] = d[i];
    }

    // Every thread reaches this barrier (no early return), so all loads are
    // visible before any thread starts reading mirrored positions.
    __syncthreads();

    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        d[i] = shmem[n - i - 1];
    }
}

/*
 * Reverses the first `n` ints of `d` in place, staging them through
 * dynamically sized shared memory.
 *
 * Launch requirements:
 *   - exactly ONE 1D block (every block would reverse the same data);
 *   - any block size: threads stride over the array by blockDim.x;
 *   - third launch parameter must be at least n * sizeof(int) bytes.
 *
 * d: device pointer to at least n ints.
 * n: element count (shadows the file-scope constant); n <= 0 is a no-op.
 */
__global__ void dynamicReverse(int *d, int n)
{
    extern __shared__ int shmem[];

    // Stage the whole array in shared memory.
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        shmem[i] = d[i];
    }

    // Reached by all threads of the block; separates the load phase from the
    // mirrored write-back phase.
    __syncthreads();

    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        d[i] = shmem[n - i - 1];
    }
}

// Prints one line per element where got[i] != want[i]; returns the mismatch count.
static int reportMismatches(const int *got, const int *want, int count)
{
    int mismatches = 0;
    for (int i = 0; i < count; i++) {
        if (got[i] != want[i]) {
            printf("Error: d[%d]!=r[%d] (%d, %d)\n", i, i, got[i], want[i]);
            mismatches++;
        }
    }
    return mismatches;
}

/*
 * Runs both reversal kernels on the sequence 0..n-1 and compares each result
 * against the expected reversed sequence.
 *
 * Returns EXIT_SUCCESS when both kernels produce the expected output and
 * EXIT_FAILURE on any mismatch, allocation failure or CUDA error.
 */
int main(void)
{
  const size_t bytes = n * sizeof(int);

  // Heap buffers: three 48 KiB arrays are kept off the stack.
  int *a = static_cast<int *>(malloc(bytes));  // input
  int *r = static_cast<int *>(malloc(bytes));  // expected result
  int *d = static_cast<int *>(malloc(bytes));  // result copied back from device
  if (a == NULL || r == NULL || d == NULL) {
    fprintf(stderr, "Host allocation failed\n");
    free(a);
    free(r);
    free(d);
    return EXIT_FAILURE;
  }

  for (int i = 0; i < n; i++) {
    a[i] = i;
    r[i] = n-i-1;
    d[i] = 0;
  }

  // A block cannot hold n threads, so launch as many as the device allows
  // and let the kernels' block-stride loops cover the remaining elements.
  int device = 0;
  int maxThreads = 0;
  CUDA_CHECK(cudaGetDevice(&device));
  CUDA_CHECK(cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, device));
  const int threads = (n < maxThreads) ? n : maxThreads;

  int *d_d = NULL;
  CUDA_CHECK(cudaMalloc(&d_d, bytes));

  int mismatches = 0;

  // run version with static shared memory
  CUDA_CHECK(cudaMemcpy(d_d, a, bytes, cudaMemcpyHostToDevice));
  staticReverse<<<1, threads>>>(d_d);
  CUDA_CHECK(cudaGetLastError());  // launch-configuration errors
  // The blocking copy also surfaces any error raised while the kernel ran.
  CUDA_CHECK(cudaMemcpy(d, d_d, bytes, cudaMemcpyDeviceToHost));
  mismatches += reportMismatches(d, r, n);

  // run dynamic shared memory version
  CUDA_CHECK(cudaMemcpy(d_d, a, bytes, cudaMemcpyHostToDevice));
  dynamicReverse<<<1, threads, bytes>>>(d_d, n);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpy(d, d_d, bytes, cudaMemcpyDeviceToHost));
  mismatches += reportMismatches(d, r, n);

  CUDA_CHECK(cudaFree(d_d));
  free(a);
  free(r);
  free(d);

  return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}