Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- const int n = 12288; // element count; also the length of the static __shared__ int array in staticReverse (n * sizeof(int) = 48 KB assuming 4-byte int)
/*
 * Reverses the first n ints of d in place, staging the data through a
 * statically sized shared-memory buffer.
 *
 * Launch layout : one 1-D block of any size. Threads walk the array with a
 *                 stride of blockDim.x, so the block does not need one thread
 *                 per element (n = 12288 is more threads than a block may
 *                 have on current hardware).
 * Shared memory : n * sizeof(int) bytes, allocated statically.
 * Preconditions : d points to at least n ints of device memory. Launching
 *                 more than one block makes the blocks race on d.
 */
__global__ void staticReverse(int *d)
{
    __shared__ int shmem[n];

    // Stage the whole array in shared memory, blockDim.x elements per pass.
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        shmem[i] = d[i];
    }

    // No thread exits early, so every thread of the block reaches the
    // barrier, and all staged values are visible before any is read back.
    __syncthreads();

    // Write element i from the mirrored position of the staged copy.
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        d[i] = shmem[n - i - 1];
    }
}
/*
 * Reverses the first n ints of d in place, staging the data through
 * dynamically sized shared memory.
 *
 * Launch layout : one 1-D block of any size. Threads walk the array with a
 *                 stride of blockDim.x, so n may exceed the block size.
 * Shared memory : the launch must supply at least n * sizeof(int) bytes as
 *                 the third <<<...>>> argument.
 * Preconditions : d points to at least n ints of device memory. Launching
 *                 more than one block makes the blocks race on d.
 *                 n <= 0 is a no-op.
 */
__global__ void dynamicReverse(int *d, int n)
{
    extern __shared__ int shmem[];

    // Stage the whole array in shared memory, blockDim.x elements per pass.
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        shmem[i] = d[i];
    }

    // No thread exits early, so every thread of the block reaches the
    // barrier, and all staged values are visible before any is read back.
    __syncthreads();

    // Write element i from the mirrored position of the staged copy.
    for (int i = threadIdx.x; i < n; i += blockDim.x) {
        d[i] = shmem[n - i - 1];
    }
}
/* Prints a diagnostic and exits if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Prints every index where got differs from want; returns how many did. */
static int reportMismatches(const int *got, const int *want, int count)
{
    int bad = 0;
    for (int i = 0; i < count; i++) {
        if (got[i] != want[i]) {
            printf("Error: d[%d]!=r[%d] (%d, %d)\n", i, i, got[i], want[i]);
            bad++;
        }
    }
    return bad;
}

/*
 * Host driver: fills an n-element array with 0..n-1, reverses it on the GPU
 * with staticReverse and then dynamicReverse, and compares each result with
 * the expected reversed sequence.
 *
 * Returns EXIT_SUCCESS when both results match, EXIT_FAILURE otherwise.
 * Any failing CUDA call terminates the program with a message.
 */
int main(void)
{
    const size_t bytes = n * sizeof(int);

    // Host buffers are heap-allocated so their size is not bounded by the stack.
    int *a = (int *)malloc(bytes);  // input: 0, 1, ..., n-1
    int *r = (int *)malloc(bytes);  // expected output: n-1, ..., 1, 0
    int *d = (int *)malloc(bytes);  // results copied back from the device
    if (a == NULL || r == NULL || d == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(a);
        free(r);
        free(d);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; i++) {
        a[i] = i;
        r[i] = n - i - 1;
        d[i] = 0;
    }

    // A block cannot hold n threads when n exceeds the device limit, so launch
    // the largest legal block. NOTE: the result is only a full reversal if the
    // kernels cover all n elements with fewer than n threads.
    int device = 0;
    int maxThreads = 0;
    checkCuda(cudaGetDevice(&device), "cudaGetDevice");
    checkCuda(cudaDeviceGetAttribute(&maxThreads, cudaDevAttrMaxThreadsPerBlock, device),
              "cudaDeviceGetAttribute");
    const int threads = n < maxThreads ? n : maxThreads;

    int *d_d = NULL;
    checkCuda(cudaMalloc(&d_d, bytes), "cudaMalloc");

    // run version with static shared memory
    checkCuda(cudaMemcpy(d_d, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");
    staticReverse<<<1, threads>>>(d_d);
    // Launches report nothing directly; a bad configuration shows up here.
    checkCuda(cudaGetLastError(), "staticReverse launch");
    // The blocking copy also surfaces any fault raised while the kernel ran.
    checkCuda(cudaMemcpy(d, d_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    int mismatches = reportMismatches(d, r, n);

    // run dynamic shared memory version; the third launch argument sizes the
    // kernel's extern __shared__ array
    checkCuda(cudaMemcpy(d_d, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)");
    dynamicReverse<<<1, threads, bytes>>>(d_d, n);
    checkCuda(cudaGetLastError(), "dynamicReverse launch");
    checkCuda(cudaMemcpy(d, d_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)");
    mismatches += reportMismatches(d, r, n);

    checkCuda(cudaFree(d_d), "cudaFree");
    free(a);
    free(r);
    free(d);
    return mismatches == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement