Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <vector>
- __global__ void kernel2(float * buf)
- {
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
- float a[5];
- #pragma unroll
- for(int i = 0; i < 5; ++i)
- a[i] = buf[tid + i];
- float sum = 0.0F;
- #pragma unroll
- for(int i = 0; i < 5; ++i)
- sum += a[i];
- buf[tid] = sum;
- }
- __global__ void kernel3(float * buf, int start_index)
- {
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
- float a[6];
- #pragma unroll
- for(int i = 0; i < 6; ++i)
- a[i] = buf[tid + i];
- float sum = 0.0F;
- #pragma unroll
- for(int i = 0; i < 5; ++i)
- sum += a[start_index + i];
- buf[tid] = sum;
- }
- #define ARRAY_SIZE_BASE 32
- #define ARRAY_SIZE (ARRAY_SIZE_BASE+3)
- #define ITERATIONS 400
- __global__ void kernel4(float * __restrict buf, const int * __restrict indexbuf)
- {
- float a[ARRAY_SIZE];
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
- int current_ind = tid & 31;
- int index = indexbuf[current_ind];
- #pragma unroll
- for(int i = 0; i < ARRAY_SIZE; ++i)
- a[i] = __ldg(buf + i);
- float sum = 0.0F;
- #pragma unroll 10
- for(int i = 0; i < ITERATIONS; ++i)
- {
- index = (index + 1) & 31;
- float x = a[index];
- float y = a[index + 1];
- float z = a[index + 2];
- float w = a[index + 3];
- sum = x + y + z + w;
- a[index] = sum;
- }
- buf[tid] = sum;
- }
- #define THREADBLOCK_SIZE 64
- __device__ __forceinline__ int no_bank_conflict_index(int thread_id, int logical_index)
- {
- return logical_index * THREADBLOCK_SIZE + thread_id;
- }
- __global__ void kernel5(float * __restrict buf, const int * __restrict indexbuf)
- {
- __shared__ float A[ARRAY_SIZE * THREADBLOCK_SIZE];
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
- int current_ind = tid & 31;
- int index = indexbuf[current_ind];
- #pragma unroll
- for(int i = 0; i < ARRAY_SIZE; ++i)
- A[no_bank_conflict_index(threadIdx.x, i)] = __ldg(buf + i);
- float sum = 0.0F;
- #pragma unroll 10
- for(int i = 0; i < ITERATIONS; ++i)
- {
- index = (index + 1) & 31;
- float x = A[no_bank_conflict_index(threadIdx.x, index)];
- float y = A[no_bank_conflict_index(threadIdx.x, index + 1)];
- float z = A[no_bank_conflict_index(threadIdx.x, index + 2)];
- float w = A[no_bank_conflict_index(threadIdx.x, index + 3)];
- sum = x + y + z + w;
- A[no_bank_conflict_index(threadIdx.x, index)] = sum;
- }
- buf[tid] = sum;
- }
- #define cuda_safe_call(stmt) {cudaError_t err = stmt; if (err != cudaSuccess) {std::cerr << "CUDA error " << err << ": " << cudaGetErrorString(err) << std::endl; exit(-1);}}
- int main(int argc, char * argv[])
- {
- cuda_safe_call(cudaSetDevice(1));
- int grid_size = 4096;
- int access_pattern = 0; // 0 - uniform, 1 - 32 replay
- float * p_d;
- int * p_i;
- cuda_safe_call(cudaMalloc((void**)&p_d, (THREADBLOCK_SIZE * grid_size + 32) * sizeof(float)));
- cuda_safe_call(cudaMalloc((void**)&p_i, (32 + ITERATIONS) * sizeof(int)));
- std::vector<int> indexes(32 + ITERATIONS);
- if (access_pattern == 0)
- {
- for(int i = 0; i < (32 + ITERATIONS); ++i)
- indexes[i] = 0;
- }
- else
- {
- for(int i = 0; i < (32 + ITERATIONS); ++i)
- indexes[i] = i % 32;
- }
- cuda_safe_call(cudaMemcpy(p_i, &indexes[0], indexes.size() * sizeof(int), cudaMemcpyHostToDevice));
- //kernel2<<<grid_size,THREADBLOCK_SIZE>>>(p_d);
- //kernel3<<<grid_size,THREADBLOCK_SIZE>>>(p_d, 1);
- cudaFuncSetCacheConfig(kernel4, cudaFuncCachePreferL1);
- kernel4<<<grid_size,THREADBLOCK_SIZE>>>(p_d, p_i);
- //kernel5<<<grid_size,THREADBLOCK_SIZE>>>(p_d, p_i);
- cuda_safe_call(cudaFree(p_d));
- cuda_safe_call(cudaFree(p_i));
- cuda_safe_call(cudaDeviceReset());
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement