Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Very poor memory access performance with CUDA
- #include "cuda_runtime.h"
- #include <stdio.h>
// Upper bound on the number of threads that do work in one kernel launch;
// main() also advances the first row by this many (times d_RowsResolution)
// between launches.
- #define d_MaxParallelRows 320
// Row passed to the first kernel launch.
- #define d_MinTreatedRow 5
// Last row (inclusive) that is processed; rows beyond it are skipped.
- #define d_MaxTreatedRow 915
// Row spacing between consecutive threads of a launch.
- #define d_RowsResolution 1
// Block size used for every kernel launch.
- #define k_ThreadsPerBlock 64
/**
 * Benchmark kernel: each participating thread owns one row and repeatedly
 * fills a private scratch buffer of 1700 floats obtained with in-kernel
 * malloc(), then releases it.
 *
 * Launch layout: 1D grid of 1D blocks. Threads whose flat index is
 * >= d_MaxParallelRows, or whose row lies past d_MaxTreatedRow, do nothing.
 *
 * @param Xi_FirstTreatedRow  Row assigned to thread 0 of this launch; thread t
 *                            works on Xi_FirstTreatedRow + t * d_RowsResolution.
 */
__global__ void myKernel(int Xi_FirstTreatedRow)
{
    const int l_BufferLength = 1700;  // elements in the per-thread scratch buffer
    const int l_FirstColumn  = 185;   // outer loop start (inclusive)
    const int l_EndColumn    = 1407;  // outer loop end (exclusive)

    int l_ThreadIndex = blockDim.x * blockIdx.x + threadIdx.x;
    if (l_ThreadIndex >= d_MaxParallelRows)
        return;

    int l_Row = Xi_FirstTreatedRow + (l_ThreadIndex * d_RowsResolution);
    if (l_Row <= d_MaxTreatedRow) {
        //float l_SrcIntegral[1700];
        float* l_SrcIntegral = (float*)malloc(l_BufferLength * sizeof(float));

        // In-kernel malloc() returns NULL when the device heap cannot satisfy
        // the request; writing through it would fault the whole launch.
        if (!l_SrcIntegral) {
            printf("myKernel: device malloc failed for row %d\n", l_Row);
            return;
        }

        for (int x = l_FirstColumn; x < l_EndColumn; x++) {
            for (int i = 0; i < l_BufferLength; i++)
                l_SrcIntegral[i] = i;
        }

        free(l_SrcIntegral);
    }
}
/**
 * Host driver: selects device 0 and launches myKernel over the rows
 * d_MinTreatedRow..d_MaxTreatedRow in batches of d_MaxParallelRows threads,
 * waiting for each batch to finish before starting the next.
 *
 * @return 0 on success, 1 if any CUDA call or kernel launch fails.
 */
int main()
{
    cudaError_t cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "CUDA: cudaSetDevice failed: %s\n",
                cudaGetErrorString(cudaStatus));
        return 1;
    }

    int l_ThreadsPerBlock = k_ThreadsPerBlock;
    // Ceil-division so every one of the d_MaxParallelRows threads gets a slot.
    int l_BlocksPerGrid = (d_MaxParallelRows + l_ThreadsPerBlock - 1) / l_ThreadsPerBlock;

    int l_FirstRow = d_MinTreatedRow;
    while (l_FirstRow <= d_MaxTreatedRow) {
        printf("CUDA: FirstRow=%d\n", l_FirstRow);
        fflush(stdout);

        myKernel<<<l_BlocksPerGrid, l_ThreadsPerBlock>>>(l_FirstRow);

        // A launch does not return a status; configuration errors are only
        // visible through cudaGetLastError().
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "CUDA: kernel launch failed: %s\n",
                    cudaGetErrorString(cudaStatus));
            return 1;
        }

        // Faults raised while the kernel runs surface at the next sync point.
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "CUDA: kernel execution failed: %s\n",
                    cudaGetErrorString(cudaStatus));
            return 1;
        }

        l_FirstRow += (d_MaxParallelRows * d_RowsResolution);
    }

    printf("CUDA: Done\n");
    return 0;
}
Add Comment
Please, Sign In to add comment