Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#include "cuda.h"
//#include "gcd.cu"
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
- #define BLOCK_DIM_X 16
- #define BLOCK_DIM_Y 32
- int main(){
- int n, m;
- scanf("%d%d", &n, &m);
- cuInit(0);
- CUdevice cuDevice;
- CUresult res = cuDeviceGet(&cuDevice, 0);
- CUcontext cuContext;
- res = cuCtxCreate(&cuContext, 0, cuDevice);
- CUmodule cuModule = (CUmodule)0;
- res = cuModuleLoad(&cuModule, "gcd.ptx");
- CUfunction gcd;
- res = cuModuleGetFunction(&gcd, cuModule, "gcd");
- int *pres = (int*) malloc(sizeof(int)*n*m);
- int *qres = (int*) malloc(sizeof(int)*n*m);
- CUdeviceptr gpu_pres;
- CUdeviceptr gpu_qres;
- res = cuMemAlloc(&gpu_pres, sizeof(int)*n*m);
- res = cuMemAlloc(&gpu_qres, sizeof(int)*n*m);
- cuCtxSynchronize();
- //int blocks_per_grid = n*m;
- //int threads_per_block = 16*32;
- int block_dim_x = BLOCK_DIM_X;
- int block_dim_y = BLOCK_DIM_Y;
- int grid_dim_x = (n-1) / BLOCK_DIM_X + 1;
- int grid_dim_y = (m-1) / BLOCK_DIM_Y + 1;
- int w = BLOCK_DIM_X * grid_dim_x;
- int test;
- void* args[] = {&w, &gpu_pres, &gpu_qres};
- //cuLaunchKernel(gcd, n*m, 1, 1, n, 1, 1, 0, 0, args, 0);
- res = cuLaunchKernel(gcd, grid_dim_x, grid_dim_y, 1, block_dim_x, block_dim_y, 1, 0, 0, args, 0);
- cuCtxSynchronize();
- res = cuMemcpyDtoH( (void*)pres, gpu_pres, sizeof(int)*n*m);
- res = cuMemcpyDtoH( (void*)qres, gpu_qres, sizeof(int)*n*m);
- cuCtxSynchronize();
- std::cout << "N = " << n << " M = " << m << std::endl;
- for(int i = 0; i<n; i++){
- for(int j = 0; j<m; j++){
- std::cout << i+1 << " " << j+1 << " " << pres[j*w + i] << " " << qres[j*n +i] << std::endl;
- }
- }
- cuMemFree(gpu_qres);
- cuMemFree(gpu_pres);
- cuCtxSynchronize();
- cuCtxDestroy(cuContext);
- free(pres);
- free(qres);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement