Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <iostream>
- #include <stdlib.h>
- #include <math.h>
/**
 * In-place transpose of a square n x n matrix of doubles stored as a flat
 * array where element (row, col) lives at M[row * n + col].
 *
 * Every flat index is split into (row, col). Only indices strictly below the
 * main diagonal (col < row) perform a swap with their mirror element at
 * (col, row), so each off-diagonal pair is exchanged exactly once and no two
 * threads ever touch the same pair. Diagonal elements are left untouched.
 *
 * Launch layout: 1D grid of 1D blocks. The kernel walks the index space with a
 * grid-stride loop, so the transpose is complete for any grid/block size, even
 * when fewer than n*n threads are launched. No shared memory is used.
 *
 * Indices are computed in size_t so that n*n and blockIdx.x * blockDim.x do not
 * overflow a 32-bit int for large matrices.
 *
 * @param M pointer to n*n doubles accessible from the device; modified in place
 * @param n matrix dimension; the kernel does nothing when n <= 0
 */
__global__ void MatrixTrans(double *M, int n){
    if (n <= 0){
        return;
    }

    const size_t dim = static_cast<size_t>(n);
    const size_t total = dim * dim;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < total;
         idx += stride){
        const size_t row = idx / dim;
        const size_t col = idx % dim;

        // Only the lower triangle swaps; otherwise each pair would be swapped
        // twice (restoring the original) or raced on by two threads.
        if (col < row){
            const size_t mirror = col * dim + row;
            const double tmp = M[idx];
            M[idx] = M[mirror];
            M[mirror] = tmp;
        }
    }
}
// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// did not return cudaSuccess; `what` names the failed step in the message.
static void checkCuda(cudaError_t status, const char *what){
    if (status != cudaSuccess){
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

// Writes an n x n matrix stored as a flat array (element (i, j) at m[i*n + j])
// to stdout, one row per line, each value followed by a single space.
static void printMatrix(const double *m, int n){
    for (int i = 0; i < n; ++i){
        for (int j = 0; j < n; ++j){
            std::cout << m[i*n + j] << " ";
        }
        std::cout<<std::endl;
    }
}

/**
 * Demo driver: builds a 6 x 6 matrix with element (i, j) = 10*i + j, prints it,
 * transposes it in place on the GPU with MatrixTrans, copies the result back
 * and prints the transposed matrix.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA runtime call fails.
 */
int main(){
    const int n = 6;
    const size_t count = static_cast<size_t>(n) * n;
    const size_t bytes = count * sizeof(double);

    double *h_m = (double*)malloc(bytes);
    double *res_h_m = (double*)malloc(bytes);
    if (h_m == NULL || res_h_m == NULL){
        std::cerr << "Host allocation failed" << std::endl;
        free(h_m);      // free(NULL) is a no-op, so this is safe either way
        free(res_h_m);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < n; ++i){
        for (int j = 0; j < n; ++j){
            h_m[i*n + j] = 10*i + j;
        }
    }
    printMatrix(h_m, n);

    double *d_m = NULL;
    checkCuda(cudaMalloc(&d_m, bytes), "cudaMalloc");
    checkCuda(cudaMemcpy(d_m, h_m, bytes, cudaMemcpyHostToDevice),
              "host-to-device copy");

    // The kernel indexes all n*n elements, so the grid must be sized from the
    // element count (ceil-div), not from the matrix dimension n.
    const int block_size = 256;
    const int grid_size = static_cast<int>((count + block_size - 1) / block_size);

    MatrixTrans<<<grid_size, block_size>>>(d_m, n);
    // A kernel launch returns no status itself; launch-configuration errors
    // are reported through cudaGetLastError().
    checkCuda(cudaGetLastError(), "MatrixTrans launch");

    // The copy back to the host also surfaces any error raised while the
    // kernel was executing.
    checkCuda(cudaMemcpy(res_h_m, d_m, bytes, cudaMemcpyDeviceToHost),
              "device-to-host copy");

    printMatrix(res_h_m, n);

    checkCuda(cudaFree(d_m), "cudaFree");
    free(h_m);
    free(res_h_m);
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement