Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- <n>
- <power>
- <element>
- .....
- #include <assert.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <sys/resource.h>
- #define BLOCK 8
- #define SIZE (BLOCK * 64)
- #define TILE_SIZE (8)
- int n;
- float *
- create_matrix_h(unsigned int w, unsigned int h) {
- float *m;
- m = (float *) malloc(w * h * sizeof(float));
- if (m == NULL) {
- fprintf(stderr, "Failed to malloc.n");
- exit(1);
- }
- return m;
- }
- __global__ void
- kernel3(const float *m1, const float *m2, float *m3, unsigned int width) {
- const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
- const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
- unsigned int t, i;
- float result = 0, a, b;
- for (t = 0; t < width / TILE_SIZE; ++t) {
- for (i = 0; i != TILE_SIZE; ++i) {
- a = m1[row*width + t*TILE_SIZE + i];
- b = m2[(t*TILE_SIZE + i)*width + col];
- result += a * b;
- }
- __syncthreads();
- }
- m3[row*width + col] = result;
- }
- float *
- create_matrix_d(int w, int h) {
- float *m;
- if (cudaMalloc(&m, w * h * sizeof(float)) == cudaErrorMemoryAllocation) {
- fprintf(stderr, "Failed to cudaMalloc.n");
- return NULL;
- //exit(1);
- }
- return m;
- }
- void
- fill_matrix_h(float *const m, int w, int h, float *const values, int nvalues) {
- int i, j = 0;
- for (i = 0; i != w * h; ++i) {
- m[i] = values[j];
- j = (j + 1) % nvalues;
- }
- }
- int
- main(void) {
- int k;
- if (scanf("%d", &n) !=1 || n<1){
- return 0;
- }
- if (scanf(" %d", &k) !=1 || k<0){
- return 0;
- }
- float *hm[3], *dm[3];
- dim3 bdim(TILE_SIZE, TILE_SIZE);
- dim3 gdim(SIZE/TILE_SIZE, SIZE/TILE_SIZE);
- int i;
- for(i=0; i<3; ++i) {
- hm[i] = create_matrix_h(SIZE, SIZE);
- dm[i] = create_matrix_d(SIZE, SIZE);
- }
- float tem[n*n];
- for(i=0; i<n*n; ++i) {
- if (scanf(" %f", &tem[i]) !=1){
- return 0;
- }
- }
- float temid[n*n];
- int j = 0;
- for (i = 0; i != n*n; ++i) {
- if (i==0 || i == j + (n+1)) {
- temid[i] = 1;
- j = i;
- }
- else {
- temid[i] = 0;
- }
- }
- fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
- fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
- cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
- int w;
- for (w=0; w<k; ++w) {
- cudaMemcpy(dm[1], hm[1], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
- kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
- cudaThreadSynchronize();
- cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
- hm[1] = hm[2];
- }
- printf(" %.3f ", hm[2][0]);
- return 0;
- }
- 2
- 2
- 1
- 2
- 3
- 4
- #include <assert.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <sys/resource.h>
- #define BLOCK 8
- #define SIZE (BLOCK * 64)
- #define TILE_SIZE (8)
- int n;
- float *
- create_matrix_h(unsigned int w, unsigned int h) {
- float *m;
- m = (float *) malloc(w * h * sizeof(float));
- if (m == NULL) {
- fprintf(stderr, "Failed to malloc.n");
- exit(1);
- }
- return m;
- }
- void
- print_matrix(const float *m, const int w, const int h) {
- int x, y;
- for (y = 0; y != h; ++y) {
- for (x = 0; x != w; ++x)
- printf("%.03f ", m[y*w + x]);
- printf("n");
- }
- }
- void
- cpu_mult(const float *m1, const float *m2, float *m3, unsigned int width) {
- unsigned int i, j, k;
- float result;
- for (i = 0; i != width; ++i) {
- for (j = 0; j != width; ++j) {
- result = 0;
- for (k = 0; k != width; ++k)
- result += m1[i*width + k] * m2[k*width + j];
- m3[i*width + j] = result;
- }
- }
- }
- __global__ void
- kernel3(const float *m1, const float *m2, float *m3, unsigned int width) {
- const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
- const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
- unsigned int t, i;
- float result = 0, a, b;
- for (t = 0; t < width / TILE_SIZE; ++t) {
- for (i = 0; i != TILE_SIZE; ++i) {
- a = m1[row*width + t*TILE_SIZE + i];
- b = m2[(t*TILE_SIZE + i)*width + col];
- result += a * b;
- }
- __syncthreads();
- }
- m3[row*width + col] = result;
- }
- float *
- create_matrix_d(int w, int h) {
- float *m;
- if (cudaMalloc(&m, w * h * sizeof(float)) == cudaErrorMemoryAllocation) {
- fprintf(stderr, "Failed to cudaMalloc.n");
- return NULL;
- //exit(1);
- }
- return m;
- }
- void
- fill_matrix_h(float *const m, int w, int h, float *const values, int nvalues) {
- int i, j = 0;
- for (i = 0; i != w * h; ++i) {
- m[i] = values[j];
- j = (j + 1) % nvalues;
- }
- }
- int
- main(void) {
- int k;
- if (scanf("%d", &n) !=1 || n<1){
- return 0;
- }
- if (scanf(" %d", &k) !=1 || k<0){
- return 0;
- }
- float *hm[3], *dm[3];
- dim3 bdim(TILE_SIZE, TILE_SIZE);
- dim3 gdim(SIZE/TILE_SIZE, SIZE/TILE_SIZE);
- int i;
- for(i=0; i<3; ++i) {
- hm[i] = create_matrix_h(SIZE, SIZE);
- dm[i] = create_matrix_d(SIZE, SIZE);
- }
- float tem[n*n];
- for(i=0; i<n*n; ++i) {
- if (scanf(" %f", &tem[i]) !=1){
- return 0;
- }
- }
- float temid[n*n];
- int j = 0;
- for (i = 0; i != n*n; ++i) {
- if (i==0 || j == n) { // not j + (n+1)
- temid[i] = 1;
- j=0;
- }
- else {
- temid[i] = 0;
- j++;
- }
- }
- fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
- fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
- cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
- dm[1] = dm[0]; // For the first iteration Result = A * A;
- int w;
- if (k==0) {
- hm[2] = hm[1];
- }
- else if (k==1) {
- hm[2] = hm[0];
- }
- else {
- for (w=1; w<k; ++w) {
- kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
- cudaThreadSynchronize();
- // No need to copy back to host
- // cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
- // Copy between device pointers
- cudaMemcpy(dm[1], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToDevice);
- }
- cudaMemcpy(hm[2], dm[1], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
- }
- print_matrix(hm[2], n, n);
- return 0;
- }
- for (i = 0; i != n*n; ++i) {
- if (i==0 || i == j + (n)) { // not j + (n+1)
- temid[i] = 1;
- j = i;
- }
- else {
- temid[i] = 0;
- }
- }
- fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
- fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
- cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
- int w;
- for (w=0; w<k; ++w) {
- cudaMemcpy(dm[1], hm[1], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
- kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
- cudaThreadSynchronize();
- cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
- hm[1] = hm[2];
- }
- fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
- cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
- dm[1] = dm[0]; // For the first iteration Result = A * A;
- int w;
- for (w=0; w<k; ++w) {
- kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
- cudaThreadSynchronize();
- // No need to copy back to host
- // cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
- // Copy between device pointers
- cudaMemcpy(dm[1], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToDevice);
- }
Add Comment
Please, Sign In to add comment