// Untitled: 2D five-point stencil, CPU loop and CUDA kernel.

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define TAM_BLOCK 256

/* Abort with a readable message when a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                  \
  do {                                                                    \
    cudaError_t err_ = (call);                                            \
    if (err_ != cudaSuccess) {                                            \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,       \
              cudaGetErrorString(err_));                                  \
      exit(EXIT_FAILURE);                                                 \
    }                                                                     \
  } while (0)

/* Edge of the square thread block: BLOCK_EDGE * BLOCK_EDGE threads per block.
 * A block may hold at most 1024 threads, so the edge cannot be TAM_BLOCK. */
#define BLOCK_EDGE 16

/*
 * Five-point stencil value for the interior cell (i, j) of a row-major grid
 * with `columnas` columns: 0.2 times the sum of the cell and its four direct
 * neighbours, truncated to int.
 *
 * Precondition: (i, j) is an interior cell, so all four neighbours exist.
 * Shared by host and device so both sides evaluate the same expression.
 */
static __host__ __device__ inline int stencilPoint(const int *src, int i, int j, int columnas){
  const size_t w = (size_t)columnas;
  const size_t c = (size_t)i * w + (size_t)j;
  const int sum = src[c] + src[c - w] + src[c - 1] + src[c + w] + src[c + 1];
  return (int)(0.2f * sum);
}

/*
 * One stencil pass over the interior of a filas x columnas row-major grid.
 *
 * a_d: input grid (device memory), read only.
 * b_d: output grid (device memory); only interior cells are written, so the
 *      border of b_d must already hold the desired boundary values.
 *
 * Launch layout: 2D grid of 2D blocks, x covering columns and y covering
 * rows. Threads that fall on the border or outside the grid do nothing, so
 * the grid may overshoot the data.
 */
__global__ void stencil2d(const int *a_d, int *b_d, int filas, int columnas){
  const int j = blockIdx.x * blockDim.x + threadIdx.x;  /* column */
  const int i = blockIdx.y * blockDim.y + threadIdx.y;  /* row */

  if (i >= 1 && i < filas - 1 && j >= 1 && j < columnas - 1) {
    b_d[(size_t)i * (size_t)columnas + (size_t)j] = stencilPoint(a_d, i, j, columnas);
  }
}

/* Print `prompt`, then read one int from stdin. Returns 1 on success, 0 otherwise. */
static int readInt(const char *prompt, int *value){
  printf("%s", prompt);
  fflush(stdout);
  return scanf("%d", value) == 1;
}

/*
 * Reads the iteration count and grid size, fills a grid with pseudo-random
 * values in [0, 100), runs `iteraciones` stencil passes on the CPU and on the
 * GPU (each pass consuming the previous pass's output), prints the interior
 * cells, and reports how many cells differ between the two results.
 *
 * Returns EXIT_SUCCESS when CPU and GPU agree, EXIT_FAILURE otherwise or on
 * invalid input / allocation failure.
 */
int main(){
  int filas, columnas, iteraciones, i, j, k;
  int *a_d = NULL, *b_d = NULL;

  if (!readInt("Escribe el numero de iteraciones: ", &iteraciones) ||
      !readInt("Escribe el numero de filas: ", &filas) ||
      !readInt("Escribe el numero de columnas: ", &columnas)) {
    fprintf(stderr, "Entrada invalida: se esperaba un numero entero.\n");
    return EXIT_FAILURE;
  }
  if (iteraciones < 0 || filas < 1 || columnas < 1) {
    fprintf(stderr, "Entrada invalida: iteraciones >= 0, filas >= 1, columnas >= 1.\n");
    return EXIT_FAILURE;
  }

  const size_t celdas = (size_t)filas * (size_t)columnas;
  const size_t bytes = celdas * sizeof(int);

  /* a: initial grid, then one of the two CPU ping-pong buffers.
   * ref: the other CPU ping-pong buffer.
   * b: receives the final GPU result. */
  int *a = (int*) malloc(bytes);
  int *ref = (int*) malloc(bytes);
  int *b = (int*) malloc(bytes);
  if (a == NULL || ref == NULL || b == NULL) {
    fprintf(stderr, "No se pudo reservar memoria en el host.\n");
    free(a); free(ref); free(b);
    return EXIT_FAILURE;
  }

  /* Both ping-pong buffers start as the same grid so that the border cells,
   * which the stencil never writes, are present in each of them. */
  for (size_t idx = 0; idx < celdas; idx++) {
    a[idx] = rand() % 100;
    ref[idx] = a[idx];
    b[idx] = 0;
  }

  /* Upload before the CPU passes: the CPU ping-pong overwrites `a`. */
  CUDA_CHECK(cudaSetDevice(0));
  CUDA_CHECK(cudaMalloc((void **) &a_d, bytes));
  CUDA_CHECK(cudaMalloc((void **) &b_d, bytes));
  CUDA_CHECK(cudaMemcpy(a_d, a, bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(b_d, a, bytes, cudaMemcpyHostToDevice));

  /* CPU reference: print the interior after every pass. */
  int *cpu_src = a, *cpu_dst = ref;
  for (k = 0; k < iteraciones; k++) {
    for (i = 1; i < (filas-1); i++) {
      for (j = 1; j < (columnas-1); j++) {
        const size_t idx = (size_t)i * (size_t)columnas + (size_t)j;
        cpu_dst[idx] = stencilPoint(cpu_src, i, j, columnas);
        printf("b[%d][%d] = %d\t", i, j, cpu_dst[idx]);
      }
      printf("\n");
    }
    int *tmp = cpu_src; cpu_src = cpu_dst; cpu_dst = tmp;
  }
  /* cpu_src now points at the most recent CPU state. */

  /* x spans columns so that consecutive threads touch consecutive addresses. */
  const dim3 thread(BLOCK_EDGE, BLOCK_EDGE);
  const dim3 bloques((columnas - 1) / BLOCK_EDGE + 1, (filas - 1) / BLOCK_EDGE + 1);

  /* Launches on the same stream execute in order, so swapping the buffers
   * between launches needs no explicit synchronization. */
  int *d_src = a_d, *d_dst = b_d;
  for (k = 0; k < iteraciones; k++) {
    stencil2d<<<bloques,thread>>> (d_src, d_dst, filas, columnas);
    CUDA_CHECK(cudaGetLastError());
    int *tmp = d_src; d_src = d_dst; d_dst = tmp;
  }
  CUDA_CHECK(cudaDeviceSynchronize());

  /* d_src now points at the most recent GPU state. */
  CUDA_CHECK(cudaMemcpy(b, d_src, bytes, cudaMemcpyDeviceToHost));

  for (i = 1; i < (filas-1); i++) {
    for (j = 1; j < (columnas-1); j++) {
      printf("b[%d][%d] = %d\t", i, j, b[(size_t)i * (size_t)columnas + (size_t)j]);
    }
    printf("\n");
  }

  /* Cross-check the GPU result against the CPU reference. */
  unsigned long long diferencias = 0;
  for (size_t idx = 0; idx < celdas; idx++) {
    if (b[idx] != cpu_src[idx]) {
      diferencias++;
    }
  }
  printf("Celdas distintas entre CPU y GPU: %llu\n", diferencias);

  CUDA_CHECK(cudaFree(a_d));
  CUDA_CHECK(cudaFree(b_d));
  free(a);
  free(ref);
  free(b);

  return diferencias == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}