Untitled

$ cat t712.cu
#include <cstdlib>
#include <iostream>

#define ROWS 20
#define COLS 10

// Adds one vector element to one matrix element per thread, in place.
//
//   size   - number of elements in matrix; threads whose flat index falls
//            at or beyond this value do nothing.
//   matrix - device buffer updated in place (matrix[i] += ...).
//   vector - device buffer read at threadIdx.x, so it must hold at least
//            blockDim.x elements.
//
// Uses only the x dimension of the grid and block. The vector element is
// selected by the thread's position inside its block, not by the flat index,
// so every block applies the same vector values to its own slice of matrix.
__global__ void kernel(const unsigned int size, float* matrix, const float* vector)
{
    // Flat position of this thread across the whole 1D grid.
    const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads past the end of the matrix have nothing to update.
    if (gid >= size)
        return;

    // Add the vector entry that corresponds to this thread's lane in the block.
    matrix[gid] = matrix[gid] + vector[threadIdx.x];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what)
{
  if (err == cudaSuccess) return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(err) << std::endl;
  return false;
}

// Builds a zeroed ROWS x COLS matrix on the device, adds the vector
// {0, 1, ..., COLS-1} to it with `kernel` (one block per row, one thread per
// column), copies the result back and prints it row by row.
//
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
// All buffers are released on every path.
int main(){

  const unsigned int msz = ROWS*COLS*sizeof(float);
  const unsigned int vsz = COLS*sizeof(float);
  float *h_mat = (float *)malloc(msz);
  float *h_vec = (float *)malloc(vsz);
  float *d_mat = NULL, *d_vec = NULL;

  // `ok` short-circuits every later step once something has failed, so
  // execution always falls through to the single cleanup section below.
  bool ok = (h_mat != NULL) && (h_vec != NULL);
  if (!ok) std::cerr << "host allocation failed" << std::endl;

  ok = ok && cudaOk(cudaMalloc(&d_mat, msz), "cudaMalloc(d_mat)");
  ok = ok && cudaOk(cudaMalloc(&d_vec, vsz), "cudaMalloc(d_vec)");
  // cudaMemset is byte-wise; all-zero bytes are a valid 0.0f.
  ok = ok && cudaOk(cudaMemset(d_mat, 0, msz), "cudaMemset(d_mat)");

  if (ok) {
    for (int i=0; i<COLS; i++) h_vec[i] = (float)i;
    ok = cudaOk(cudaMemcpy(d_vec, h_vec, vsz, cudaMemcpyHostToDevice),
                "cudaMemcpy(d_vec <- h_vec)");
  }

  if (ok) {
    // Block size must equal COLS: the kernel indexes the vector by threadIdx.x.
    kernel<<<ROWS,COLS>>>(ROWS*COLS, d_mat, d_vec);
    // A launch returns no status itself; configuration errors show up here.
    ok = cudaOk(cudaGetLastError(), "kernel launch");
  }

  // This blocking copy also reports any fault raised while the kernel ran.
  ok = ok && cudaOk(cudaMemcpy(h_mat, d_mat, msz, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(h_mat <- d_mat)");

  if (ok) {
    for (int i = 0; i < ROWS; i++){
      for (int j = 0; j < COLS; j++) std::cout << h_mat[i*COLS+j] << " ";
      std::cout << std::endl;}
  }

  // cudaFree and free both accept null pointers, so this is safe on every path.
  cudaFree(d_vec);
  cudaFree(d_mat);
  free(h_vec);
  free(h_mat);
  return ok ? 0 : 1;
}
$ nvcc -o t712 t712.cu
$ cuda-memcheck ./t712
========= CUDA-MEMCHECK
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
========= ERROR SUMMARY: 0 errors
$