Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, n).
// Uses a grid-stride loop, so any <<<blocks, threads>>> configuration
// (even a single block) covers all n elements correctly.
__global__ void add(int* a, int* b, int* c, int n)
{
    int stride = blockDim.x * gridDim.x;
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}
// Fills the first n elements of a with `value` (default T(1)).
//
// NOTE(review): despite the name, this does NOT generate random numbers —
// it fills with a constant so the GPU result (1 + 1 == 2) is trivially
// verifiable by check_results. The new `value` parameter defaults to the
// old behavior, so existing two-argument callers are unaffected.
template<typename T>
void random_ints(T* a, int n, T value = T(1))
{
    for (int i = 0; i < n; ++i)
    {
        a[i] = value;
    }
}
// Verifies that every element of c equals 2 (the expected 1 + 1 sum).
// Prints "OK <n>" when all n elements match, otherwise "WA <i>" where i
// is the index of the first mismatching element.
void check_results(int* c, int n)
{
    bool passed = true;
    int idx = 0;
    while (idx < n)
    {
        if (c[idx] != 2)
        {
            passed = false;
            break;
        }
        ++idx;
    }
    std::cout << (passed ? "OK" : "WA") << " " << idx << std::endl;
}
// Adds two N-element int vectors on the GPU, splitting the work into
// StreamsCount chunks so H2D copies, kernel execution, and D2H copies of
// different chunks can overlap on separate CUDA streams.
//
// Fixes vs. the original:
//  * cudaMemcpyAsync was given an element count (`actialN`) where a BYTE
//    count is required, so only 1/sizeof(int) of each chunk was copied.
//  * Host buffers are now pinned (cudaMallocHost) — cudaMemcpyAsync from
//    pageable malloc() memory silently falls back to synchronous copies,
//    defeating the whole point of the streams.
//  * Grid clamp now uses maxGridSize[0] (x dimension), not [1] (y).
//  * Kernel launch syntax ">> >" corrected to ">>>".
//  * Every CUDA call is checked; launch errors are caught immediately via
//    cudaGetLastError() instead of once at the very end.
//  * Removed the redundant blocking cudaMemcpy of d_c AFTER the results
//    had already been validated.
void AddVectorsViaStreams()
{
    const int StreamsCount = 2;          // independent streams / data chunks
    const int N = 67108864;              // total elements (divisible by StreamsCount)
    const int M = 1024;                  // threads per block
    const size_t size = (size_t)N * sizeof(int);  // total bytes per vector

    // Report-and-exit helper so no CUDA error goes unnoticed.
    auto checkCuda = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess)
        {
            printf("CUDA error (%s): %s\n", what, cudaGetErrorString(err));
            exit(-1);
        }
    };

    // Device copies of a, b, c.
    int *d_a, *d_b, *d_c;
    checkCuda(cudaMalloc((void**)&d_a, size), "cudaMalloc d_a");
    checkCuda(cudaMalloc((void**)&d_b, size), "cudaMalloc d_b");
    checkCuda(cudaMalloc((void**)&d_c, size), "cudaMalloc d_c");

    // Host copies of a, b, c. Pinned memory is required for truly
    // asynchronous cudaMemcpyAsync transfers.
    int *a, *b, *c;
    checkCuda(cudaMallocHost((void**)&a, size), "cudaMallocHost a");
    checkCuda(cudaMallocHost((void**)&b, size), "cudaMallocHost b");
    checkCuda(cudaMallocHost((void**)&c, size), "cudaMallocHost c");
    random_ints(a, N);       // fills with 1s, so the expected sum is 2
    random_ints(b, N);
    std::fill(c, c + N, 0);  // matches the original calloc zeroing

    cudaDeviceProp devProp;
    checkCuda(cudaGetDeviceProperties(&devProp, 0), "cudaGetDeviceProperties");

    cudaStream_t streams[StreamsCount];
    for (int i = 0; i < StreamsCount; ++i)
    {
        checkCuda(cudaStreamCreate(&streams[i]), "cudaStreamCreate");
    }

    const int chunkN = N / StreamsCount;                     // elements per stream
    const size_t chunkBytes = (size_t)chunkN * sizeof(int);  // bytes per stream
    // Clamp to the device's x-dimension grid limit; the kernel's
    // grid-stride loop picks up the remainder if the clamp kicks in.
    const int blocksCount = std::min(devProp.maxGridSize[0], (chunkN + M - 1) / M);
    std::cout << "blocks count: " << blocksCount << std::endl;
    std::cout << "threads per block: " << M << std::endl;

    // Pipeline each chunk: H2D copies, kernel, D2H copy on its own stream.
    for (int i = 0; i < StreamsCount; ++i)
    {
        const int off = i * chunkN;
        checkCuda(cudaMemcpyAsync(d_a + off, a + off, chunkBytes,
                                  cudaMemcpyHostToDevice, streams[i]), "H2D a");
        checkCuda(cudaMemcpyAsync(d_b + off, b + off, chunkBytes,
                                  cudaMemcpyHostToDevice, streams[i]), "H2D b");
        add<<<blocksCount, M, 0, streams[i]>>>(d_a + off, d_b + off,
                                               d_c + off, chunkN);
        checkCuda(cudaGetLastError(), "add kernel launch");
        checkCuda(cudaMemcpyAsync(c + off, d_c + off, chunkBytes,
                                  cudaMemcpyDeviceToHost, streams[i]), "D2H c");
    }

    // Wait for all streams before reading c on the host.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    check_results(c, N);

    // Cleanup.
    for (int i = 0; i < StreamsCount; ++i)
    {
        checkCuda(cudaStreamDestroy(streams[i]), "cudaStreamDestroy");
    }
    cudaFreeHost(a); cudaFreeHost(b); cudaFreeHost(c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
}
// Entry point: runs the multi-stream vector-addition demo.
int main(void)
{
    //AddVectors();      // single-stream variant (defined elsewhere)
    // MulMatrices();    // matrix-multiply demo (defined elsewhere)
    AddVectorsViaStreams();
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement