// Untitled

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "stdio.h"
#include "windows.h"
#include <locale.h>
#include <stdlib.h>
#define SIZE 1024
/**
 * Element-wise integer vector addition: c[i] = a[i] + b[i] for i in [0, n).
 *
 * Launch layout: 1D grid of 1D blocks. Any grid/block size is valid: each
 * thread starts at its global index and advances by the total number of
 * launched threads, so the whole range is covered even when the launch has
 * fewer threads than n. No shared memory is used.
 *
 * @param a  pointer to the first input vector (at least n ints, readable from device code)
 * @param b  pointer to the second input vector (at least n ints, readable from device code)
 * @param c  pointer to the output vector (at least n ints, writable from device code)
 * @param n  number of elements to process; nothing is written when n <= 0
 */
__global__ void VectorAdd(int *a, int *b, int *c, int n) {
    // 64-bit index math so blockIdx.x * blockDim.x and the running index
    // cannot overflow for large launches or n close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n; i += stride)
        c[i] = a[i] + b[i];
}
// Aborts the program with a readable message when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Demo driver for VectorAdd.
 *
 * Fills two host vectors of SIZE ints with a[i] = b[i] = i, copies them to
 * the device, launches VectorAdd, copies the result back, prints the first
 * elements of the result and the elapsed wall-clock time in milliseconds,
 * then waits for a key press via system("pause").
 *
 * Returns 0 on success; terminates with EXIT_FAILURE if a host allocation or
 * any CUDA runtime call fails.
 */
int main() {
    setlocale(LC_ALL, "Russian");

    const size_t bytes = SIZE * sizeof(int);
    // The original printed a fixed 100 elements; clamp so a smaller SIZE
    // cannot read past the end of the result buffer.
    const int shown = SIZE < 100 ? SIZE : 100;

    // Host buffers.
    int *a = (int *)malloc(bytes);
    int *b = (int *)malloc(bytes);
    int *c = (int *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Device buffers.
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    // NOTE: the measured interval covers host initialisation, both transfer
    // directions, the kernel and the result printing, not the kernel alone.
    // Tick counts are kept unsigned so the subtraction below stays well
    // defined if the counter wraps.
    const DWORD Time1 = GetTickCount();

    for (int i = 0; i < SIZE; ++i) {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }

    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)");
    checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");
    checkCuda(cudaMemcpy(d_c, c, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_c)");

    // One block of SIZE threads, one element per thread. A block size above
    // the device limit makes the launch fail; kernel launches return no
    // status themselves, so the error is fetched explicitly.
    VectorAdd <<<1, SIZE >>> (d_a, d_b, d_c, SIZE);
    checkCuda(cudaGetLastError(), "VectorAdd launch");

    // Blocking copy: waits for the kernel and reports any execution error.
    checkCuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

    for (int i = 0; i < shown; ++i)
        printf("c[%d] = %d\n", i, c[i]);

    const DWORD Time2 = GetTickCount();
    const int Delay1 = (int)(Time2 - Time1);

    free(a);
    free(b);
    free(c);
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    printf("\nВремя вычисления CUDA = %d ms\n", Delay1);
    system("pause");
    return 0;
}