#include <cuda.h>
#include <stdio.h>
// Kernel: each thread adds its own flat global index to one byte of `str`.
//
// Launch layout: 1-D grid of 1-D blocks; thread t of block b handles
// element b * blockDim.x + t.
// Precondition: there is no bounds check, so the total number of launched
// threads must not exceed the number of bytes addressable through `str`.
__global__ void helloWorld(char* str){
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    // The sum is evaluated as int and narrowed back to char, same as `+=`.
    str[gid] = static_cast<char>(str[gid] + gid);
}
// Host driver: scrambles "Hello World" on the CPU by subtracting each byte's
// index, lets the GPU undo it (the kernel adds the index back), then prints
// the restored string.
//
// Returns 0 on success, 1 if the launch shape is invalid or any CUDA runtime
// call fails (a diagnostic is written to stderr).
int main(int argc, char** argv){
    char str[] = "Hello World";
    const size_t size = sizeof(str);  // includes the terminating NUL

    // Scramble every byte, terminator included, so the buffer is only a valid
    // C string again after the kernel has processed all `size` elements.
    const int len = static_cast<int>(size);
    for (int i = 0; i < len; ++i) {
        str[i] -= i;
    }

    // The kernel has no bounds guard, so the launch must supply exactly one
    // thread per byte: the block size has to divide the buffer size evenly.
    const unsigned int threadsPerBlock = 6;
    if (size % threadsPerBlock != 0) {
        fprintf(stderr, "buffer size %lu is not a multiple of block size %u\n",
                static_cast<unsigned long>(size), threadsPerBlock);
        return 1;
    }
    dim3 dimGrid(static_cast<unsigned int>(size / threadsPerBlock));
    dim3 dimBlock(threadsPerBlock);

    char *d_str = NULL;
    const char *step = "cudaMalloc";
    cudaError_t err = cudaMalloc((void**)&d_str, size);
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", step, cudaGetErrorString(err));
        return 1;
    }

    step = "cudaMemcpy (host to device)";
    err = cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        step = "helloWorld launch";
        helloWorld<<< dimGrid, dimBlock >>>(d_str);
        // A launch returns nothing; configuration errors surface here.
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        // Blocking copy: also reports faults raised while the kernel ran.
        step = "cudaMemcpy (device to host)";
        err = cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
    }

    // Release the device buffer on every path; keep the first error seen.
    const cudaError_t freeErr = cudaFree(d_str);
    if (err == cudaSuccess && freeErr != cudaSuccess) {
        step = "cudaFree";
        err = freeErr;
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", step, cudaGetErrorString(err));
        return 1;
    }

    printf("%s\n", str);
    return 0;
}