Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "stdio.h"
- #include <cuda.h>
- #include <cuda_runtime.h>
// Device-side printf is unavailable when compiling device code for compute
// capability below 2.0, so on those targets every printf call is replaced by
// an expression that evaluates its arguments and yields 0.
// The macro is fully variadic so that a call with only a format string
// (no extra arguments) still expands to valid code.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#define printf(...) ((void)(__VA_ARGS__), 0)
#endif

// Number of thread blocks in the launch; also the number of array elements.
#define NUM_BLOCKS 16
// Threads per block used for the launch.
#define BLOCK_WIDTH 1
/*
 * Copy kernel: each block copies one float from d_in to d_out.
 *
 * Launch layout: 1D grid. The element index is blockIdx.x alone, so every
 * thread in a block reads and writes the same element. There is no bounds
 * check: both buffers must hold at least gridDim.x floats.
 *
 * d_out  destination buffer (written at index blockIdx.x)
 * d_in   source buffer (read at index blockIdx.x)
 */
__global__ void hello(float * d_out, float * d_in)
{
    const int block = blockIdx.x;
    const float value = d_in[block];
    d_out[block] = value;
}
// Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Fills a host array with 0, 1, ..., NUM_BLOCKS-1, copies it to the device,
 * runs the hello kernel to copy it into a second device buffer, copies the
 * result back and checks that it matches the input.
 *
 * Returns 0 on success, 1 if any CUDA call fails or the copied data differs.
 * Command-line arguments are not used.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    const int ARRAY_SIZE = NUM_BLOCKS;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Start as NULL so the cleanup below is safe even if allocation fails.
    float *d_in = NULL;
    float *d_out = NULL;
    float h_in[NUM_BLOCKS];
    float h_out[NUM_BLOCKS];

    for (int i = 0; i < NUM_BLOCKS; i++) {
        h_in[i] = float(i);
    }

    // Allocate device memory and transfer the input array to the GPU.
    bool ok = cudaSucceeded(cudaMalloc((void **)&d_in, ARRAY_BYTES), "cudaMalloc(d_in)")
           && cudaSucceeded(cudaMalloc((void **)&d_out, ARRAY_BYTES), "cudaMalloc(d_out)")
           && cudaSucceeded(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice),
                            "cudaMemcpy(host to device)");

    if (ok) {
        // Launch the kernel: one block per array element.
        hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>(d_out, d_in);

        // A launch does not return a status itself; query it explicitly,
        // then copy the result back and synchronize (which also flushes any
        // device-side printf output).
        ok = cudaSucceeded(cudaGetLastError(), "hello kernel launch")
          && cudaSucceeded(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host)")
          && cudaSucceeded(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    // The kernel is a plain copy, so the output must equal the input exactly.
    if (ok) {
        for (int i = 0; i < NUM_BLOCKS; i++) {
            if (h_out[i] != h_in[i]) {
                fprintf(stderr, "Mismatch at index %d: expected %f, got %f\n",
                        i, h_in[i], h_out[i]);
                ok = false;
                break;
            }
        }
    }

    // Release device memory on every path; cudaFree on a null pointer does nothing.
    if (!cudaSucceeded(cudaFree(d_in), "cudaFree(d_in)")) {
        ok = false;
    }
    if (!cudaSucceeded(cudaFree(d_out), "cudaFree(d_out)")) {
        ok = false;
    }

    return ok ? 0 : 1;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement