// TestingPTX.cu

#include "stdio.h"
#include <cuda.h>
#include <cuda_runtime.h>

// Device-side printf fallback: when compiling device code for an architecture
// with __CUDA_ARCH__ below 200, replace printf with an expression that merely
// evaluates its arguments and yields 0, so that printf calls in kernels still
// compile. Host compilation (where __CUDA_ARCH__ is undefined) is unaffected.
// NOTE(review): the expansion requires at least one argument after the format
// string; a call with only a format string would not expand cleanly.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
    #define printf(f, ...) ((void)(f, __VA_ARGS__),0)
#endif


// Launch configuration used by main(): NUM_BLOCKS is both the grid size and
// the number of array elements; BLOCK_WIDTH is the number of threads per block.
#define NUM_BLOCKS 16
#define BLOCK_WIDTH 1

/*
 * hello -- copy one float per block from d_in to d_out.
 *
 * The element index is taken from blockIdx.x only, so every thread that
 * belongs to the same block reads and writes the same element. There is no
 * bounds check: both buffers must hold at least gridDim.x floats.
 *
 *   d_out  destination buffer, written at index blockIdx.x
 *   d_in   source buffer, read at index blockIdx.x
 */
__global__ void hello(float * d_out, float * d_in)
{
    const unsigned int block = blockIdx.x;

    // Debug aids, left disabled:
    //printf("Hello world! I'm a thread in block %d\n", blockIdx.x);
    //printf("The number d_out is %f\n", d_in[block]);

    const float value = d_in[block];
    d_out[block] = value;
}


/*
 * Report a failed CUDA runtime call on stderr.
 *
 *   status  value returned by the runtime call
 *   what    short description of the call, used in the message
 *
 * Returns true when status is cudaSuccess, false otherwise.
 */
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Round-trip NUM_BLOCKS floats through the GPU: fill a host array with
 * 0, 1, 2, ..., copy it to the device, let the hello kernel copy it into a
 * second device buffer, copy that buffer back and check that it matches.
 *
 * Prints nothing on success. Returns 0 on success and 1 if any CUDA call
 * fails or the data read back differs from the data sent.
 */
int main(int argc,char **argv)
{
    (void)argc;   // command-line arguments are not used
    (void)argv;

    const int ARRAY_SIZE = NUM_BLOCKS;
    const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // Start as null so the cleanup at the end is safe even if allocation fails.
    float * d_in = NULL;
    float * d_out = NULL;
    float h_in[NUM_BLOCKS];
    float h_out[NUM_BLOCKS];
    for (int i = 0; i < NUM_BLOCKS; i++) {
        h_in[i] = float(i);
        h_out[i] = 0.0f;
    }

    // Allocate device memory and transfer the input array to the GPU.
    // && short-circuits, so nothing further is attempted after a failure.
    bool ok = checkCuda(cudaMalloc((void**) &d_in, ARRAY_BYTES), "cudaMalloc(d_in)")
           && checkCuda(cudaMalloc((void**) &d_out, ARRAY_BYTES), "cudaMalloc(d_out)")
           && checkCuda(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice),
                        "cudaMemcpy(host to device)");

    if (ok) {
        // Launch the kernel. A launch does not return a status itself, so
        // configuration errors have to be fetched with cudaGetLastError().
        hello<<<NUM_BLOCKS, BLOCK_WIDTH>>>(d_out,d_in);
        ok = checkCuda(cudaGetLastError(), "hello kernel launch")
             // Copy the result back to the host.
          && checkCuda(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(device to host)")
             // Force any device-side printf()s to flush.
          && checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    }

    if (ok) {
        // The kernel is a plain copy, so exact float equality is expected.
        for (int i = 0; i < NUM_BLOCKS; i++) {
            if (h_out[i] != h_in[i]) {
                fprintf(stderr, "Mismatch at index %d: expected %f, got %f\n",
                        i, h_in[i], h_out[i]);
                ok = false;
            }
        }
    }

    // Release device memory on every path; cudaFree on a null pointer is a no-op.
    if (!checkCuda(cudaFree(d_in), "cudaFree(d_in)")) {
        ok = false;
    }
    if (!checkCuda(cudaFree(d_out), "cudaFree(d_out)")) {
        ok = false;
    }

    return ok ? 0 : 1;
}