Advertisement
Guest User

Untitled

a guest
Mar 26th, 2011
341
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 2.59 KB | None | 0 0
  1. #include <stdlib.h>
  2. #include <stdio.h>
  3.  
  // Problem dimensions; main() assigns all three from the command line.
  int N;
  int L;
  int I;

  // Host-side buffers. main() allocates and frees inputs and weights;
  // temp is declared here but not referenced in the visible source.
  float * inputs;
  float * temp;
  float * weights;
  8.  
  /*
   * Multiplies one slice of the weight array by the input vector, element-wise.
   *
   * Launch layout: 1D grid of 1D blocks. Each thread handles the single element
   * at its flat index idx = blockIdx.x * blockDim.x + threadIdx.x and stores
   *     output[idx] = inputs[idx % I] * weights[pos + idx].
   *
   * Parameters:
   *   output  - destination array, written at index idx
   *   inputs  - input vector, read cyclically with period I
   *   weights - weight array; the slice consumed by this launch starts at pos
   *   pos     - offset added to idx when indexing weights
   *   I       - number of entries in inputs; used as a modulus, so must be non-zero
   *   count   - number of elements to produce. Threads with idx >= count return
   *             without touching memory, which lets the grid be rounded up to a
   *             whole number of blocks. The default (-1) disables the check so
   *             the original five-argument call keeps its behaviour: in that
   *             case every launched thread reads and writes, and the caller
   *             must make sure all of those accesses are in bounds.
   */
  __global__ void mulKernel ( float * output, const float * inputs, const float * weights, int pos, int I, int count = -1 )
  {
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;

     // Tail guard: skip threads past the end of the data when a count is given.
     if (count >= 0 && idx >= count)
        return;

     output [idx] = inputs [idx % I] * weights [pos + idx];
  }
  15.  
  /*
   * Sums consecutive runs of N floats: thread idx adds up
   * input[idx*N] .. input[idx*N + N-1] and stores the total in output[idx].
   *
   * Launch layout: 1D grid of 1D blocks, one thread per output element
   * (idx = blockIdx.x * blockDim.x + threadIdx.x). Neighbouring threads read
   * addresses that are N floats apart.
   *
   * Parameters:
   *   output - destination array, written at index idx
   *   input  - source array, read as runs of N consecutive elements
   *   N      - length of each run
   *   count  - number of runs to sum. Threads with idx >= count return without
   *            touching memory, so the grid may be rounded up to whole blocks.
   *            The default (-1) disables the check so the original
   *            three-argument call keeps its behaviour: every launched thread
   *            then reads N inputs and writes one output, and the caller must
   *            make sure all of those accesses are in bounds.
   */
  __global__ void sumKernel ( float * output, const float * input, int N, int count = -1 )
  {
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;

     // Tail guard: skip threads past the last run when a count is given.
     if (count >= 0 && idx >= count)
        return;

     // Widen before multiplying so idx * N cannot overflow a 32-bit int.
     const size_t runStart = (size_t)idx * (size_t)N;

     float sum = 0.0f;
     for (int l = 0; l < N; l++)
     {
        sum += input[runStart + l];
     }

     output[idx] = sum;
  }
  29.  
  /*
   * Prints the first N elements of p to stdout, one value per line,
   * each formatted with "%f". Prints nothing when N is zero or negative.
   */
  void printVector (const float *p, const int N) {
      const float *cursor = p;
      int remaining = N;

      while (remaining > 0) {
          printf("%f\n", *cursor);
          ++cursor;
          --remaining;
      }
  }
  34.  
  /*
   * Identity activation: hands back its argument unchanged.
   * Nothing in the visible source calls it.
   */
  float activation(float x)
  {
    const float result = x;
    return result;
  }
  39.  
  /* Ends the program with a diagnostic when a CUDA runtime call reports failure. */
  static void checkCuda ( cudaError_t status, const char * what )
  {
      if (status != cudaSuccess)
      {
          fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
          exit(EXIT_FAILURE);
      }
  }

  /* Number of blocks of blockSize threads needed to cover count elements. */
  static size_t blocksFor ( size_t count, size_t blockSize )
  {
      return (count + blockSize - 1) / blockSize;
  }

  /*
   * Usage: cuda <layers> <neurons&inputs>
   *
   * Reads the layer count L and the size N (also used for I) from the command
   * line, fills the input vector with 2 and every weight with 3, then for each
   * layer launches mulKernel followed by sumKernel, feeding sumKernel's output
   * back in as the next layer's input. The final vector is copied to the host
   * and printed one value per line.
   *
   * Returns 0 on success and EXIT_FAILURE on bad arguments, a problem that is
   * too large, or a host allocation failure. Any CUDA failure terminates the
   * process through checkCuda().
   */
  int main(int argc, char *argv[])
  {
      const size_t kThreadsPerBlock = 512;
      // The kernels compute every index in a 32-bit int.
      const size_t kMaxIntIndex = 2147483647;

      if (argc < 3)
      {
          fprintf(stderr, "Usage: cuda <layers> <neurons&inputs>\n");
          return EXIT_FAILURE;
      }

      L = atoi(argv[1]);
      N = atoi(argv[2]);
      I = atoi(argv[2]);

      // atoi yields 0 for non-numeric text; a zero I would be a zero modulus in mulKernel.
      if (L <= 0 || N <= 0)
      {
          fprintf(stderr, "Both <layers> and <neurons&inputs> must be positive integers\n");
          return EXIT_FAILURE;
      }

      // All element counts are computed in size_t so the products cannot wrap an int.
      const size_t inputCount = (size_t)I;
      const size_t layerCount = (size_t)I * (size_t)N;
      if (layerCount > kMaxIntIndex - kThreadsPerBlock)
      {
          fprintf(stderr, "Problem size too large for 32-bit kernel indices\n");
          return EXIT_FAILURE;
      }
      const size_t weightCount = (size_t)L * layerCount;

      // Ceil-division: no spare block when the count is an exact multiple of the block size.
      const size_t mulBlocks  = blocksFor(layerCount, kThreadsPerBlock);
      const size_t sumBlocks  = blocksFor(inputCount, kThreadsPerBlock);
      const size_t mulThreads = mulBlocks * kThreadsPerBlock;
      const size_t sumThreads = sumBlocks * kThreadsPerBlock;

      // The launches below pass no element count, so every launched thread
      // reads and writes. Device buffers are therefore sized to cover the
      // highest index any thread of the rounded-up grids can touch:
      //   devInputs  : sumKernel writes output[idx]       for idx < sumThreads
      //   devTemp    : mulKernel writes output[idx]       for idx < mulThreads,
      //                sumKernel reads  input[idx*N + l]  for idx < sumThreads
      //   devWeights : mulKernel reads  weights[pos+idx]  with pos up to (L-1)*N*I
      const size_t inputCapacity = sumThreads;
      size_t tempCapacity = sumThreads * (size_t)N;
      if (tempCapacity < mulThreads)
          tempCapacity = mulThreads;
      const size_t weightCapacity = (weightCount - layerCount) + mulThreads;

      if (tempCapacity > kMaxIntIndex || weightCapacity > kMaxIntIndex)
      {
          fprintf(stderr, "Problem size too large for 32-bit kernel indices\n");
          return EXIT_FAILURE;
      }

      inputs  = (float*)malloc(inputCount * sizeof(float));
      weights = (float*)malloc(weightCount * sizeof(float));
      if (inputs == NULL || weights == NULL)
      {
          fprintf(stderr, "Out of host memory\n");
          free(inputs);
          free(weights);
          return EXIT_FAILURE;
      }

      // and fill with some arbitrary values
      for (size_t i = 0; i < inputCount; i++)
      {
          inputs[i] = 2;
      }
      for (size_t i = 0; i < weightCount; i++)
      {
          weights[i] = 3;
      }

      // allocate device memory (padded, see above) and zero it so the padding is defined
      float * devInputs = NULL;
      float * devTemp = NULL;
      float * devWeights = NULL;

      checkCuda(cudaMalloc((void**)&devInputs,  inputCapacity  * sizeof(float)), "cudaMalloc devInputs");
      checkCuda(cudaMalloc((void**)&devTemp,    tempCapacity   * sizeof(float)), "cudaMalloc devTemp");
      checkCuda(cudaMalloc((void**)&devWeights, weightCapacity * sizeof(float)), "cudaMalloc devWeights");

      checkCuda(cudaMemset(devInputs,  0, inputCapacity  * sizeof(float)), "cudaMemset devInputs");
      checkCuda(cudaMemset(devTemp,    0, tempCapacity   * sizeof(float)), "cudaMemset devTemp");
      checkCuda(cudaMemset(devWeights, 0, weightCapacity * sizeof(float)), "cudaMemset devWeights");

      // set kernel launch configuration
      const dim3 threadsMul((unsigned int)kThreadsPerBlock, 1);
      const dim3 blocksMul((unsigned int)mulBlocks, 1);
      const dim3 threadsSum((unsigned int)kThreadsPerBlock, 1);
      const dim3 blocksSum((unsigned int)sumBlocks, 1);

      checkCuda(cudaMemcpy(devInputs, inputs, inputCount * sizeof(float), cudaMemcpyHostToDevice),
                "copy inputs to device");
      checkCuda(cudaMemcpy(devWeights, weights, weightCount * sizeof(float), cudaMemcpyHostToDevice),
                "copy weights to device");

      for (int j = 0; j < L; j++)
      {
          // j*N*I cannot overflow: it is below weightCapacity, which was checked above.
          mulKernel<<<blocksMul, threadsMul>>>(devTemp, devInputs, devWeights, j*N*I, I);
          checkCuda(cudaGetLastError(), "mulKernel launch");

          sumKernel<<<blocksSum, threadsSum>>>(devInputs, devTemp, N);
          checkCuda(cudaGetLastError(), "sumKernel launch");
      }

      // Errors raised while the kernels run only surface at a synchronizing call.
      checkCuda(cudaDeviceSynchronize(), "kernel execution");

      checkCuda(cudaMemcpy(inputs, devInputs, inputCount * sizeof(float), cudaMemcpyDeviceToHost),
                "copy result to host");

      checkCuda(cudaFree(devInputs),  "cudaFree devInputs");
      checkCuda(cudaFree(devTemp),    "cudaFree devTemp");
      checkCuda(cudaFree(devWeights), "cudaFree devWeights");

      // inputs holds I elements (I and N are both read from argv[2]).
      printVector (inputs, I);

      free(inputs);
      free(weights);
      return 0;
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement