// Untitled

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

// Problem dimensions; main() assigns all three from the command line.
int N;            // taken from argv[2]
int L;            // taken from argv[1]
int I;            // taken from argv[2], so it always equals N in main()

// Host-side buffers.
float * inputs;   // I values, allocated and filled by main()
float * temp;     // not referenced anywhere in the visible part of this file
float * weights;  // L*I*N values, allocated and filled by main()

/**
 * Multiplies a slice of the weight buffer element-wise with the input vector,
 * repeating the input vector every I elements:
 *
 *     output[idx] = inputs[idx % I] * weights[pos + idx]
 *
 * where idx is the global thread index of a 1-D grid of 1-D blocks
 * (one thread per output element).
 *
 * @param output   destination buffer, written at index idx
 * @param inputs   input vector, read at index idx % I
 * @param weights  weight buffer, read at index pos + idx
 * @param pos      index of the first weight to use
 * @param I        length of the input vector; must not be zero (used as a divisor)
 * @param count    number of output elements to produce. Threads whose idx is
 *                 >= count return without touching memory. The default (-1)
 *                 disables the check, which is the historical behaviour:
 *                 every launched thread then reads and writes, so the buffers
 *                 must cover gridDim.x * blockDim.x elements.
 */
__global__ void mulKernel ( float * output, const float * inputs, const float * weights, int pos, int I, int count = -1 )
{
   const int idx = blockIdx.x * blockDim.x + threadIdx.x;

   // The grid rarely divides the data evenly; skip the surplus threads of the
   // last block when the caller told us how many elements exist.
   if (count >= 0 && idx >= count)
      return;

   output [idx] = inputs [idx%I] * weights [pos+idx];
}

/**
 * Sums consecutive groups of N values: thread idx adds up
 * input[idx*N] .. input[idx*N + N-1] and stores the total in output[idx].
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per output element.
 * Neighbouring threads read addresses that are N floats apart.
 *
 * @param output  destination buffer, written at index idx
 * @param input   source buffer holding N values per output element
 * @param N       number of values added per output element; for N <= 0 the
 *                loop body never runs and 0 is stored
 * @param count   number of output elements to produce. Threads whose idx is
 *                >= count return without touching memory. The default (-1)
 *                disables the check, which is the historical behaviour:
 *                every launched thread then reads and writes, so the buffers
 *                must cover gridDim.x * blockDim.x output elements.
 */
__global__ void sumKernel ( float * output, const float * input, int N, int count = -1 )
{
   const int idx = blockIdx.x * blockDim.x + threadIdx.x;

   // Skip the surplus threads of the last block when the caller gave a count.
   if (count >= 0 && idx >= count)
      return;

   // Row offset in size_t: idx*N can exceed the range of int for large inputs.
   const size_t base = (size_t)idx * (size_t)N;

   float sum = 0.0f;

   for(int l=0;l<N;l++)
   {
     sum += input[base + (size_t)l];
   }

   output[idx] = sum;
}

// Prints the first N elements of p to stdout, one value per line, formatted
// with printf's "%f". Nothing is printed when N is zero or negative.
void printVector (const float *p, const int N) {
    const float *cursor = p;
    for (int left = N; left > 0; --left)
    {
        printf("%f\n", *cursor);
        ++cursor;
    }
}

// Identity activation: hands back its argument unchanged.
float activation(float x)
{
  const float result = x;
  return result;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk ( cudaError_t status, const char * what )
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error while %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Command-line driver: cuda <layers> <neurons&inputs>
 *
 * Fills an input vector with 2 and all weights with 3, then for every layer
 * runs mulKernel followed by sumKernel on the GPU, feeding the sums back in
 * as the next layer's input. The final vector is printed with printVector.
 *
 * argv[1] is the number of layers (L); argv[2] is used for both N and I.
 * Both must be positive integers.
 *
 * Returns EXIT_SUCCESS after printing the result, EXIT_FAILURE on a usage
 * error, an invalid or too-large size, a failed host allocation, or any CUDA
 * error.
 */
int main(int argc, char *argv[])
{
    if (argc < 3)
    {
        printf("Usage: cuda <layers> <neurons&inputs>\n");
        return EXIT_FAILURE;
    }

    L = atoi(argv[1]);
    N = atoi(argv[2]);
    I = N;   // the same argument supplies both dimensions

    // atoi yields 0 for non-numeric text, so this also rejects garbage input.
    if (L < 1 || N < 1)
    {
        fprintf(stderr, "Error: <layers> and <neurons&inputs> must be positive integers\n");
        return EXIT_FAILURE;
    }

    // All element counts are computed in size_t so they cannot wrap the way
    // the int expression L*I*N could.
    const size_t threadsPerBlock = 512;
    const size_t intMax     = (size_t)INT_MAX;
    const size_t inputCount = (size_t)I;
    const size_t layerCount = inputCount * (size_t)N;   // weights per layer

    // Ceiling division: just enough blocks to cover every element.
    const size_t blocksMul = (layerCount + threadsPerBlock - 1) / threadsPerBlock;
    const size_t blocksSum = (inputCount + threadsPerBlock - 1) / threadsPerBlock;

    // mulKernel and sumKernel are launched without an element count, so every
    // launched thread -- including the surplus threads of the last block --
    // reads and writes. The device buffers are therefore padded to cover one
    // element (or one row of N elements for sumKernel's input) per launched
    // thread; the padding is zero-filled and its results are never copied back.
    const size_t mulThreads     = blocksMul * threadsPerBlock;
    const size_t devInputCount  = blocksSum * threadsPerBlock;
    const size_t sumReadCount   = devInputCount * (size_t)N;
    const size_t devTempCount   = (mulThreads > sumReadCount) ? mulThreads : sumReadCount;

    // The kernels index with int, so every index they can form must fit in one.
    if (layerCount > intMax / (size_t)L || devTempCount > intMax)
    {
        fprintf(stderr, "Error: problem size is too large for the kernels' int indexing\n");
        return EXIT_FAILURE;
    }
    const size_t weightCount    = (size_t)L * layerCount;
    const size_t devWeightCount = weightCount + (mulThreads - layerCount);
    if (devWeightCount > intMax)
    {
        fprintf(stderr, "Error: problem size is too large for the kernels' int indexing\n");
        return EXIT_FAILURE;
    }

    inputs  = (float*)malloc(inputCount * sizeof(float));
    weights = (float*)malloc(weightCount * sizeof(float));
    if (inputs == NULL || weights == NULL)
    {
        fprintf(stderr, "Error: out of host memory\n");
        free(inputs);
        free(weights);
        inputs = NULL;
        weights = NULL;
        return EXIT_FAILURE;
    }

    // and fill with some arbitrary values
    for (size_t i = 0; i < inputCount; i++)
    {
        inputs[i] = 2.0f;
    }
    for (size_t i = 0; i < weightCount; i++)
    {
        weights[i] = 3.0f;
    }

    // allocate device memory
    float * devInputs = NULL;
    float * devTemp = NULL;
    float * devWeights = NULL;

    // Each step runs only if all previous ones succeeded (short-circuit &&).
    bool ok =
        cudaOk(cudaMalloc((void**)&devInputs,  devInputCount  * sizeof(float)), "allocating device inputs")  &&
        cudaOk(cudaMalloc((void**)&devTemp,    devTempCount   * sizeof(float)), "allocating device scratch") &&
        cudaOk(cudaMalloc((void**)&devWeights, devWeightCount * sizeof(float)), "allocating device weights") &&
        cudaOk(cudaMemset(devInputs,  0, devInputCount  * sizeof(float)), "clearing device inputs")  &&
        cudaOk(cudaMemset(devTemp,    0, devTempCount   * sizeof(float)), "clearing device scratch") &&
        cudaOk(cudaMemset(devWeights, 0, devWeightCount * sizeof(float)), "clearing device weights") &&
        cudaOk(cudaMemcpy(devInputs,  inputs,  inputCount  * sizeof(float), cudaMemcpyHostToDevice), "copying inputs to device") &&
        cudaOk(cudaMemcpy(devWeights, weights, weightCount * sizeof(float), cudaMemcpyHostToDevice), "copying weights to device");

    // set kernel launch configuration
    const dim3 blockDims((unsigned int)threadsPerBlock, 1);
    const dim3 gridMul((unsigned int)blocksMul, 1);
    const dim3 gridSum((unsigned int)blocksSum, 1);

    for (int j = 0; ok && j < L; j++)
    {
        // j*N*I cannot overflow: L*I*N was checked against INT_MAX above.
        mulKernel<<<gridMul, blockDims>>>(devTemp, devInputs, devWeights, j*N*I, I);
        ok = cudaOk(cudaGetLastError(), "launching mulKernel");

        if (ok)
        {
            sumKernel<<<gridSum, blockDims>>>(devInputs, devTemp, N);
            ok = cudaOk(cudaGetLastError(), "launching sumKernel");
        }
    }

    // This blocking copy waits for the kernels and also surfaces any error
    // they raised while executing.
    ok = ok && cudaOk(cudaMemcpy(inputs, devInputs, inputCount * sizeof(float), cudaMemcpyDeviceToHost),
                      "copying results to host");

    // cudaFree accepts NULL, so this is safe after a partial allocation failure.
    cudaFree(devInputs);
    cudaFree(devTemp);
    cudaFree(devWeights);

    if (ok)
    {
        printVector(inputs, I);
    }

    free(inputs);
    free(weights);
    inputs = NULL;
    weights = NULL;

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}