Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdlib.h>
- #include <stdio.h>
/* Problem dimensions; main() fills them from the command line. */
int N;              /* neurons per layer, parsed from argv[2] */
int L;              /* number of layers, parsed from argv[1] */
int I;              /* inputs per neuron, also parsed from argv[2] */

/* Host-side buffers; main() allocates and releases them. */
float *inputs;      /* I values fed to the first layer; receives the final result */
float *temp;        /* not referenced by any code in the visible source */
float *weights;     /* L * I * N weights */
/*
 * mulKernel: element-wise product of a repeated input vector with a window
 * of the weight array.
 *
 * Each thread derives a flat global index g from a 1D launch and stores
 *     output[g] = inputs[g % I] * weights[pos + g].
 *
 * output  - receives one product per launched thread
 * inputs  - vector of I values, reused cyclically via g % I
 * weights - weight array; the window read starts at element pos
 * pos     - offset of the first weight element to use
 * I       - length of the inputs vector (must be non-zero)
 *
 * There is no bounds guard: every launched thread performs its read and
 * write, so output must be valid for gridDim.x * blockDim.x elements and
 * weights for pos + gridDim.x * blockDim.x elements.
 */
__global__ void mulKernel(float *output, float *inputs, float *weights, int pos, int I)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    const float inputValue = inputs[gid % I];
    const float weightValue = weights[pos + gid];

    output[gid] = inputValue * weightValue;
}
/*
 * sumKernel: per-thread sum of N consecutive elements.
 *
 * Each thread derives a flat global index r from a 1D launch, adds up
 * input[r*N] .. input[r*N + N - 1] in ascending order, and stores the total
 * in output[r].
 *
 * output - receives one sum per launched thread
 * input  - array read in consecutive groups of N elements
 * N      - number of elements accumulated by each thread
 *
 * There is no bounds guard: every launched thread reads and writes, so
 * output must be valid for gridDim.x * blockDim.x elements and input for
 * N times that many.
 */
__global__ void sumKernel(float *output, float *input, int N)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;

    float total = 0;
    int col = 0;
    while (col < N)
    {
        total += input[row * N + col];
        ++col;
    }

    output[row] = total;
}
/*
 * printVector: writes the first N elements of p to stdout, one value per
 * line, formatted with "%f". Prints nothing when N is zero or negative.
 */
void printVector(const float *p, const int N)
{
    const float *cursor = p;
    for (int remaining = N; remaining > 0; --remaining)
    {
        printf("%f\n", *cursor);
        ++cursor;
    }
}
/*
 * activation: identity activation function; hands its argument back
 * unchanged.
 */
float activation(float x)
{
    const float passthrough = x;
    return passthrough;
}
/*
 * CUDA_CHECK: evaluates a CUDA runtime call and, if it did not return
 * cudaSuccess, reports the location and the runtime's error string on
 * stderr and terminates the process.
 */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaStatus_ = (call);                                   \
        if (cudaStatus_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(cudaStatus_));             \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * main: runs L layers of N neurons (each with I == N inputs) on the GPU and
 * prints the final N values.
 *
 * Usage: cuda <layers> <neurons&inputs>
 *
 * Every input starts at 2 and every weight at 3. For each layer, mulKernel
 * writes all input*weight products into a scratch buffer and sumKernel
 * reduces each group of N products to one value, which overwrites the input
 * vector used by the next layer.
 *
 * Returns 0 on success or when only the usage text is printed, and
 * EXIT_FAILURE for invalid arguments or a failed host allocation. A failing
 * CUDA call terminates the process through CUDA_CHECK.
 */
int main(int argc, char *argv[])
{
    if (argc < 3)
    {
        printf("Usage: cuda <layers> <neurons&inputs>\n");
        return 0;
    }

    L = atoi(argv[1]);
    N = atoi(argv[2]);
    I = N; /* each neuron has as many inputs as there are neurons per layer */

    /* atoi yields 0 for non-numeric text; reject that and negative values
       before they reach the allocations and the launch configuration. */
    if (L < 0 || N < 1)
    {
        fprintf(stderr, "Invalid arguments: <layers> must be >= 0 and <neurons&inputs> must be >= 1\n");
        return EXIT_FAILURE;
    }

    const unsigned int threadsPerBlock = 512;
    /* The kernels compute element indices in int, so every index that a
       launched thread can form has to stay within the int range. */
    const size_t maxIndex = 2147483647u;

    const size_t inputCount = (size_t)I;
    const size_t layerCount = (size_t)I * (size_t)N;
    if (layerCount > maxIndex)
    {
        fprintf(stderr, "Problem size too large\n");
        return EXIT_FAILURE;
    }
    const size_t weightCount = (size_t)L * layerCount;

    /* Launch configuration: round up to whole blocks (ceil-div). */
    const size_t mulBlocks = (layerCount + threadsPerBlock - 1) / threadsPerBlock;
    const size_t sumBlocks = (inputCount + threadsPerBlock - 1) / threadsPerBlock;
    const size_t mulThreads = mulBlocks * threadsPerBlock;
    const size_t sumThreads = sumBlocks * threadsPerBlock;

    /* mulKernel and sumKernel have no bounds guard and take no element
       count, so every launched thread accesses memory. The device buffers
       are therefore sized for the whole grid, not just the useful part:
         - devInputs : sumKernel writes one element per launched thread
         - devTemp   : mulKernel writes one element per launched thread and
                       sumKernel reads N elements per launched thread
         - devWeights: the last layer's window extends to the end of the
                       mulKernel grid */
    const size_t devInputCount = sumThreads;
    size_t devTempCount = sumThreads * (size_t)N;
    if (devTempCount < mulThreads)
    {
        devTempCount = mulThreads;
    }
    const size_t devWeightCount = weightCount + (mulThreads - layerCount);

    if (devTempCount > maxIndex || devWeightCount > maxIndex)
    {
        fprintf(stderr, "Problem size too large\n");
        return EXIT_FAILURE;
    }

    /* Host buffers; request at least one element so that a NULL result
       always means failure, even when there are no layers. */
    inputs = (float*)malloc(inputCount * sizeof(float));
    weights = (float*)malloc((weightCount > 0 ? weightCount : 1) * sizeof(float));
    if (inputs == NULL || weights == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(inputs);
        free(weights);
        return EXIT_FAILURE;
    }

    /* Fill with some arbitrary values. */
    for (size_t i = 0; i < inputCount; i++)
    {
        inputs[i] = 2.0f;
    }
    for (size_t i = 0; i < weightCount; i++)
    {
        weights[i] = 3.0f;
    }

    /* Allocate device memory and zero it so the padding holds defined
       values when the tail threads read it. */
    float *devInputs = NULL;
    float *devTemp = NULL;
    float *devWeights = NULL;
    CUDA_CHECK(cudaMalloc((void**)&devInputs, devInputCount * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&devTemp, devTempCount * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&devWeights, devWeightCount * sizeof(float)));
    CUDA_CHECK(cudaMemset(devInputs, 0, devInputCount * sizeof(float)));
    CUDA_CHECK(cudaMemset(devTemp, 0, devTempCount * sizeof(float)));
    CUDA_CHECK(cudaMemset(devWeights, 0, devWeightCount * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(devInputs, inputs, inputCount * sizeof(float), cudaMemcpyHostToDevice));
    if (weightCount > 0)
    {
        CUDA_CHECK(cudaMemcpy(devWeights, weights, weightCount * sizeof(float), cudaMemcpyHostToDevice));
    }

    const dim3 threads(threadsPerBlock, 1);
    const dim3 blocksMul((unsigned int)mulBlocks, 1);
    const dim3 blocksSum((unsigned int)sumBlocks, 1);

    /* Both kernels are issued to the default stream, so each launch runs
       after the previous one has finished. */
    for (int j = 0; j < L; j++)
    {
        const int layerOffset = (int)((size_t)j * layerCount);

        mulKernel<<<blocksMul, threads>>>(devTemp, devInputs, devWeights, layerOffset, I);
        CUDA_CHECK(cudaGetLastError());

        sumKernel<<<blocksSum, threads>>>(devInputs, devTemp, N);
        CUDA_CHECK(cudaGetLastError());
    }

    /* The blocking copy waits for the kernels and reports any error they
       raised while executing. */
    CUDA_CHECK(cudaMemcpy(inputs, devInputs, inputCount * sizeof(float), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(devInputs));
    CUDA_CHECK(cudaFree(devTemp));
    CUDA_CHECK(cudaFree(devWeights));

    printVector(inputs, N);

    free(inputs);
    free(weights);
    inputs = NULL;
    weights = NULL;

    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement