Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // main.h file
- #ifndef ODINN_CUDA_MAIN_H
- #define ODINN_CUDA_MAIN_H
// Element count of every vector handled by this program.
#define ARR_SIZE 100
// Number of kernel launches performed inside the timed loop.
#define ITER_AMOUNT 1
// Short alias for the status enum returned by the CUDA runtime calls used here.
typedef enum cudaError cudaError_t;
// Terminates the program when a CUDA runtime call reports a failure.
//   err  - status code returned by the CUDA runtime call
//   file - source file of the call site (normally __FILE__)
//   line - source line of the call site (normally __LINE__)
// For any status other than cudaSuccess the runtime's error string and the
// call site are written to stderr and the process exits with EXIT_FAILURE.
static void HandleError(cudaError_t err, const char *file, int line) {
    // cudaSuccess is the runtime-API success value that matches cudaError_t;
    // CUDA_SUCCESS belongs to the driver API's separate result enum.
    if (err != cudaSuccess) {
        fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
// Checks the status of a CUDA runtime call and records where it was made.
#define HANDLE_ERROR(err) (HandleError((err), __FILE__, __LINE__))
// Stores the current clock() reading in var.
#define GET_CURRENT_CLOCKS(var) ((var) = clock())
// Converts the clock() interval from start to end into seconds and stores it
// in result. Each argument is parenthesized so that expression arguments
// (for example `t0 + offset`) expand with the intended precedence.
#define GET_CLOCK_INTERVAL_SEC(start, end, result) \
    ((result) = ((double)(end) - (double)(start)) / (double)CLOCKS_PER_SEC)
// Builds the thread-block shape for a launch. y and z default to 1 because a
// launch dimension of 0 is not a valid execution configuration.
__host__ dim3 requestBlockSize(int x, int y=1, int z=1);
// Builds the grid shape for a launch. y and z default to 1 for the same
// reason as in requestBlockSize.
__host__ dim3 requestNumBlocks(int x, int y=1, int z=1);
// Allocates the three host vectors and the three device vectors (ARR_SIZE
// unsigned ints each), returns them through the pointer arguments, and fills
// the host vectors a and b with random values.
__host__ void allocateVectors(unsigned int **a_host, unsigned int **b_host, unsigned int **c_host, unsigned int **a_device, unsigned int **b_device, unsigned int **c_device);
// Kernel: writes a[i] + b[i] into result[i] for the first n elements.
__global__ void addVectors(unsigned int* a, unsigned int* b, unsigned int* result, int n);
// Frees the three host vectors and the three device vectors.
__host__ void cleanUp(unsigned int *a_host, unsigned int *b_host, unsigned int *c_host, unsigned int *a_device, unsigned int *b_device, unsigned int *c_device);
- #endif
- // End of main.h file
- // kernel.cu file
- #include <cuda.h>
- #include <stdio.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <time.h>
- #include <math.h>
- #include "main.h"
// Properties of device 0; filled in by main() and read by requestBlockSize().
static cudaDeviceProp prop;

// Adds two random vectors on the GPU, times the kernel launches and prints
// the first element of the result. Returns 0 on success; a failing CUDA call
// terminates the process through HANDLE_ERROR.
int main(void) {
    // Start lazy init now so the first cudaMalloc will run faster.
    HANDLE_ERROR(cudaSetDevice(0));
    HANDLE_ERROR(cudaFree(0));

    unsigned int *a_host, *b_host, *c_host;
    unsigned int *a_device, *b_device, *c_device;
    double delta_in_sec;
    size_t size = sizeof(unsigned int) * ARR_SIZE;
    clock_t start_clock, end_clock;

    HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));

    // y and z are passed explicitly as 1: a zero in any launch dimension
    // makes the launch fail with an invalid-configuration error.
    dim3 block_size = requestBlockSize(1024, 1, 1);
    // Round up so that a partially filled last block still covers the tail
    // of the array when ARR_SIZE is not a multiple of the block size.
    int blocks_requested = (int)((ARR_SIZE + block_size.x - 1) / block_size.x);
    dim3 n_blocks = requestNumBlocks(blocks_requested > 0 ? blocks_requested : 1, 1, 1);

    fprintf(stdout, "Allocating vectors ...\n");
    allocateVectors(&a_host, &b_host, &c_host, &a_device, &b_device, &c_device);

    fprintf(stdout, "Copying to device ...\n");
    HANDLE_ERROR(cudaMemcpy(a_device, a_host, size, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(b_device, b_host, size, cudaMemcpyHostToDevice));

    fprintf(stdout, "Running kernel ...\n");
    GET_CURRENT_CLOCKS(start_clock);
    for (int i = 0; i < ITER_AMOUNT; i++) {
        addVectors<<<n_blocks, block_size>>>(a_device, b_device, c_device, ARR_SIZE);
        // A launch returns no status itself; a rejected configuration is
        // only visible through cudaGetLastError().
        HANDLE_ERROR(cudaGetLastError());
    }
    // Launches are asynchronous: wait for the kernels to finish so that the
    // measured interval covers their execution, not just the launch calls.
    HANDLE_ERROR(cudaDeviceSynchronize());
    GET_CURRENT_CLOCKS(end_clock);
    GET_CLOCK_INTERVAL_SEC(start_clock, end_clock, delta_in_sec);

    fprintf(stdout, "Runtime of kernel %d times on arrays in length %d took %f seconds\n"
                    "Copying results back to host ...\n", ITER_AMOUNT, ARR_SIZE, delta_in_sec);
    HANDLE_ERROR(cudaMemcpy(c_host, c_device, size, cudaMemcpyDeviceToHost));

    // Print "=" when the first element holds the expected sum, "!=" otherwise.
    fprintf(stdout, "%u + %u %s %u\n", a_host[0], b_host[0],
            (a_host[0] + b_host[0] == c_host[0]) ? "=" : "!=", c_host[0]);

    fprintf(stdout, "Cleaning up ...\n");
    cleanUp(a_host, b_host, c_host, a_device, b_device, c_device);
    fprintf(stdout, "Done!\n");
    return 0;
}
// Limits one requested block dimension to the range [1, limit]; a limit
// below 1 also yields 1.
static unsigned int clampBlockDim(int requested, int limit) {
    int value = requested < limit ? requested : limit;
    return (unsigned int)(value > 0 ? value : 1);
}

// Returns a block shape in which each requested dimension is capped at the
// matching prop.maxThreadsDim entry and raised to at least 1.
//   x, y, z - requested threads per block along each axis
// Values below 1 (including the 0 defaults of the prototype) become 1,
// because a zero launch dimension is not a valid execution configuration.
// Reads the file-level `prop`, which main() fills before calling this.
// The product x*y*z is not checked against the per-block thread limit.
__host__ dim3 requestBlockSize(int x, int y, int z) {
    dim3 blocksize(
        clampBlockDim(x, prop.maxThreadsDim[0]),
        clampBlockDim(y, prop.maxThreadsDim[1]),
        clampBlockDim(z, prop.maxThreadsDim[2])
    );
    return blocksize;
}
// Returns a grid shape built from the requested block counts.
//   x, y, z - requested number of blocks along each axis
// Values below 1 (including the 0 defaults of the prototype) become 1,
// because a zero launch dimension is not a valid execution configuration and
// a negative value would wrap around when stored in the unsigned dim3 fields.
__host__ dim3 requestNumBlocks(int x, int y, int z) {
    dim3 numblocks(x > 0 ? x : 1, y > 0 ? y : 1, z > 0 ? z : 1);
    return numblocks;
}
// Allocates the working buffers and fills the two input vectors.
//   a_host, b_host, c_host       - receive malloc'ed host arrays
//   a_device, b_device, c_device - receive cudaMalloc'ed device arrays
// Every array holds ARR_SIZE unsigned ints. The host arrays a and b are
// filled with rand() % ARR_SIZE; c_host and the device arrays are left
// uninitialized. A failed host or device allocation terminates the process.
__host__ void allocateVectors(unsigned int **a_host, unsigned int **b_host, unsigned int **c_host, unsigned int **a_device, unsigned int **b_device, unsigned int **c_device) {
    size_t size = sizeof(unsigned int) * ARR_SIZE;

    *a_host = (unsigned int *)malloc(size);
    *b_host = (unsigned int *)malloc(size);
    *c_host = (unsigned int *)malloc(size);
    // The fill loop below writes through these pointers, so stop here
    // instead of dereferencing NULL.
    if (*a_host == NULL || *b_host == NULL || *c_host == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        exit(EXIT_FAILURE);
    }

    HANDLE_ERROR(cudaMalloc((void **)a_device, size));
    HANDLE_ERROR(cudaMalloc((void **)b_device, size));
    HANDLE_ERROR(cudaMalloc((void **)c_device, size));

    // Seed from the wall clock so each run gets different inputs.
    srand((unsigned int)time(NULL));
    for (int i = 0; i < ARR_SIZE; i++) {
        (*a_host)[i] = rand() % ARR_SIZE;
        (*b_host)[i] = rand() % ARR_SIZE;
    }
}
// Element-wise sum kernel: result[i] = a[i] + b[i] for 0 <= i < n.
//   a, b   - input arrays with at least n elements
//   result - output array with at least n elements
//   n      - number of elements to process; nothing is written when n <= 0
// Launch layout: one-dimensional (only the x components of the block and
// grid are used). Uses no shared memory. Each thread starts at its global x
// index and advances by the total number of launched threads, so every
// element is covered even when the grid is smaller than n.
__global__ void addVectors(unsigned int* a, unsigned int* b, unsigned int* result, int n) {
    if (n <= 0)
        return;
    // size_t arithmetic keeps blockIdx.x * blockDim.x from overflowing int.
    const size_t count = (size_t)n;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < count; idx += stride)
        result[idx] = a[idx] + b[idx];
}
// Releases every buffer handed in: the three host arrays through free() and
// then the three device arrays through cudaFree(). A failing cudaFree
// terminates the process via HANDLE_ERROR.
__host__ void cleanUp(unsigned int *a_host, unsigned int *b_host, unsigned int *c_host, unsigned int *a_device, unsigned int *b_device, unsigned int *c_device) {
    unsigned int *host_buffers[3] = { a_host, b_host, c_host };
    unsigned int *device_buffers[3] = { a_device, b_device, c_device };
    int k;

    // Host memory first, in a, b, c order.
    for (k = 0; k < 3; ++k)
        free(host_buffers[k]);

    // Then device memory, in the same order.
    for (k = 0; k < 3; ++k)
        HANDLE_ERROR(cudaFree(device_buffers[k]));
}
- // End of kernel.cu file
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement