Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <stdlib.h>
- #include <sys/time.h>
// Number of float elements in the input array.
- #define N 1024
// Threads per block used for the kernel launch in findMax().
- #define BLOCK_SIZE 1024
// Host-side input array; filled with random values in findMax().
- float hArray[N];
// Device copy of hArray; allocated in prologue(), freed in epilogue().
- float *dArray;
// Number of thread blocks for the kernel launch; computed in findMax().
- int blocks;
// Host pointer to the float that receives the maximum; findMax() points it
// at one of its local variables before calling prologue().
- float* result;
// Device-side counterpart of *result; allocated in prologue(), freed in epilogue().
- float* dResult;
/* Report a failed CUDA runtime call on stderr and terminate the program. */
static void checkTransfer(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Allocate the device buffers and upload the host data.
 *
 * Reads the globals hArray and result (result must already point at a valid
 * host float) and sets the globals dArray and dResult. Any CUDA failure is
 * reported on stderr and terminates the program.
 */
void prologue(void) {
    checkTransfer(cudaMalloc((void**) &dArray, sizeof(hArray)), "cudaMalloc(dArray)");
    checkTransfer(cudaMemcpy(dArray, hArray, sizeof(hArray), cudaMemcpyHostToDevice),
                  "upload of hArray");
    /* result is a float*, so sizeof(result) would be the pointer size (8 bytes
     * on 64-bit hosts) while it points at a single float. Use the pointee size
     * so the copy never runs past the caller's variable. */
    checkTransfer(cudaMalloc((void**) &dResult, sizeof(*result)), "cudaMalloc(dResult)");
    checkTransfer(cudaMemcpy(dResult, result, sizeof(*result), cudaMemcpyHostToDevice),
                  "upload of result");
}

/*
 * Download the device data back to the host and release the device buffers.
 *
 * Copies dArray into hArray and dResult into *result, then frees both device
 * allocations. Any CUDA failure is reported on stderr and terminates the
 * program.
 */
void epilogue(void) {
    checkTransfer(cudaMemcpy(hArray, dArray, sizeof(hArray), cudaMemcpyDeviceToHost),
                  "download of hArray");
    checkTransfer(cudaFree(dArray), "cudaFree(dArray)");
    /* Pointee size, not pointer size: writing sizeof(result) bytes here would
     * overwrite memory following the caller's float. */
    checkTransfer(cudaMemcpy(result, dResult, sizeof(*result), cudaMemcpyDeviceToHost),
                  "download of result");
    checkTransfer(cudaFree(dResult), "cudaFree(dResult)");
}
/*
 * Fold the maximum of A[0..N-1] into *result.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element; threads whose
 * global index is >= N do nothing. *result must be initialised by the caller
 * before the launch, since its initial value takes part in the maximum.
 *
 * Parameters:
 *   A      - device pointer to at least N floats (read only).
 *   result - device pointer to one float, updated atomically.
 *
 * The float is updated through integer atomics on its bit pattern. For floats
 * with the sign bit clear, signed-integer order matches float order, so
 * atomicMax applies. For floats with the sign bit set, unsigned-integer order
 * is the reverse of float order, so atomicMin on the unsigned view applies.
 * Using atomicMax alone mis-orders negative values. NaNs get no special
 * treatment. Every in-range thread issues one atomic on the same address.
 */
__global__ void calcDevice(float *A, float* result) {
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    if (x < N) {
        int bits = __float_as_int(A[x]);
        if (bits >= 0) {
            /* Sign bit clear: larger int pattern == larger float. */
            atomicMax((int*) result, bits);
        } else {
            /* Sign bit set: smaller unsigned pattern == larger (less negative) float,
             * and any non-negative value already stored compares smaller and is kept. */
            atomicMin((unsigned int*) result, (unsigned int) bits);
        }
    }
}
/*
 * Fill hArray with N pseudo-random values in [0, 1], compute their maximum
 * on the GPU with calcDevice, and print it as "Max = <value>".
 *
 * Uses the globals hArray, dArray, dResult, blocks and result. If the kernel
 * launch or its execution fails, the error is reported on stderr and no
 * maximum is printed.
 */
void findMax(void) {
    srand(time(NULL));
    for (int i = 0; i < N; i++) {
        hArray[i] = (float) rand() / RAND_MAX;
    }

    /* Seed for the reduction; all inputs are >= 0, so 0 is a safe start. */
    float max = 0.0F;
    result = &max;
    prologue();

    /* Ceil-division so a partial last block is still launched. */
    blocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    calcDevice<<<blocks, BLOCK_SIZE>>>(dArray, dResult);

    /* A launch does not return an error itself: configuration errors show up
     * in cudaGetLastError(), execution errors at the next synchronisation.
     * cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize(). */
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "calcDevice failed: %s\n", cudaGetErrorString(status));
    }

    epilogue();
    /* max goes out of scope on return; do not leave the global dangling. */
    result = NULL;

    if (status != cudaSuccess) {
        return;
    }
    printf("Max = %f\n", max);
}
/*
 * Program entry point: verify that a CUDA device is usable, then run findMax().
 *
 * Returns 0 on success, 1 if the device query fails or reports no devices.
 * Command-line arguments are not used.
 */
int main(int argc, char** argv) {
    (void) argc;
    (void) argv;

    /* Initialised so the value is defined even if the query itself fails. */
    int devCnt = 0;
    cudaError_t status = cudaGetDeviceCount(&devCnt);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    if (devCnt == 0) {
        /* Not an errno condition, so perror() would append an unrelated
         * system error string; print the message directly instead. */
        fprintf(stderr, "No CUDA devices available -- exiting.\n");
        return 1;
    }
    findMax();
    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement