Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdio.h>
- #include <time.h>
- #include <stdlib.h>
- #include <stdint.h>
- #include "cuda_runtime.h"
- #include "curand_kernel.h"
- #include "device_launch_parameters.h"
- #define GRID_SIZE 8192
- #define BLOCK_SIZE 256
- #define MEGABYTE (1024 * 1024)
- #define THREADS_PER_HASH 8
- #define ITERATIONS 16
- #define FNV_PRIME 0x01000193
- #define fnv(x,y) ((x) * FNV_PRIME ^(y))
- #define random() (rand() * rand())
- #define CUDA_SAFE_CALL(call) \
- do { \
- cudaError_t err = call; \
- if (cudaSuccess != err) { \
- fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \
- __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \
- exit(EXIT_FAILURE); \
- } \
- } while (0)
- typedef union
- {
- uint32_t uint32s[128 / sizeof(uint32_t)];
- uint4 uint4s[128 / sizeof(uint4)];
- } hash128_t;
- __constant__ hash128_t * d_buffer;
- __constant__ unsigned int d_buffer_size;
- __device__ uint32_t fnv_reduce(uint4 v)
- {
- return fnv(fnv(fnv(v.x, v.y), v.z), v.w);
- }
- __global__ void test(int search, volatile unsigned int * num_results)
- {
- const unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
- const int thread_id = threadIdx.x & (THREADS_PER_HASH - 1);
- const int start_lane = threadIdx.x & ~(THREADS_PER_HASH - 1);
- unsigned int s = gid;
- unsigned int r;
- for (int i = 0; i < THREADS_PER_HASH; i++) {
- for (int j = 0; j < ITERATIONS; j++) {
- unsigned int index = __shfl(s, start_lane + i) % d_buffer_size;
- uint4 v = d_buffer[index].uint4s[thread_id];
- s = fnv_reduce(v);
- }
- s = __shfl(s, start_lane + i);
- if (i == thread_id) {
- r = s;
- }
- }
- if (search == r) {
- atomicInc(const_cast<unsigned int *>(num_results), UINT_MAX);
- __threadfence_system();
- }
- }
- int main(int argc, char *argv[])
- {
- unsigned int buffer_size;
- if (argc == 2)
- buffer_size = atoi(argv[1]) * MEGABYTE;
- else
- buffer_size = 1024 * MEGABYTE;
- unsigned int * buffer = (unsigned int *)malloc(buffer_size);
- printf("Creating buffer of size %u bytes...\n", buffer_size);
- srand(time(NULL));
- for (unsigned int i = 0; i < buffer_size / 4; i++) {
- buffer[i] = random();
- }
- hash128_t * h_buffer;
- volatile unsigned int * num_results;
- unsigned int h_buffer_size = buffer_size / sizeof(hash128_t);
- CUDA_SAFE_CALL(cudaSetDevice(0));
- CUDA_SAFE_CALL(cudaMallocHost((void**)&num_results, sizeof(unsigned int)));
- CUDA_SAFE_CALL(cudaMalloc((void**)&h_buffer, buffer_size));
- CUDA_SAFE_CALL(cudaMemcpy(reinterpret_cast<void*>(h_buffer), buffer, buffer_size, cudaMemcpyHostToDevice));
- CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_buffer, &h_buffer, sizeof(hash128_t *)));
- CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_buffer_size, &h_buffer_size, sizeof(unsigned int)));
- num_results[0] = 0;
- unsigned int target;
- target = random();
- cudaEvent_t start, stop;
- CUDA_SAFE_CALL(cudaEventCreate(&start));
- CUDA_SAFE_CALL(cudaEventCreate(&stop));
- cudaEventRecord(start, nullptr);
- int count = 16;
- for (int i = 0; i < count; i++) {
- test << <GRID_SIZE, BLOCK_SIZE >> >(target, num_results);
- CUDA_SAFE_CALL(cudaGetLastError());
- CUDA_SAFE_CALL(cudaDeviceSynchronize());
- }
- cudaEventRecord(stop, nullptr);
- cudaFree(h_buffer);
- float duration;
- cudaEventElapsedTime(&duration, start, stop);
- printf("%f GB/s\n", (1000.0f / duration) * count * sizeof(uint4) * THREADS_PER_HASH * ITERATIONS * GRID_SIZE * BLOCK_SIZE / static_cast<float>(1 << 30));
- return num_results[0];
- }
Advertisement
Add Comment
Please, Sign In to add comment