Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #define _XOPEN_SOURCE 700
- #include <string.h>
- #include <stdlib.h>
- #include <stdio.h>
- #include <stdint.h>
- #include <sodium.h>
- #include <unistd.h>
- #include <time.h>
- #include <arpa/inet.h>
- #include <assert.h>
- #include <math.h>
- #include <CL/cl.h>
- #include <malloc.h>
- double get_ttime() {
- struct timespec ts;
- clock_gettime(CLOCK_MONOTONIC, &ts);
- return ts.tv_sec + ts.tv_nsec / 1000000000.0;
- }
- // COMPILE INSTRUCTIONS:
- // gcc-4.9 time-threaded-equihash-pointers.c equihash-pointers-working-minimal-memory-currently-fastest.c --pedantic -Ofast -Wall -std=c11 -o test-equihash -lsodium -lpthread -O3 -march=native -mtune=native -mavx -ftree-vectorize
- // theoretical min: 2*2^20*(6*(8+32)
- // 2 buckets * NUM_BUCKETS*(BUCKET_SIZE + NUM_ELEMENTS*(ELEMENT_SIZE))
- /*
- NOTE: only works for NUM_COLLISION_BITS < 24!
- */
- #define EQUIHASH_N 200
- #define EQUIHASH_K 9
- #define NUM_COLLISION_BITS (EQUIHASH_N / (EQUIHASH_K + 1))
- #define NUM_INDICES (1 << EQUIHASH_K)
- #define NUM_VALUES (1 << (NUM_COLLISION_BITS+1))
- #define NUM_BUCKETS (1 << NUM_COLLISION_BITS)
- #define DIGEST_SIZE 25
- typedef struct element element_t;
- typedef uint64_t digest_t[(DIGEST_SIZE + sizeof(uint64_t) - 1) / sizeof(uint64_t)];
- struct element {
- uint32_t digest_index;
- uint32_t parent_bucket_data;
- };
- typedef struct bucket {
- unsigned size;
- element_t data[18];
- } bucket_t;
- void hexout(unsigned char* digest_result) {
- for(unsigned i = 0; i < 4; ++i) {
- for(int j = 0; j < 8; ++j) {
- int c = digest_result[i*8 + j];
- printf("%2X", c);
- }
- }
- printf("\n");
- }
- uint32_t mask_collision_bits(uint8_t* data, size_t bit_index) {
- uint32_t n = ((*data << (bit_index)) & 0xff) << 12;
- n |= ((*(++data)) << (bit_index+4));
- n |= ((*(++data)) >> (4-bit_index));
- return n;
- }
- void get_element_parent_bucket_data(element_t* src, uint32_t* parent_bucket_index, uint8_t* a, uint8_t* b);
- //uint32_t mask_collision_bits(uint8_t* data, size_t bit_index);
- void decompress_indices(uint32_t* dst_uncompressed_indices, bucket_t* buckets, element_t* old_src);
- void produce_solutions(uint32_t* dst_solutions, uint32_t* n_solutions, bucket_t* buckets, digest_t* src_digests);
- int compare_indices32(uint32_t* a, uint32_t* b, size_t n_current_indices) {
- for(size_t i = 0; i < n_current_indices; ++i, ++a, ++b) {
- if(*a < *b) {
- return -1;
- } else if(*a > *b) {
- return 1;
- } else {
- return 0;
- }
- }
- return 0;
- }
- void normalize_indices(uint32_t* indices) {
- for(size_t step_index = 0; step_index < EQUIHASH_K; ++step_index) {
- for(size_t i = 0; i < NUM_INDICES; i += (1 << (step_index+1))) {
- if(compare_indices32(indices+i, indices+i+(1 << step_index), (1 << step_index)) > 0) {
- uint32_t tmp_indices[(1 << step_index)];
- memcpy(tmp_indices, indices+i, (1 << step_index)*sizeof(uint32_t));
- memcpy(indices+i, indices+i+(1 << step_index), (1 << step_index)*sizeof(uint32_t));
- memcpy(indices+i+(1 << step_index), tmp_indices, (1 << step_index)*sizeof(uint32_t));
- }
- }
- }
- }
- void xor_elements(uint8_t* dst, uint8_t* a, uint8_t* b) {
- ((uint64_t*)dst)[0] = ((uint64_t*)a)[0] ^ ((uint64_t*)b)[0];
- ((uint64_t*)dst)[1] = ((uint64_t*)a)[1] ^ ((uint64_t*)b)[1];
- ((uint64_t*)dst)[2] = ((uint64_t*)a)[2] ^ ((uint64_t*)b)[2];
- dst[24] = a[24] ^ b[24];
- }
- void hash(uint8_t* dst, uint32_t in, const crypto_generichash_blake2b_state* digest) {
- uint32_t tmp_in = in/2;
- crypto_generichash_blake2b_state new_digest = *digest;
- crypto_generichash_blake2b_update(&new_digest, (uint8_t*)&tmp_in, sizeof(uint32_t));
- crypto_generichash_blake2b_final(&new_digest, (uint8_t*)dst, 2*DIGEST_SIZE);
- }
- int is_indices_valid(uint32_t* indices, const crypto_generichash_blake2b_state* digest) {
- uint8_t digest_results[NUM_INDICES][DIGEST_SIZE];
- memset(digest_results, '\0', NUM_INDICES*DIGEST_SIZE);
- for(size_t i = 0; i < NUM_INDICES; ++i) {
- uint8_t digest_tmp[2*DIGEST_SIZE];
- hash(digest_tmp, indices[i], digest);
- memcpy(digest_results[i], digest_tmp+((indices[i] % 2)*EQUIHASH_N/8), DIGEST_SIZE);
- }
- for(size_t step_index = 0; step_index < EQUIHASH_K; ++step_index) {
- for(size_t i = 0; i < (NUM_INDICES >> step_index); i += 2) {
- uint8_t digest_tmp[DIGEST_SIZE];
- xor_elements(digest_tmp, digest_results[i], digest_results[i+1]);
- size_t start_bit = step_index*NUM_COLLISION_BITS;
- size_t byte_index = start_bit / 8;
- size_t bit_index = start_bit % 8;
- if(!mask_collision_bits(((uint8_t*)digest_tmp) + byte_index, bit_index) == 0) {
- return 0;
- }
- memcpy(digest_results[i / 2], digest_tmp, DIGEST_SIZE);
- }
- }
- size_t start_bit = EQUIHASH_K*NUM_COLLISION_BITS;
- size_t byte_index = start_bit / 8;
- size_t bit_index = start_bit % 8;
- return mask_collision_bits(((uint8_t*)digest_results[0]) + byte_index, bit_index) == 0;
- }
- void build(cl_device_id device_id, cl_program program) {
- const char* options = "";
- cl_int ret_val = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
- // avoid abortion due to CL_BILD_PROGRAM_FAILURE
- cl_build_status build_status;
- clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL);
- char *build_log;
- size_t ret_val_size;
- clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
- //build_log = calloc(10000, 1);
- clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, ret_val_size, NULL, NULL);
- build_log[ret_val_size] = '\0';
- //fprintf(stderr, "%s\n\n", build_log);
- }
- const char *get_error_string(cl_int error)
- {
- switch(error){
- // run-time and JIT compiler errors
- case 0: return "CL_SUCCESS";
- case -1: return "CL_DEVICE_NOT_FOUND";
- case -2: return "CL_DEVICE_NOT_AVAILABLE";
- case -3: return "CL_COMPILER_NOT_AVAILABLE";
- case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
- case -5: return "CL_OUT_OF_RESOURCES";
- case -6: return "CL_OUT_OF_HOST_MEMORY";
- case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
- case -8: return "CL_MEM_COPY_OVERLAP";
- case -9: return "CL_IMAGE_FORMAT_MISMATCH";
- case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
- case -11: return "CL_BUILD_PROGRAM_FAILURE";
- case -12: return "CL_MAP_FAILURE";
- case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
- case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
- case -15: return "CL_COMPILE_PROGRAM_FAILURE";
- case -16: return "CL_LINKER_NOT_AVAILABLE";
- case -17: return "CL_LINK_PROGRAM_FAILURE";
- case -18: return "CL_DEVICE_PARTITION_FAILED";
- case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
- // compile-time errors
- case -30: return "CL_INVALID_VALUE";
- case -31: return "CL_INVALID_DEVICE_TYPE";
- case -32: return "CL_INVALID_PLATFORM";
- case -33: return "CL_INVALID_DEVICE";
- case -34: return "CL_INVALID_CONTEXT";
- case -35: return "CL_INVALID_QUEUE_PROPERTIES";
- case -36: return "CL_INVALID_COMMAND_QUEUE";
- case -37: return "CL_INVALID_HOST_PTR";
- case -38: return "CL_INVALID_MEM_OBJECT";
- case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
- case -40: return "CL_INVALID_IMAGE_SIZE";
- case -41: return "CL_INVALID_SAMPLER";
- case -42: return "CL_INVALID_BINARY";
- case -43: return "CL_INVALID_BUILD_OPTIONS";
- case -44: return "CL_INVALID_PROGRAM";
- case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
- case -46: return "CL_INVALID_KERNEL_NAME";
- case -47: return "CL_INVALID_KERNEL_DEFINITION";
- case -48: return "CL_INVALID_KERNEL";
- case -49: return "CL_INVALID_ARG_INDEX";
- case -50: return "CL_INVALID_ARG_VALUE";
- case -51: return "CL_INVALID_ARG_SIZE";
- case -52: return "CL_INVALID_KERNEL_ARGS";
- case -53: return "CL_INVALID_WORK_DIMENSION";
- case -54: return "CL_INVALID_WORK_GROUP_SIZE";
- case -55: return "CL_INVALID_WORK_ITEM_SIZE";
- case -56: return "CL_INVALID_GLOBAL_OFFSET";
- case -57: return "CL_INVALID_EVENT_WAIT_LIST";
- case -58: return "CL_INVALID_EVENT";
- case -59: return "CL_INVALID_OPERATION";
- case -60: return "CL_INVALID_GL_OBJECT";
- case -61: return "CL_INVALID_BUFFER_SIZE";
- case -62: return "CL_INVALID_MIP_LEVEL";
- case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
- case -64: return "CL_INVALID_PROPERTY";
- case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
- case -66: return "CL_INVALID_COMPILER_OPTIONS";
- case -67: return "CL_INVALID_LINKER_OPTIONS";
- case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
- // extension errors
- case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
- case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
- case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
- case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
- case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
- case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
- case -9999: return "NVIDIA: ILLEGAL READ OR WRITE TO A BUFFER";
- default:
- fprintf(stderr, "'%d'\n", error);
- return "Unknown OpenCL error";
- }
- }
- void check_error(cl_int ret, unsigned line_number) {
- if(ret != 0) {
- fprintf(stderr, "An error occured on line %u: %s\n", line_number, get_error_string(ret));
- exit(1);
- }
- }
- typedef struct gpu_config {
- unsigned flags;
- char* program_source_code;
- size_t program_source_code_size;
- cl_program program;
- cl_platform_id platform_ids;
- cl_uint n_platforms;
- cl_device_id device_ids;
- cl_uint n_devices;
- cl_context context;
- cl_command_queue command_queue;
- cl_kernel initial_bucket_hashing_kernel;
- cl_kernel bucket_collide_and_hash_kernel;
- cl_kernel produce_solutions_kernel;
- // gpu variables below
- // two heaps for the digest, or one array split in two elements to hold
- // destination and source heaps for hashes between rounds
- cl_mem digests[2];
- cl_mem new_digest_index;
- cl_mem buckets;
- cl_mem blake2b_digest;
- cl_mem n_solutions;
- cl_mem dst_solutions;
- //cl_mem elements;
- } gpu_config_t;
- void init_program(gpu_config_t* config, const char* file_path, unsigned flags) {
- memset(config, '\0', sizeof(gpu_config_t));
- config->flags = flags;
- FILE* f = fopen(file_path, "r");
- if (!f) {
- fprintf(stderr, "program with path \"%s\".\n", file_path);
- exit(1);
- }
- config->program_source_code = calloc(400000, sizeof(char));
- config->program_source_code_size = fread(config->program_source_code, 1, 400000, f);
- fclose(f);
- cl_int ret = 0;
- cl_int zero = 0;
- check_error(clGetPlatformIDs(1, &config->platform_ids, &config->n_platforms), __LINE__);
- check_error(clGetDeviceIDs(config->platform_ids, CL_DEVICE_TYPE_DEFAULT, 1, &config->device_ids, &config->n_devices), __LINE__);
- config->context = clCreateContext(NULL, 1, &config->device_ids, NULL, NULL, &ret);
- check_error(ret, __LINE__);
- config->program = clCreateProgramWithSource(config->context, 1, &config->program_source_code, &config->program_source_code_size, &ret);
- check_error(ret, __LINE__);
- cl_build_status status;
- cl_int err;
- cl_uint platformCount;
- cl_uint deviceCount;
- cl_int r;
- size_t logSize;
- char *programLog;
- if (clBuildProgram(config->program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS){
- // check build log
- clGetProgramBuildInfo(config->program, config->device_ids,
- CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
- programLog = (char*) calloc (logSize+1, sizeof(char));
- clGetProgramBuildInfo(config->program, config->device_ids,
- CL_PROGRAM_BUILD_LOG, logSize+1, programLog, &r);
- printf("Build failed; error=%d, status=%d, programLog:%s, r: %d\n",
- err, status, programLog, r);
- free(programLog);
- }
- //check_error(clBuildProgram(config->program, 1, &config->device_ids, NULL, NULL, NULL), __LINE__);
- config->initial_bucket_hashing_kernel = clCreateKernel(config->program, "initial_bucket_hashing", &ret);
- check_error(ret, __LINE__);
- config->bucket_collide_and_hash_kernel = clCreateKernel(config->program, "bucket_collide_and_hash", &ret);
- check_error(ret, __LINE__);
- config->produce_solutions_kernel = clCreateKernel(config->program, "produce_solutions", &ret);
- check_error(ret, __LINE__);
- config->command_queue = clCreateCommandQueue(config->context, config->device_ids, CL_QUEUE_PROFILING_ENABLE, &ret);
- check_error(ret, __LINE__);
- config->buckets = clCreateBuffer(config->context, CL_MEM_READ_WRITE, NUM_BUCKETS * sizeof(bucket_t) * EQUIHASH_K, NULL, &ret);
- check_error(ret, __LINE__);
- check_error(clEnqueueFillBuffer(config->command_queue, config->buckets, &zero, 1, 0, NUM_BUCKETS * sizeof(bucket_t) * EQUIHASH_K, 0, NULL, NULL), __LINE__);
- config->digests[0] = clCreateBuffer(config->context, CL_MEM_READ_WRITE, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), NULL, &ret);
- check_error(ret, __LINE__);
- check_error(clEnqueueFillBuffer(config->command_queue, config->digests[0], &zero, 1, 0, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), 0, NULL, NULL), __LINE__);
- config->digests[1] = clCreateBuffer(config->context, CL_MEM_READ_WRITE, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), NULL, &ret);
- check_error(ret, __LINE__);
- check_error(clEnqueueFillBuffer(config->command_queue, config->digests[1], &zero, 1, 0, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), 0, NULL, NULL), __LINE__);
- config->new_digest_index = clCreateBuffer(config->context, CL_MEM_READ_WRITE, sizeof(uint32_t), NULL, &ret);
- check_error(ret, __LINE__);
- check_error(clEnqueueFillBuffer(config->command_queue, config->new_digest_index, &zero, 1, 0, sizeof(uint32_t), 0, NULL, NULL), __LINE__);
- config->blake2b_digest = clCreateBuffer(config->context, CL_MEM_READ_WRITE, sizeof(crypto_generichash_blake2b_state), NULL, &ret);
- check_error(ret, __LINE__);
- check_error(clEnqueueFillBuffer(config->command_queue, config->blake2b_digest, &zero, 1, 0, sizeof(crypto_generichash_blake2b_state), 0, NULL, NULL), __LINE__);
- config->dst_solutions = clCreateBuffer(config->context, CL_MEM_READ_WRITE, 20*NUM_INDICES*sizeof(uint32_t), NULL, &ret);
- check_error(ret, __LINE__);
- check_error(clEnqueueFillBuffer(config->command_queue, config->dst_solutions, &zero, 1, 0, 20*NUM_INDICES*sizeof(uint32_t), 0, NULL, NULL), __LINE__);
- config->n_solutions = clCreateBuffer(config->context, CL_MEM_READ_WRITE, sizeof(uint32_t), NULL, &ret);
- check_error(ret, __LINE__);
- check_error(clEnqueueFillBuffer(config->command_queue, config->n_solutions, &zero, 1, 0, sizeof(uint32_t), 0, NULL, NULL), __LINE__);
- /*
- config->elements = clCreateBuffer(config->context, CL_MEM_READ_WRITE, sizeof(element_t)*EQUIHASH_K*NUM_INDICES/2*(1<<16), NULL, &ret);
- check_error(ret, __LINE__);
- check_error(clEnqueueFillBuffer(config->command_queue, config->elements, &zero, 1, 0, sizeof(element_t)*EQUIHASH_K*NUM_INDICES/2*(1<<16), 0, NULL, NULL), __LINE__);
- */
- //fprintf(stderr, "Total gpu buffer %u\n", NUM_BUCKETS * sizeof(bucket_t) * EQUIHASH_K + 2* (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t) + sizeof(crypto_generichash_blake2b_state) + 20*NUM_INDICES*sizeof(uint32_t) + sizeof(uint32_t));
- free(config->program_source_code);
- }
- void cleanup_program(gpu_config_t* config) {
- check_error(clReleaseProgram(config->program), __LINE__);
- check_error(clReleaseKernel(config->initial_bucket_hashing_kernel), __LINE__);
- check_error(clReleaseKernel(config->bucket_collide_and_hash_kernel), __LINE__);
- check_error(clReleaseKernel(config->produce_solutions_kernel), __LINE__);
- check_error(clReleaseCommandQueue(config->command_queue), __LINE__);
- check_error(clReleaseMemObject(config->dst_solutions), __LINE__);
- check_error(clReleaseMemObject(config->n_solutions), __LINE__);
- check_error(clReleaseMemObject(config->digests[0]), __LINE__);
- check_error(clReleaseMemObject(config->digests[1]), __LINE__);
- check_error(clReleaseMemObject(config->buckets), __LINE__);
- check_error(clReleaseMemObject(config->blake2b_digest), __LINE__);
- //check_error(clReleaseMemObject(config->elements), __LINE__);
- check_error(clReleaseContext(config->context), __LINE__);
- }
- size_t equihash(uint32_t* dst_solutions, crypto_generichash_blake2b_state* digest) {
- size_t global_work_offset = 0;
- size_t global_work_size = 1 << 20;
- size_t global_work_size_last = 1 << 16;
- // optimal AMD setting for wavefront?
- size_t local_work_size = 32;
- gpu_config_t config;
- init_program(&config, "./equihash.cl", 0);
- cl_ulong time_start;
- cl_ulong time_end;
- cl_ulong total_time = 0;
- cl_event timing_events[10];
- cl_int zero = 0;
- check_error(clEnqueueWriteBuffer(config.command_queue, config.blake2b_digest, CL_TRUE, 0, sizeof(crypto_generichash_blake2b_state), (void*)digest, 0, NULL, NULL), __LINE__);
- check_error(clEnqueueFillBuffer(config.command_queue, config.new_digest_index, &zero, 1, 0, sizeof(uint32_t), 0, NULL, NULL), __LINE__);
- check_error(clSetKernelArg(config.initial_bucket_hashing_kernel, 0, sizeof(cl_mem), (void *)&config.buckets), __LINE__);
- check_error(clSetKernelArg(config.initial_bucket_hashing_kernel, 1, sizeof(cl_mem), (void *)&config.digests[0]), __LINE__);
- check_error(clSetKernelArg(config.initial_bucket_hashing_kernel, 2, sizeof(cl_mem), (void *)&config.blake2b_digest), __LINE__);
- check_error(clSetKernelArg(config.initial_bucket_hashing_kernel, 3, sizeof(cl_mem), (void *)&config.new_digest_index), __LINE__);
- check_error(clEnqueueNDRangeKernel(config.command_queue, config.initial_bucket_hashing_kernel, 1, &global_work_offset, &global_work_size, &local_work_size, 0, NULL, &timing_events[0]), __LINE__);
- check_error(clWaitForEvents(1, &timing_events[0]), __LINE__);
- check_error(clGetEventProfilingInfo(timing_events[0], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL), __LINE__);
- check_error(clGetEventProfilingInfo(timing_events[0], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL), __LINE__);
- fprintf(stderr, "step0: %0.3f ms\n", (time_end - time_start) / 1000000.0);
- total_time += (time_end-time_start);
- uint32_t i = 1;
- for(i = 1; i < EQUIHASH_K; ++i) {
- /* Set OpenCL Kernel Parameters */
- check_error(clEnqueueFillBuffer(config.command_queue, config.new_digest_index, &zero, 1, 0, sizeof(uint32_t), 0, NULL, NULL), __LINE__);
- check_error(clSetKernelArg(config.bucket_collide_and_hash_kernel, 0, sizeof(cl_mem), (void *)&config.digests[i%2]), __LINE__);
- check_error(clSetKernelArg(config.bucket_collide_and_hash_kernel, 1, sizeof(cl_mem), (void *)&config.digests[(i-1)%2]), __LINE__);
- check_error(clSetKernelArg(config.bucket_collide_and_hash_kernel, 2, sizeof(cl_mem), (void *)&config.buckets), __LINE__);
- check_error(clSetKernelArg(config.bucket_collide_and_hash_kernel, 3, sizeof(uint32_t), (void*)&i), __LINE__);
- check_error(clSetKernelArg(config.bucket_collide_and_hash_kernel, 4, sizeof(cl_mem), (void *)&config.new_digest_index), __LINE__);
- check_error(clEnqueueNDRangeKernel(config.command_queue, config.bucket_collide_and_hash_kernel, 1, &global_work_offset, &global_work_size, &local_work_size, 0, NULL, &timing_events[i]), __LINE__);
- check_error(clWaitForEvents(1, &timing_events[i]), __LINE__);
- check_error(clGetEventProfilingInfo(timing_events[i], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL), __LINE__);
- check_error(clGetEventProfilingInfo(timing_events[i], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL), __LINE__);
- fprintf(stderr, "step%u: %0.3f ms\n",i , (time_end - time_start) / 1000000.0);
- total_time += (time_end-time_start);
- }
- uint32_t n_solutions = 0;
- bucket_t *buckets = (bucket_t *) malloc(NUM_BUCKETS*EQUIHASH_K*sizeof(bucket_t));
- digest_t *src_digests = (digest_t*)malloc((NUM_VALUES + NUM_VALUES / 2)*sizeof(digest_t));
- // Replace this with sols
- //uint32_t dst_solutions[10*NUM_INDICES];
- check_error(clEnqueueReadBuffer(config.command_queue, config.digests[0], CL_TRUE, 0, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), src_digests, 0, NULL, NULL), __LINE__);
- check_error(clEnqueueReadBuffer(config.command_queue, config.buckets, CL_TRUE, 0,NUM_BUCKETS * sizeof(bucket_t) * EQUIHASH_K , buckets, 0, NULL, NULL), __LINE__);
- //m_queue.enqueueReadBuffer(m_digests[0], true, 0, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), &src_digests);
- //m_queue.enqueueReadBuffer(m_buckets, true, 0, NUM_BUCKETS * sizeof(bucket_t) * EQUIHASH_K , &buckets);
- //m_queue.finish();
- produce_solutions(dst_solutions, &n_solutions, buckets, src_digests);
- /*
- global_work_size = 1<<20;
- local_work_size = 1;
- check_error(clEnqueueFillBuffer(config.command_queue, config.n_solutions, &zero, 1, 0, sizeof(uint32_t), 0, NULL, NULL), __LINE__);
- check_error(clSetKernelArg(config.produce_solutions_kernel, 0, sizeof(cl_mem), (uint32_t *)&config.dst_solutions), __LINE__);
- check_error(clSetKernelArg(config.produce_solutions_kernel, 1, sizeof(cl_mem), (uint32_t *)&config.n_solutions), __LINE__);
- check_error(clSetKernelArg(config.produce_solutions_kernel, 2, sizeof(cl_mem), (bucket_t *)&config.buckets), __LINE__);
- check_error(clSetKernelArg(config.produce_solutions_kernel, 3, sizeof(cl_mem), (digest_t *)&config.digests[0]), __LINE__);
- check_error(clSetKernelArg(config.produce_solutions_kernel, 4, sizeof(cl_mem), (void *)&config.blake2b_digest), __LINE__);
- //check_error(clSetKernelArg(config.produce_solutions_kernel, 5, sizeof(cl_mem), (void *)&config.elements), __LINE__);
- check_error(clEnqueueNDRangeKernel(config.command_queue, config.produce_solutions_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &timing_events[9]), __LINE__);
- check_error(clWaitForEvents(1, &timing_events[9]), __LINE__);
- check_error(clGetEventProfilingInfo(timing_events[9], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL), __LINE__);
- check_error(clGetEventProfilingInfo(timing_events[9], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL), __LINE__);
- fprintf(stderr, "step0: %0.3f ms\n", (time_end - time_start) / 1000000.0);
- total_time += (time_end-time_start);
- .
- /*
- check_error(clEnqueueReadBuffer(config.command_queue, config.dst_solutions, CL_TRUE, 0, 10*NUM_INDICES*sizeof(uint32_t), dst_solutions, 0, NULL, NULL), __LINE__);
- check_error(clEnqueueReadBuffer(config.command_queue, config.n_solutions, CL_TRUE, 0, sizeof(uint32_t), &n_solutions, 0, NULL, NULL), __LINE__);
- */
- return 0;
- fprintf(stderr, "found solutions in %0.3f ms\n", total_time / 1000000.0);
- printf("solutions: %u\n", n_solutions);
- for(i = 0; i < n_solutions; ++i) {
- normalize_indices(dst_solutions + (NUM_INDICES*i));
- }
- /* for(size_t i = 0; i < n_solutions; ++i) {
- for(size_t k = 0; k < 512; ++k) {
- printf("%u ", dst_solutions[i*512+k]);
- }
- printf("\n\n");
- }
- */
- cleanup_program(&config);
- return n_solutions;
- }
- void get_element_parent_bucket_data(element_t* src, uint32_t* parent_bucket_index, uint8_t* a, uint8_t* b) {
- *parent_bucket_index = src->parent_bucket_data >> 8;
- *a = (src->parent_bucket_data >> 4) & 0xf;
- *b = (src->parent_bucket_data & 0xf);
- }
- /*
- uint32_t mask_collision_bits(uint8_t* data, size_t bit_index) {
- uint32_t n = ((*data << (bit_index)) & 0xff) << 12;
- n |= ((*(++data)) << (bit_index+4));
- n |= ((*(++data)) >> (4-bit_index));
- return n;
- }
- */
- void decompress_indices(uint32_t* dst_uncompressed_indices, bucket_t* buckets, element_t* old_src) {
- element_t leaves[NUM_INDICES/2];
- element_t past_leaves[NUM_INDICES/4];
- leaves[0] = *old_src;
- for(size_t i = 0; i < EQUIHASH_K-1; ++i) {
- // Copy memory to past_leaves
- memcpy(&past_leaves, &leaves, (size_t)(1<<i));
- for(size_t j = 0; j < (1 << i); ++j) {
- element_t* src = past_leaves + j;
- uint32_t parent_bucket_index;
- uint8_t a;
- uint8_t b;
- get_element_parent_bucket_data(src, &parent_bucket_index, &a, &b);
- bucket_t* parent_bucket = buckets + ((EQUIHASH_K-2-i) * NUM_BUCKETS) + parent_bucket_index;
- leaves[2*j] = parent_bucket->data[a];
- leaves[2*j+1] = parent_bucket->data[b];
- }
- }
- for(size_t j = 0; j < NUM_INDICES/2; ++j) {
- element_t* src = leaves + j;
- uint32_t parent_bucket_index;
- uint8_t a;
- uint8_t b;
- get_element_parent_bucket_data(src, &parent_bucket_index, &a, &b);
- *dst_uncompressed_indices = parent_bucket_index;
- dst_uncompressed_indices++;
- }
- }
- void produce_solutions(uint32_t* dst_solutions, uint32_t* n_solutions, bucket_t* buckets, digest_t* src_digests) {
- size_t start_bit = (EQUIHASH_K*NUM_COLLISION_BITS);
- size_t byte_index = start_bit / 8;
- size_t bit_index = start_bit % 8;
- bucket_t* src_buckets = buckets + (EQUIHASH_K-1)*NUM_BUCKETS;
- for (uint32_t i = 0; i < NUM_BUCKETS; ++i){
- bucket_t* bucket = src_buckets + i;
- fprintf(stderr, "i:%u", i);
- int has_dupe = 0;
- // Exit if no collision. We can exit as its only one worker per thread.
- if(bucket->size == 1) continue;
- for(size_t a = 0; a < bucket->size && !has_dupe; ++a) {
- element_t* base = bucket->data + a;
- for(size_t b = a+1; b < bucket->size; ++b) {
- element_t* el = bucket->data + b;
- uint32_t ai = mask_collision_bits((( uint8_t*)src_digests[base->digest_index]) + byte_index, bit_index);
- uint32_t bi = mask_collision_bits((( uint8_t*)src_digests[el->digest_index]) + byte_index, bit_index);
- if(ai == bi && ai != 0) {
- uint32_t uncompressed_indices[NUM_INDICES];
- decompress_indices(uncompressed_indices, buckets, base);
- decompress_indices(uncompressed_indices + NUM_INDICES/2, buckets, el);
- for(size_t k = 0; k < NUM_INDICES && !has_dupe; ++k) {
- for(size_t o = k+1; o < NUM_INDICES && !has_dupe; ++o) {
- if(uncompressed_indices[k] == uncompressed_indices[o]) {
- has_dupe = 1;
- }
- }
- }
- //has_dupe = 1;
- if(!has_dupe) {
- *n_solutions += 1;
- memcpy(dst_solutions + (*n_solutions)*NUM_INDICES, uncompressed_indices, NUM_INDICES*sizeof(uint32_t));
- } else {
- break;
- }
- }
- }
- }
- bucket->size = 0;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement