Advertisement
Guest User

Untitled

a guest
Oct 27th, 2016
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 26.69 KB | None | 0 0
  1. #define _XOPEN_SOURCE 700
  2. #include <string.h>
  3. #include <stdlib.h>
  4. #include <stdio.h>
  5. #include <stdint.h>
  6. #include <sodium.h>
  7. #include <unistd.h>
  8. #include <time.h>
  9. #include <arpa/inet.h>
  10. #include <assert.h>
  11. #include <math.h>
  12. #include <CL/cl.h>
  13. #include <malloc.h>
  14.  
  15. double get_ttime() {
  16. struct timespec ts;
  17. clock_gettime(CLOCK_MONOTONIC, &ts);
  18. return ts.tv_sec + ts.tv_nsec / 1000000000.0;
  19. }
  20.  
  21. // COMPILE INSTRUCTIONS:
  22. // gcc-4.9 time-threaded-equihash-pointers.c equihash-pointers-working-minimal-memory-currently-fastest.c --pedantic -Ofast -Wall -std=c11 -o test-equihash -lsodium -lpthread -O3 -march=native -mtune=native -mavx -ftree-vectorize
  23. // theoretical min: 2*2^20*(6*(8+32)
  24. // 2 buckets * NUM_BUCKETS*(BUCKET_SIZE + NUM_ELEMENTS*(ELEMENT_SIZE))
  25.  
  26. /*
  27. NOTE: only works for NUM_COLLISION_BITS < 24!
  28. */
  29.  
  30. #define EQUIHASH_N 200
  31. #define EQUIHASH_K 9
  32.  
  33. #define NUM_COLLISION_BITS (EQUIHASH_N / (EQUIHASH_K + 1))
  34. #define NUM_INDICES (1 << EQUIHASH_K)
  35.  
  36. #define NUM_VALUES (1 << (NUM_COLLISION_BITS+1))
  37. #define NUM_BUCKETS (1 << NUM_COLLISION_BITS)
  38. #define DIGEST_SIZE 25
  39.  
  40. typedef struct element element_t;
  41. typedef uint64_t digest_t[(DIGEST_SIZE + sizeof(uint64_t) - 1) / sizeof(uint64_t)];
  42.  
  43.  
  44.  
struct element {
// Index of this element's digest in the digest heap.
uint32_t digest_index;
// Packed parent reference: bits [31:8] = parent bucket index, bits [7:4]
// and [3:0] = the two slot positions inside that parent bucket (decoded
// by get_element_parent_bucket_data()).
uint32_t parent_bucket_data;
};
  49.  
  50.  
typedef struct bucket {
// Number of occupied slots in data[].
unsigned size;
// Fixed-capacity slot array. NOTE(review): no overflow guard is visible
// here; writers presumably cap size at 18 — verify in the CL kernels.
element_t data[18];
} bucket_t;
  55.  
  56.  
  57. void hexout(unsigned char* digest_result) {
  58. for(unsigned i = 0; i < 4; ++i) {
  59. for(int j = 0; j < 8; ++j) {
  60. int c = digest_result[i*8 + j];
  61. printf("%2X", c);
  62. }
  63. }
  64. printf("\n");
  65. }
  66.  
  67.  
  68. uint32_t mask_collision_bits(uint8_t* data, size_t bit_index) {
  69. uint32_t n = ((*data << (bit_index)) & 0xff) << 12;
  70. n |= ((*(++data)) << (bit_index+4));
  71. n |= ((*(++data)) >> (4-bit_index));
  72. return n;
  73. }
  74.  
  75.  
  76. void get_element_parent_bucket_data(element_t* src, uint32_t* parent_bucket_index, uint8_t* a, uint8_t* b);
  77.  
  78. //uint32_t mask_collision_bits(uint8_t* data, size_t bit_index);
  79.  
  80.  
  81. void decompress_indices(uint32_t* dst_uncompressed_indices, bucket_t* buckets, element_t* old_src);
  82.  
  83.  
  84. void produce_solutions(uint32_t* dst_solutions, uint32_t* n_solutions, bucket_t* buckets, digest_t* src_digests);
  85.  
  86. int compare_indices32(uint32_t* a, uint32_t* b, size_t n_current_indices) {
  87. for(size_t i = 0; i < n_current_indices; ++i, ++a, ++b) {
  88. if(*a < *b) {
  89. return -1;
  90. } else if(*a > *b) {
  91. return 1;
  92. } else {
  93. return 0;
  94. }
  95. }
  96. return 0;
  97. }
  98.  
  99. void normalize_indices(uint32_t* indices) {
  100. for(size_t step_index = 0; step_index < EQUIHASH_K; ++step_index) {
  101. for(size_t i = 0; i < NUM_INDICES; i += (1 << (step_index+1))) {
  102. if(compare_indices32(indices+i, indices+i+(1 << step_index), (1 << step_index)) > 0) {
  103. uint32_t tmp_indices[(1 << step_index)];
  104. memcpy(tmp_indices, indices+i, (1 << step_index)*sizeof(uint32_t));
  105. memcpy(indices+i, indices+i+(1 << step_index), (1 << step_index)*sizeof(uint32_t));
  106. memcpy(indices+i+(1 << step_index), tmp_indices, (1 << step_index)*sizeof(uint32_t));
  107. }
  108. }
  109. }
  110. }
  111.  
  112.  
  113. void xor_elements(uint8_t* dst, uint8_t* a, uint8_t* b) {
  114. ((uint64_t*)dst)[0] = ((uint64_t*)a)[0] ^ ((uint64_t*)b)[0];
  115. ((uint64_t*)dst)[1] = ((uint64_t*)a)[1] ^ ((uint64_t*)b)[1];
  116. ((uint64_t*)dst)[2] = ((uint64_t*)a)[2] ^ ((uint64_t*)b)[2];
  117. dst[24] = a[24] ^ b[24];
  118. }
  119.  
  120. void hash(uint8_t* dst, uint32_t in, const crypto_generichash_blake2b_state* digest) {
  121. uint32_t tmp_in = in/2;
  122. crypto_generichash_blake2b_state new_digest = *digest;
  123. crypto_generichash_blake2b_update(&new_digest, (uint8_t*)&tmp_in, sizeof(uint32_t));
  124. crypto_generichash_blake2b_final(&new_digest, (uint8_t*)dst, 2*DIGEST_SIZE);
  125. }
  126.  
  127.  
  128. int is_indices_valid(uint32_t* indices, const crypto_generichash_blake2b_state* digest) {
  129. uint8_t digest_results[NUM_INDICES][DIGEST_SIZE];
  130. memset(digest_results, '\0', NUM_INDICES*DIGEST_SIZE);
  131.  
  132. for(size_t i = 0; i < NUM_INDICES; ++i) {
  133. uint8_t digest_tmp[2*DIGEST_SIZE];
  134. hash(digest_tmp, indices[i], digest);
  135. memcpy(digest_results[i], digest_tmp+((indices[i] % 2)*EQUIHASH_N/8), DIGEST_SIZE);
  136. }
  137.  
  138. for(size_t step_index = 0; step_index < EQUIHASH_K; ++step_index) {
  139. for(size_t i = 0; i < (NUM_INDICES >> step_index); i += 2) {
  140. uint8_t digest_tmp[DIGEST_SIZE];
  141. xor_elements(digest_tmp, digest_results[i], digest_results[i+1]);
  142.  
  143. size_t start_bit = step_index*NUM_COLLISION_BITS;
  144. size_t byte_index = start_bit / 8;
  145. size_t bit_index = start_bit % 8;
  146.  
  147. if(!mask_collision_bits(((uint8_t*)digest_tmp) + byte_index, bit_index) == 0) {
  148. return 0;
  149. }
  150.  
  151. memcpy(digest_results[i / 2], digest_tmp, DIGEST_SIZE);
  152. }
  153. }
  154.  
  155. size_t start_bit = EQUIHASH_K*NUM_COLLISION_BITS;
  156. size_t byte_index = start_bit / 8;
  157. size_t bit_index = start_bit % 8;
  158. return mask_collision_bits(((uint8_t*)digest_results[0]) + byte_index, bit_index) == 0;
  159. }
  160.  
  161.  
  162. void build(cl_device_id device_id, cl_program program) {
  163. const char* options = "";
  164. cl_int ret_val = clBuildProgram(program, 1, &device_id, options, NULL, NULL);
  165.  
  166. // avoid abortion due to CL_BILD_PROGRAM_FAILURE
  167.  
  168. cl_build_status build_status;
  169. clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL);
  170.  
  171. char *build_log;
  172. size_t ret_val_size;
  173.  
  174. clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
  175.  
  176. //build_log = calloc(10000, 1);
  177.  
  178. clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, ret_val_size, NULL, NULL);
  179.  
  180. build_log[ret_val_size] = '\0';
  181. //fprintf(stderr, "%s\n\n", build_log);
  182. }
  183.  
  184. const char *get_error_string(cl_int error)
  185. {
  186. switch(error){
  187. // run-time and JIT compiler errors
  188. case 0: return "CL_SUCCESS";
  189. case -1: return "CL_DEVICE_NOT_FOUND";
  190. case -2: return "CL_DEVICE_NOT_AVAILABLE";
  191. case -3: return "CL_COMPILER_NOT_AVAILABLE";
  192. case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
  193. case -5: return "CL_OUT_OF_RESOURCES";
  194. case -6: return "CL_OUT_OF_HOST_MEMORY";
  195. case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
  196. case -8: return "CL_MEM_COPY_OVERLAP";
  197. case -9: return "CL_IMAGE_FORMAT_MISMATCH";
  198. case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
  199. case -11: return "CL_BUILD_PROGRAM_FAILURE";
  200. case -12: return "CL_MAP_FAILURE";
  201. case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
  202. case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
  203. case -15: return "CL_COMPILE_PROGRAM_FAILURE";
  204. case -16: return "CL_LINKER_NOT_AVAILABLE";
  205. case -17: return "CL_LINK_PROGRAM_FAILURE";
  206. case -18: return "CL_DEVICE_PARTITION_FAILED";
  207. case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
  208.  
  209. // compile-time errors
  210. case -30: return "CL_INVALID_VALUE";
  211. case -31: return "CL_INVALID_DEVICE_TYPE";
  212. case -32: return "CL_INVALID_PLATFORM";
  213. case -33: return "CL_INVALID_DEVICE";
  214. case -34: return "CL_INVALID_CONTEXT";
  215. case -35: return "CL_INVALID_QUEUE_PROPERTIES";
  216. case -36: return "CL_INVALID_COMMAND_QUEUE";
  217. case -37: return "CL_INVALID_HOST_PTR";
  218. case -38: return "CL_INVALID_MEM_OBJECT";
  219. case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
  220. case -40: return "CL_INVALID_IMAGE_SIZE";
  221. case -41: return "CL_INVALID_SAMPLER";
  222. case -42: return "CL_INVALID_BINARY";
  223. case -43: return "CL_INVALID_BUILD_OPTIONS";
  224. case -44: return "CL_INVALID_PROGRAM";
  225. case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
  226. case -46: return "CL_INVALID_KERNEL_NAME";
  227. case -47: return "CL_INVALID_KERNEL_DEFINITION";
  228. case -48: return "CL_INVALID_KERNEL";
  229. case -49: return "CL_INVALID_ARG_INDEX";
  230. case -50: return "CL_INVALID_ARG_VALUE";
  231. case -51: return "CL_INVALID_ARG_SIZE";
  232. case -52: return "CL_INVALID_KERNEL_ARGS";
  233. case -53: return "CL_INVALID_WORK_DIMENSION";
  234. case -54: return "CL_INVALID_WORK_GROUP_SIZE";
  235. case -55: return "CL_INVALID_WORK_ITEM_SIZE";
  236. case -56: return "CL_INVALID_GLOBAL_OFFSET";
  237. case -57: return "CL_INVALID_EVENT_WAIT_LIST";
  238. case -58: return "CL_INVALID_EVENT";
  239. case -59: return "CL_INVALID_OPERATION";
  240. case -60: return "CL_INVALID_GL_OBJECT";
  241. case -61: return "CL_INVALID_BUFFER_SIZE";
  242. case -62: return "CL_INVALID_MIP_LEVEL";
  243. case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
  244. case -64: return "CL_INVALID_PROPERTY";
  245. case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
  246. case -66: return "CL_INVALID_COMPILER_OPTIONS";
  247. case -67: return "CL_INVALID_LINKER_OPTIONS";
  248. case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
  249.  
  250. // extension errors
  251. case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
  252. case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
  253. case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
  254. case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
  255. case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
  256. case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
  257. case -9999: return "NVIDIA: ILLEGAL READ OR WRITE TO A BUFFER";
  258. default:
  259. fprintf(stderr, "'%d'\n", error);
  260. return "Unknown OpenCL error";
  261. }
  262. }
  263.  
  264.  
  265. void check_error(cl_int ret, unsigned line_number) {
  266. if(ret != 0) {
  267. fprintf(stderr, "An error occured on line %u: %s\n", line_number, get_error_string(ret));
  268. exit(1);
  269. }
  270. }
  271.  
// Bundle of all host- and device-side state for one solver run,
// created by init_program() and torn down by cleanup_program().
typedef struct gpu_config {
// Caller-supplied option bits; stored by init_program() but not read
// anywhere in the code visible here.
unsigned flags;

// OpenCL kernel source read from disk (freed at the end of init_program()).
char* program_source_code;
size_t program_source_code_size;

cl_program program;

// Only the first platform/device reported is used.
cl_platform_id platform_ids;
cl_uint n_platforms;

cl_device_id device_ids;
cl_uint n_devices;

cl_context context;
cl_command_queue command_queue;


// One kernel handle per pipeline stage.
cl_kernel initial_bucket_hashing_kernel;

cl_kernel bucket_collide_and_hash_kernel;

cl_kernel produce_solutions_kernel;



// gpu variables below
// two heaps for the digest, or one array split in two elements to hold
// destination and source heaps for hashes between rounds
cl_mem digests[2];
cl_mem new_digest_index;   // device-side counter: next free digest slot
cl_mem buckets;            // EQUIHASH_K layers of NUM_BUCKETS bucket_t
cl_mem blake2b_digest;     // blake2b midstate uploaded from the host
cl_mem n_solutions;        // device-side solution counter
cl_mem dst_solutions;      // device-side solution index output
//cl_mem elements;

} gpu_config_t;
  310.  
  311. void init_program(gpu_config_t* config, const char* file_path, unsigned flags) {
  312. memset(config, '\0', sizeof(gpu_config_t));
  313.  
  314. config->flags = flags;
  315.  
  316. FILE* f = fopen(file_path, "r");
  317. if (!f) {
  318. fprintf(stderr, "program with path \"%s\".\n", file_path);
  319. exit(1);
  320. }
  321. config->program_source_code = calloc(400000, sizeof(char));
  322. config->program_source_code_size = fread(config->program_source_code, 1, 400000, f);
  323. fclose(f);
  324.  
  325. cl_int ret = 0;
  326. cl_int zero = 0;
  327.  
  328. check_error(clGetPlatformIDs(1, &config->platform_ids, &config->n_platforms), __LINE__);
  329. check_error(clGetDeviceIDs(config->platform_ids, CL_DEVICE_TYPE_DEFAULT, 1, &config->device_ids, &config->n_devices), __LINE__);
  330. config->context = clCreateContext(NULL, 1, &config->device_ids, NULL, NULL, &ret);
  331. check_error(ret, __LINE__);
  332.  
  333. config->program = clCreateProgramWithSource(config->context, 1, &config->program_source_code, &config->program_source_code_size, &ret);
  334. check_error(ret, __LINE__);
  335.  
  336. cl_build_status status;
  337. cl_int err;
  338. cl_uint platformCount;
  339. cl_uint deviceCount;
  340. cl_int r;
  341. size_t logSize;
  342. char *programLog;
  343.  
  344. if (clBuildProgram(config->program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS){
  345. // check build log
  346. clGetProgramBuildInfo(config->program, config->device_ids,
  347. CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
  348. programLog = (char*) calloc (logSize+1, sizeof(char));
  349. clGetProgramBuildInfo(config->program, config->device_ids,
  350. CL_PROGRAM_BUILD_LOG, logSize+1, programLog, &r);
  351. printf("Build failed; error=%d, status=%d, programLog:%s, r: %d\n",
  352. err, status, programLog, r);
  353. free(programLog);
  354.  
  355. }
  356.  
  357. //check_error(clBuildProgram(config->program, 1, &config->device_ids, NULL, NULL, NULL), __LINE__);
  358.  
  359. config->initial_bucket_hashing_kernel = clCreateKernel(config->program, "initial_bucket_hashing", &ret);
  360. check_error(ret, __LINE__);
  361.  
  362. config->bucket_collide_and_hash_kernel = clCreateKernel(config->program, "bucket_collide_and_hash", &ret);
  363. check_error(ret, __LINE__);
  364.  
  365. config->produce_solutions_kernel = clCreateKernel(config->program, "produce_solutions", &ret);
  366. check_error(ret, __LINE__);
  367.  
  368.  
  369.  
  370. config->command_queue = clCreateCommandQueue(config->context, config->device_ids, CL_QUEUE_PROFILING_ENABLE, &ret);
  371. check_error(ret, __LINE__);
  372.  
  373. config->buckets = clCreateBuffer(config->context, CL_MEM_READ_WRITE, NUM_BUCKETS * sizeof(bucket_t) * EQUIHASH_K, NULL, &ret);
  374. check_error(ret, __LINE__);
  375. check_error(clEnqueueFillBuffer(config->command_queue, config->buckets, &zero, 1, 0, NUM_BUCKETS * sizeof(bucket_t) * EQUIHASH_K, 0, NULL, NULL), __LINE__);
  376.  
  377. config->digests[0] = clCreateBuffer(config->context, CL_MEM_READ_WRITE, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), NULL, &ret);
  378. check_error(ret, __LINE__);
  379. check_error(clEnqueueFillBuffer(config->command_queue, config->digests[0], &zero, 1, 0, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), 0, NULL, NULL), __LINE__);
  380.  
  381. config->digests[1] = clCreateBuffer(config->context, CL_MEM_READ_WRITE, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), NULL, &ret);
  382. check_error(ret, __LINE__);
  383. check_error(clEnqueueFillBuffer(config->command_queue, config->digests[1], &zero, 1, 0, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), 0, NULL, NULL), __LINE__);
  384.  
  385. config->new_digest_index = clCreateBuffer(config->context, CL_MEM_READ_WRITE, sizeof(uint32_t), NULL, &ret);
  386. check_error(ret, __LINE__);
  387. check_error(clEnqueueFillBuffer(config->command_queue, config->new_digest_index, &zero, 1, 0, sizeof(uint32_t), 0, NULL, NULL), __LINE__);
  388.  
  389.  
  390. config->blake2b_digest = clCreateBuffer(config->context, CL_MEM_READ_WRITE, sizeof(crypto_generichash_blake2b_state), NULL, &ret);
  391. check_error(ret, __LINE__);
  392. check_error(clEnqueueFillBuffer(config->command_queue, config->blake2b_digest, &zero, 1, 0, sizeof(crypto_generichash_blake2b_state), 0, NULL, NULL), __LINE__);
  393.  
  394. config->dst_solutions = clCreateBuffer(config->context, CL_MEM_READ_WRITE, 20*NUM_INDICES*sizeof(uint32_t), NULL, &ret);
  395. check_error(ret, __LINE__);
  396. check_error(clEnqueueFillBuffer(config->command_queue, config->dst_solutions, &zero, 1, 0, 20*NUM_INDICES*sizeof(uint32_t), 0, NULL, NULL), __LINE__);
  397.  
  398.  
  399. config->n_solutions = clCreateBuffer(config->context, CL_MEM_READ_WRITE, sizeof(uint32_t), NULL, &ret);
  400. check_error(ret, __LINE__);
  401. check_error(clEnqueueFillBuffer(config->command_queue, config->n_solutions, &zero, 1, 0, sizeof(uint32_t), 0, NULL, NULL), __LINE__);
  402.  
  403. /*
  404. config->elements = clCreateBuffer(config->context, CL_MEM_READ_WRITE, sizeof(element_t)*EQUIHASH_K*NUM_INDICES/2*(1<<16), NULL, &ret);
  405. check_error(ret, __LINE__);
  406. check_error(clEnqueueFillBuffer(config->command_queue, config->elements, &zero, 1, 0, sizeof(element_t)*EQUIHASH_K*NUM_INDICES/2*(1<<16), 0, NULL, NULL), __LINE__);
  407. */
  408.  
  409. //fprintf(stderr, "Total gpu buffer %u\n", NUM_BUCKETS * sizeof(bucket_t) * EQUIHASH_K + 2* (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t) + sizeof(crypto_generichash_blake2b_state) + 20*NUM_INDICES*sizeof(uint32_t) + sizeof(uint32_t));
  410.  
  411. free(config->program_source_code);
  412.  
  413. }
  414.  
  415. void cleanup_program(gpu_config_t* config) {
  416. check_error(clReleaseProgram(config->program), __LINE__);
  417. check_error(clReleaseKernel(config->initial_bucket_hashing_kernel), __LINE__);
  418. check_error(clReleaseKernel(config->bucket_collide_and_hash_kernel), __LINE__);
  419. check_error(clReleaseKernel(config->produce_solutions_kernel), __LINE__);
  420. check_error(clReleaseCommandQueue(config->command_queue), __LINE__);
  421.  
  422.  
  423. check_error(clReleaseMemObject(config->dst_solutions), __LINE__);
  424. check_error(clReleaseMemObject(config->n_solutions), __LINE__);
  425. check_error(clReleaseMemObject(config->digests[0]), __LINE__);
  426. check_error(clReleaseMemObject(config->digests[1]), __LINE__);
  427. check_error(clReleaseMemObject(config->buckets), __LINE__);
  428. check_error(clReleaseMemObject(config->blake2b_digest), __LINE__);
  429. //check_error(clReleaseMemObject(config->elements), __LINE__);
  430. check_error(clReleaseContext(config->context), __LINE__);
  431. }
  432.  
  433.  
  434. size_t equihash(uint32_t* dst_solutions, crypto_generichash_blake2b_state* digest) {
  435. size_t global_work_offset = 0;
  436. size_t global_work_size = 1 << 20;
  437. size_t global_work_size_last = 1 << 16;
  438. // optimal AMD setting for wavefront?
  439. size_t local_work_size = 32;
  440. gpu_config_t config;
  441. init_program(&config, "./equihash.cl", 0);
  442.  
  443. cl_ulong time_start;
  444. cl_ulong time_end;
  445. cl_ulong total_time = 0;
  446. cl_event timing_events[10];
  447. cl_int zero = 0;
  448.  
  449. check_error(clEnqueueWriteBuffer(config.command_queue, config.blake2b_digest, CL_TRUE, 0, sizeof(crypto_generichash_blake2b_state), (void*)digest, 0, NULL, NULL), __LINE__);
  450.  
  451.  
  452. check_error(clEnqueueFillBuffer(config.command_queue, config.new_digest_index, &zero, 1, 0, sizeof(uint32_t), 0, NULL, NULL), __LINE__);
  453. check_error(clSetKernelArg(config.initial_bucket_hashing_kernel, 0, sizeof(cl_mem), (void *)&config.buckets), __LINE__);
  454. check_error(clSetKernelArg(config.initial_bucket_hashing_kernel, 1, sizeof(cl_mem), (void *)&config.digests[0]), __LINE__);
  455. check_error(clSetKernelArg(config.initial_bucket_hashing_kernel, 2, sizeof(cl_mem), (void *)&config.blake2b_digest), __LINE__);
  456. check_error(clSetKernelArg(config.initial_bucket_hashing_kernel, 3, sizeof(cl_mem), (void *)&config.new_digest_index), __LINE__);
  457. check_error(clEnqueueNDRangeKernel(config.command_queue, config.initial_bucket_hashing_kernel, 1, &global_work_offset, &global_work_size, &local_work_size, 0, NULL, &timing_events[0]), __LINE__);
  458. check_error(clWaitForEvents(1, &timing_events[0]), __LINE__);
  459. check_error(clGetEventProfilingInfo(timing_events[0], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL), __LINE__);
  460. check_error(clGetEventProfilingInfo(timing_events[0], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL), __LINE__);
  461. fprintf(stderr, "step0: %0.3f ms\n", (time_end - time_start) / 1000000.0);
  462. total_time += (time_end-time_start);
  463.  
  464.  
  465. uint32_t i = 1;
  466. for(i = 1; i < EQUIHASH_K; ++i) {
  467. /* Set OpenCL Kernel Parameters */
  468. check_error(clEnqueueFillBuffer(config.command_queue, config.new_digest_index, &zero, 1, 0, sizeof(uint32_t), 0, NULL, NULL), __LINE__);
  469.  
  470. check_error(clSetKernelArg(config.bucket_collide_and_hash_kernel, 0, sizeof(cl_mem), (void *)&config.digests[i%2]), __LINE__);
  471. check_error(clSetKernelArg(config.bucket_collide_and_hash_kernel, 1, sizeof(cl_mem), (void *)&config.digests[(i-1)%2]), __LINE__);
  472. check_error(clSetKernelArg(config.bucket_collide_and_hash_kernel, 2, sizeof(cl_mem), (void *)&config.buckets), __LINE__);
  473. check_error(clSetKernelArg(config.bucket_collide_and_hash_kernel, 3, sizeof(uint32_t), (void*)&i), __LINE__);
  474. check_error(clSetKernelArg(config.bucket_collide_and_hash_kernel, 4, sizeof(cl_mem), (void *)&config.new_digest_index), __LINE__);
  475. check_error(clEnqueueNDRangeKernel(config.command_queue, config.bucket_collide_and_hash_kernel, 1, &global_work_offset, &global_work_size, &local_work_size, 0, NULL, &timing_events[i]), __LINE__);
  476. check_error(clWaitForEvents(1, &timing_events[i]), __LINE__);
  477. check_error(clGetEventProfilingInfo(timing_events[i], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL), __LINE__);
  478. check_error(clGetEventProfilingInfo(timing_events[i], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL), __LINE__);
  479. fprintf(stderr, "step%u: %0.3f ms\n",i , (time_end - time_start) / 1000000.0);
  480. total_time += (time_end-time_start);
  481.  
  482. }
  483.  
  484. uint32_t n_solutions = 0;
  485. bucket_t *buckets = (bucket_t *) malloc(NUM_BUCKETS*EQUIHASH_K*sizeof(bucket_t));
  486. digest_t *src_digests = (digest_t*)malloc((NUM_VALUES + NUM_VALUES / 2)*sizeof(digest_t));
  487.  
  488. // Replace this with sols
  489. //uint32_t dst_solutions[10*NUM_INDICES];
  490.  
  491. check_error(clEnqueueReadBuffer(config.command_queue, config.digests[0], CL_TRUE, 0, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), src_digests, 0, NULL, NULL), __LINE__);
  492.  
  493. check_error(clEnqueueReadBuffer(config.command_queue, config.buckets, CL_TRUE, 0,NUM_BUCKETS * sizeof(bucket_t) * EQUIHASH_K , buckets, 0, NULL, NULL), __LINE__);
  494.  
  495. //m_queue.enqueueReadBuffer(m_digests[0], true, 0, (NUM_VALUES + NUM_VALUES / 2) * sizeof(digest_t), &src_digests);
  496. //m_queue.enqueueReadBuffer(m_buckets, true, 0, NUM_BUCKETS * sizeof(bucket_t) * EQUIHASH_K , &buckets);
  497.  
  498. //m_queue.finish();
  499.  
  500. produce_solutions(dst_solutions, &n_solutions, buckets, src_digests);
  501.  
  502. /*
  503. global_work_size = 1<<20;
  504. local_work_size = 1;
  505. check_error(clEnqueueFillBuffer(config.command_queue, config.n_solutions, &zero, 1, 0, sizeof(uint32_t), 0, NULL, NULL), __LINE__);
  506. check_error(clSetKernelArg(config.produce_solutions_kernel, 0, sizeof(cl_mem), (uint32_t *)&config.dst_solutions), __LINE__);
  507. check_error(clSetKernelArg(config.produce_solutions_kernel, 1, sizeof(cl_mem), (uint32_t *)&config.n_solutions), __LINE__);
  508. check_error(clSetKernelArg(config.produce_solutions_kernel, 2, sizeof(cl_mem), (bucket_t *)&config.buckets), __LINE__);
  509. check_error(clSetKernelArg(config.produce_solutions_kernel, 3, sizeof(cl_mem), (digest_t *)&config.digests[0]), __LINE__);
  510. check_error(clSetKernelArg(config.produce_solutions_kernel, 4, sizeof(cl_mem), (void *)&config.blake2b_digest), __LINE__);
  511. //check_error(clSetKernelArg(config.produce_solutions_kernel, 5, sizeof(cl_mem), (void *)&config.elements), __LINE__);
  512.  
  513.  
  514.  
  515. check_error(clEnqueueNDRangeKernel(config.command_queue, config.produce_solutions_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &timing_events[9]), __LINE__);
  516. check_error(clWaitForEvents(1, &timing_events[9]), __LINE__);
  517. check_error(clGetEventProfilingInfo(timing_events[9], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL), __LINE__);
  518. check_error(clGetEventProfilingInfo(timing_events[9], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL), __LINE__);
  519. fprintf(stderr, "step0: %0.3f ms\n", (time_end - time_start) / 1000000.0);
  520. total_time += (time_end-time_start);
  521. .
  522.  
  523. /*
  524. check_error(clEnqueueReadBuffer(config.command_queue, config.dst_solutions, CL_TRUE, 0, 10*NUM_INDICES*sizeof(uint32_t), dst_solutions, 0, NULL, NULL), __LINE__);
  525. check_error(clEnqueueReadBuffer(config.command_queue, config.n_solutions, CL_TRUE, 0, sizeof(uint32_t), &n_solutions, 0, NULL, NULL), __LINE__);
  526. */
  527. return 0;
  528. fprintf(stderr, "found solutions in %0.3f ms\n", total_time / 1000000.0);
  529. printf("solutions: %u\n", n_solutions);
  530.  
  531. for(i = 0; i < n_solutions; ++i) {
  532. normalize_indices(dst_solutions + (NUM_INDICES*i));
  533. }
  534. /* for(size_t i = 0; i < n_solutions; ++i) {
  535. for(size_t k = 0; k < 512; ++k) {
  536. printf("%u ", dst_solutions[i*512+k]);
  537. }
  538.  
  539. printf("\n\n");
  540. }
  541. */
  542.  
  543. cleanup_program(&config);
  544. return n_solutions;
  545. }
  546.  
  547.  
  548. void get_element_parent_bucket_data(element_t* src, uint32_t* parent_bucket_index, uint8_t* a, uint8_t* b) {
  549. *parent_bucket_index = src->parent_bucket_data >> 8;
  550. *a = (src->parent_bucket_data >> 4) & 0xf;
  551. *b = (src->parent_bucket_data & 0xf);
  552. }
  553.  
  554. /*
  555. uint32_t mask_collision_bits(uint8_t* data, size_t bit_index) {
  556. uint32_t n = ((*data << (bit_index)) & 0xff) << 12;
  557. n |= ((*(++data)) << (bit_index+4));
  558. n |= ((*(++data)) >> (4-bit_index));
  559. return n;
  560. }
  561. */
  562.  
  563. void decompress_indices(uint32_t* dst_uncompressed_indices, bucket_t* buckets, element_t* old_src) {
  564. element_t leaves[NUM_INDICES/2];
  565. element_t past_leaves[NUM_INDICES/4];
  566. leaves[0] = *old_src;
  567. for(size_t i = 0; i < EQUIHASH_K-1; ++i) {
  568. // Copy memory to past_leaves
  569. memcpy(&past_leaves, &leaves, (size_t)(1<<i));
  570. for(size_t j = 0; j < (1 << i); ++j) {
  571. element_t* src = past_leaves + j;
  572. uint32_t parent_bucket_index;
  573. uint8_t a;
  574. uint8_t b;
  575. get_element_parent_bucket_data(src, &parent_bucket_index, &a, &b);
  576.  
  577. bucket_t* parent_bucket = buckets + ((EQUIHASH_K-2-i) * NUM_BUCKETS) + parent_bucket_index;
  578. leaves[2*j] = parent_bucket->data[a];
  579. leaves[2*j+1] = parent_bucket->data[b];
  580. }
  581. }
  582. for(size_t j = 0; j < NUM_INDICES/2; ++j) {
  583. element_t* src = leaves + j;
  584. uint32_t parent_bucket_index;
  585. uint8_t a;
  586. uint8_t b;
  587. get_element_parent_bucket_data(src, &parent_bucket_index, &a, &b);
  588. *dst_uncompressed_indices = parent_bucket_index;
  589. dst_uncompressed_indices++;
  590. }
  591. }
  592.  
  593.  
  594. void produce_solutions(uint32_t* dst_solutions, uint32_t* n_solutions, bucket_t* buckets, digest_t* src_digests) {
  595.  
  596. size_t start_bit = (EQUIHASH_K*NUM_COLLISION_BITS);
  597. size_t byte_index = start_bit / 8;
  598. size_t bit_index = start_bit % 8;
  599. bucket_t* src_buckets = buckets + (EQUIHASH_K-1)*NUM_BUCKETS;
  600. for (uint32_t i = 0; i < NUM_BUCKETS; ++i){
  601. bucket_t* bucket = src_buckets + i;
  602. fprintf(stderr, "i:%u", i);
  603. int has_dupe = 0;
  604. // Exit if no collision. We can exit as its only one worker per thread.
  605. if(bucket->size == 1) continue;
  606.  
  607. for(size_t a = 0; a < bucket->size && !has_dupe; ++a) {
  608. element_t* base = bucket->data + a;
  609. for(size_t b = a+1; b < bucket->size; ++b) {
  610. element_t* el = bucket->data + b;
  611. uint32_t ai = mask_collision_bits((( uint8_t*)src_digests[base->digest_index]) + byte_index, bit_index);
  612. uint32_t bi = mask_collision_bits((( uint8_t*)src_digests[el->digest_index]) + byte_index, bit_index);
  613. if(ai == bi && ai != 0) {
  614. uint32_t uncompressed_indices[NUM_INDICES];
  615. decompress_indices(uncompressed_indices, buckets, base);
  616. decompress_indices(uncompressed_indices + NUM_INDICES/2, buckets, el);
  617.  
  618. for(size_t k = 0; k < NUM_INDICES && !has_dupe; ++k) {
  619. for(size_t o = k+1; o < NUM_INDICES && !has_dupe; ++o) {
  620. if(uncompressed_indices[k] == uncompressed_indices[o]) {
  621. has_dupe = 1;
  622. }
  623. }
  624. }
  625. //has_dupe = 1;
  626. if(!has_dupe) {
  627. *n_solutions += 1;
  628. memcpy(dst_solutions + (*n_solutions)*NUM_INDICES, uncompressed_indices, NUM_INDICES*sizeof(uint32_t));
  629. } else {
  630. break;
  631. }
  632. }
  633. }
  634. }
  635. bucket->size = 0;
  636. }
  637. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement