Guest User

Untitled

a guest
Oct 31st, 2012
340
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 14.70 KB | None | 0 0
  1. // CUDA MD5 hash calculation implementation (A: [email protected]).
  2. // A very useful link: http://people.eku.edu/styere/Encrypt/JS-MD5.html
  3.  
  4. #include <stdio.h>
  5.  
  6. #define GPU_MAX_PW 10000000
  7.  
  8. unsigned short *gpuHashes = NULL;
  9.  
  10. typedef unsigned int uint;
  11.  
  12. // On-device variable declarations
  13. extern __shared__ char memory[];    // on-chip shared memory
  14. __constant__ uint k[64], rconst[16];    // constants (in fast on-chip constant cache)
  15. __constant__ uint target[4];        // target hash, if searching for hash matches
  16.  
  17. // MD5 magic numbers. These will be loaded into on-device "constant" memory
  18. static const uint k_cpu[64] = {
  19.     0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
  20.     0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
  21.     0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
  22.     0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
  23.  
  24.     0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
  25.     0xd62f105d, 0x2441453,  0xd8a1e681, 0xe7d3fbc8,
  26.     0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
  27.     0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
  28.  
  29.     0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
  30.     0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
  31.     0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x4881d05,
  32.     0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
  33.  
  34.     0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
  35.     0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
  36.     0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
  37.     0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
  38. };
  39.  
  40. static const uint rconst_cpu[16] = {
  41.     7, 12, 17, 22,   5,  9, 14, 20,   4, 11, 16, 23,   6, 10, 15, 21
  42. };
  43.  
  44. extern "C"
  45. void init_constants(uint *target_cpu)
  46. {
  47.     cudaMemcpyToSymbol(k, k_cpu, sizeof(k));
  48.     cudaMemcpyToSymbol(rconst, rconst_cpu, sizeof(rconst));
  49.     if(target_cpu) { cudaMemcpyToSymbol(target, target_cpu, 4*4); };
  50. }
  51.  
  52. // MD5 routines (straight from Wikipedia's MD5 pseudocode description)
  53. //__device__ inline uint leftrotate (uint x, uint c){
  54. //  return (x << c) | (x >> (32 - c));
  55. //}
  56.  
  57. //__device__ inline uint r(const uint i){
  58. //  return rconst[(i / 16) * 4 + i % 4];
  59. //}
  60.  
  61. // Accessor for w[16] array. Naively, this would just be w[i]; however, this
  62. // choice leads to worst-case-scenario access pattern wrt. shared memory
  63. // bank conflicts, as the same indices in different threads fall into the
  64. // same bank (as the words are 16 uints long). The packing below causes the
  65. // same indices in different threads of a warp to map to different banks. In
  66. // testing this gave a ~40% speedup.
  67. //
  68. // PS: An alternative solution would be to make the w array 17 uints long
  69. // (thus wasting a little shared memory)
  70. //__device__ inline uint &getw(uint *w, const int i){
  71. //  return w[i/* + threadIdx.x) % 16*/];
  72. //}
  73.  
  74. // const- version
  75. __device__ inline uint getw(const uint *w, const int i){
  76.     return w[i/* + threadIdx.x) % 16*/];
  77. }
  78.  
  79.  
  80. //__device__ inline uint getk(const int i){
  81. //  return k[i];    // Note: this is as fast as possible (measured)
  82. //}
  83.  
  84. //////////////////////////////////////////////////////////////////////////////
  85. /////////////       Ron Rivest's MD5 C Implementation       //////////////////
  86. //////////////////////////////////////////////////////////////////////////////
  87.  
  88. /*
  89.  **********************************************************************
  90.  ** Copyright (C) 1990, RSA Data Security, Inc. All rights reserved. **
  91.  **                                                                  **
  92.  ** License to copy and use this software is granted provided that   **
  93.  ** it is identified as the "RSA Data Security, Inc. MD5 Message     **
  94.  ** Digest Algorithm" in all material mentioning or referencing this **
  95.  ** software or this function.                                       **
  96.  **                                                                  **
  97.  ** License is also granted to make and use derivative works         **
  98.  ** provided that such works are identified as "derived from the RSA **
  99.  ** Data Security, Inc. MD5 Message Digest Algorithm" in all         **
  100.  ** material mentioning or referencing the derived work.             **
  101.  **                                                                  **
  102.  ** RSA Data Security, Inc. makes no representations concerning      **
  103.  ** either the merchantability of this software or the suitability   **
  104.  ** of this software for any particular purpose.  It is provided "as **
  105.  ** is" without express or implied warranty of any kind.             **
  106.  **                                                                  **
  107.  ** These notices must be retained in any copies of any part of this **
  108.  ** documentation and/or software.                                   **
  109.  **********************************************************************
  110.  */
  111.  
  112.  
  113. /* F, G and H are basic MD5 functions: selection, majority, parity */
  114. #define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
  115. #define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
  116. #define H(x, y, z) ((x) ^ (y) ^ (z))
  117. #define I(x, y, z) ((y) ^ ((x) | (~z)))
  118.  
  119. /* ROTATE_LEFT rotates x left n bits */
  120. #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
  121.  
  122. /* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4 */
  123. /* Rotation is separate from addition to prevent recomputation */
  124. #define FF(a, b, c, d, x, s, ac) \
  125.   {(a) += F ((b), (c), (d)) + (x) + (uint)(ac); \
  126.    (a) = ROTATE_LEFT ((a), (s)); \
  127.    (a) += (b); \
  128.   }
  129. #define GG(a, b, c, d, x, s, ac) \
  130.   {(a) += G ((b), (c), (d)) + (x) + (uint)(ac); \
  131.    (a) = ROTATE_LEFT ((a), (s)); \
  132.    (a) += (b); \
  133.   }
  134. #define HH(a, b, c, d, x, s, ac) \
  135.   {(a) += H ((b), (c), (d)) + (x) + (uint)(ac); \
  136.    (a) = ROTATE_LEFT ((a), (s)); \
  137.    (a) += (b); \
  138.   }
  139. #define II(a, b, c, d, x, s, ac) \
  140.   {(a) += I ((b), (c), (d)) + (x) + (uint)(ac); \
  141.    (a) = ROTATE_LEFT ((a), (s)); \
  142.    (a) += (b); \
  143.   }
  144.  
  145. // Basic MD5 step. Transform buf based on in.
  146. void inline __device__ md5_gpu(const uint *in, uint &a, uint &b, uint &c, uint &d){
  147.     const uint a0 = 0x67452301;
  148.     const uint b0 = 0xEFCDAB89;
  149.     const uint c0 = 0x98BADCFE;
  150.     const uint d0 = 0x10325476;
  151.  
  152.     //Initialize hash value for this chunk:
  153.     a = a0;
  154.     b = b0;
  155.     c = c0;
  156.     d = d0;
  157.  
  158.   /* Round 1 */
  159. #define S11 7
  160. #define S12 12
  161. #define S13 17
  162. #define S14 22
  163.   FF ( a, b, c, d, getw(in,  0), S11, 3614090360); /* 1 */
  164.   FF ( d, a, b, c, getw(in,  1), S12, 3905402710); /* 2 */
  165.   FF ( c, d, a, b, getw(in,  2), S13,  606105819); /* 3 */
  166.   FF ( b, c, d, a, getw(in,  3), S14, 3250441966); /* 4 */
  167.   FF ( a, b, c, d, getw(in,  4), S11, 4118548399); /* 5 */
  168.   FF ( d, a, b, c, getw(in,  5), S12, 1200080426); /* 6 */
  169.   FF ( c, d, a, b, getw(in,  6), S13, 2821735955); /* 7 */
  170.   FF ( b, c, d, a, getw(in,  7), S14, 4249261313); /* 8 */
  171.   FF ( a, b, c, d, getw(in,  8), S11, 1770035416); /* 9 */
  172.   FF ( d, a, b, c, getw(in,  9), S12, 2336552879); /* 10 */
  173.   FF ( c, d, a, b, getw(in, 10), S13, 4294925233); /* 11 */
  174.   FF ( b, c, d, a, getw(in, 11), S14, 2304563134); /* 12 */
  175.   FF ( a, b, c, d, getw(in, 12), S11, 1804603682); /* 13 */
  176.   FF ( d, a, b, c, getw(in, 13), S12, 4254626195); /* 14 */
  177.   FF ( c, d, a, b, getw(in, 14), S13, 2792965006); /* 15 */
  178.   FF ( b, c, d, a, getw(in, 15), S14, 1236535329); /* 16 */
  179.  
  180.   /* Round 2 */
  181. #define S21 5
  182. #define S22 9
  183. #define S23 14
  184. #define S24 20
  185.   GG ( a, b, c, d, getw(in,  1), S21, 4129170786); /* 17 */
  186.   GG ( d, a, b, c, getw(in,  6), S22, 3225465664); /* 18 */
  187.   GG ( c, d, a, b, getw(in, 11), S23,  643717713); /* 19 */
  188.   GG ( b, c, d, a, getw(in,  0), S24, 3921069994); /* 20 */
  189.   GG ( a, b, c, d, getw(in,  5), S21, 3593408605); /* 21 */
  190.   GG ( d, a, b, c, getw(in, 10), S22,   38016083); /* 22 */
  191.   GG ( c, d, a, b, getw(in, 15), S23, 3634488961); /* 23 */
  192.   GG ( b, c, d, a, getw(in,  4), S24, 3889429448); /* 24 */
  193.   GG ( a, b, c, d, getw(in,  9), S21,  568446438); /* 25 */
  194.   GG ( d, a, b, c, getw(in, 14), S22, 3275163606); /* 26 */
  195.   GG ( c, d, a, b, getw(in,  3), S23, 4107603335); /* 27 */
  196.   GG ( b, c, d, a, getw(in,  8), S24, 1163531501); /* 28 */
  197.   GG ( a, b, c, d, getw(in, 13), S21, 2850285829); /* 29 */
  198.   GG ( d, a, b, c, getw(in,  2), S22, 4243563512); /* 30 */
  199.   GG ( c, d, a, b, getw(in,  7), S23, 1735328473); /* 31 */
  200.   GG ( b, c, d, a, getw(in, 12), S24, 2368359562); /* 32 */
  201.  
  202.   /* Round 3 */
  203. #define S31 4
  204. #define S32 11
  205. #define S33 16
  206. #define S34 23
  207.   HH ( a, b, c, d, getw(in,  5), S31, 4294588738); /* 33 */
  208.   HH ( d, a, b, c, getw(in,  8), S32, 2272392833); /* 34 */
  209.   HH ( c, d, a, b, getw(in, 11), S33, 1839030562); /* 35 */
  210.   HH ( b, c, d, a, getw(in, 14), S34, 4259657740); /* 36 */
  211.   HH ( a, b, c, d, getw(in,  1), S31, 2763975236); /* 37 */
  212.   HH ( d, a, b, c, getw(in,  4), S32, 1272893353); /* 38 */
  213.   HH ( c, d, a, b, getw(in,  7), S33, 4139469664); /* 39 */
  214.   HH ( b, c, d, a, getw(in, 10), S34, 3200236656); /* 40 */
  215.   HH ( a, b, c, d, getw(in, 13), S31,  681279174); /* 41 */
  216.   HH ( d, a, b, c, getw(in,  0), S32, 3936430074); /* 42 */
  217.   HH ( c, d, a, b, getw(in,  3), S33, 3572445317); /* 43 */
  218.   HH ( b, c, d, a, getw(in,  6), S34,   76029189); /* 44 */
  219.   HH ( a, b, c, d, getw(in,  9), S31, 3654602809); /* 45 */
  220.   HH ( d, a, b, c, getw(in, 12), S32, 3873151461); /* 46 */
  221.   HH ( c, d, a, b, getw(in, 15), S33,  530742520); /* 47 */
  222.   HH ( b, c, d, a, getw(in,  2), S34, 3299628645); /* 48 */
  223.  
  224.   /* Round 4 */
  225. #define S41 6
  226. #define S42 10
  227. #define S43 15
  228. #define S44 21
  229.   II ( a, b, c, d, getw(in,  0), S41, 4096336452); /* 49 */
  230.   II ( d, a, b, c, getw(in,  7), S42, 1126891415); /* 50 */
  231.   II ( c, d, a, b, getw(in, 14), S43, 2878612391); /* 51 */
  232.   II ( b, c, d, a, getw(in,  5), S44, 4237533241); /* 52 */
  233.   II ( a, b, c, d, getw(in, 12), S41, 1700485571); /* 53 */
  234.   II ( d, a, b, c, getw(in,  3), S42, 2399980690); /* 54 */
  235.   II ( c, d, a, b, getw(in, 10), S43, 4293915773); /* 55 */
  236.   II ( b, c, d, a, getw(in,  1), S44, 2240044497); /* 56 */
  237.   II ( a, b, c, d, getw(in,  8), S41, 1873313359); /* 57 */
  238.   II ( d, a, b, c, getw(in, 15), S42, 4264355552); /* 58 */
  239.   II ( c, d, a, b, getw(in,  6), S43, 2734768916); /* 59 */
  240.   II ( b, c, d, a, getw(in, 13), S44, 1309151649); /* 60 */
  241.   II ( a, b, c, d, getw(in,  4), S41, 4149444226); /* 61 */
  242.   II ( d, a, b, c, getw(in, 11), S42, 3174756917); /* 62 */
  243.   II ( c, d, a, b, getw(in,  2), S43,  718787259); /* 63 */
  244.   II ( b, c, d, a, getw(in,  9), S44, 3951481745); /* 64 */
  245.  
  246.     a += a0;
  247.     b += b0;
  248.     c += c0;
  249.     d += d0;
  250. }
  251.  
  252. // The kernel (this is the entrypoint of GPU code)
  253. // Loads the 64-byte word to be hashed from g   lobal to shared memory and calls the calculation routine
  254. __global__ void md5_calc(/*char *gwords, char *paddedWords3,*/ unsigned short *hash, int realthreads, int msg_size, int size_hash, uint launch){
  255.     int linidx;
  256.  
  257.     // assuming blockDim.y = 1 and threadIdx.y = 0, always
  258.     linidx = threadIdx.x + ((blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x);
  259.  
  260.     // this check slows down the code by ~0.4% (measured)
  261.     if(linidx >= realthreads){
  262.         return;
  263.     }
  264.  
  265.     uint a;
  266.     uint b;
  267.     uint c;
  268.     uint d;
  269.  
  270.     char msg[64] = {0};
  271.  
  272.     unsigned long long int password;
  273.  
  274.     password = (launch * realthreads) + linidx + 1;
  275.  
  276.     b = 0;
  277.    
  278.     //convert to ASCII
  279.     while(password != 0){
  280.         msg[b] = password % 10 + 48; //ASCII;;
  281.         password /= 10;
  282.         b++;
  283.     }
  284.  
  285.     c = 0;
  286.    
  287.     //invert number order
  288.     while(c < (b / 2)){
  289.         d = msg[c];
  290.         msg[c] = msg[b - c - 1];
  291.         msg[b - c - 1] = d;
  292.    
  293.         c++;
  294.     }
  295.  
  296.     // load the dictionary word for this thread
  297.     b = 0;
  298.  
  299.     for(a = 0; a < msg_size; a++){
  300.         //end of the password
  301.         if(msg[a] == 0){
  302.             msg[a] = 0x80;
  303.             b = a;
  304.  
  305.             break;
  306.         }
  307.     }
  308.  
  309.     if(!b){
  310.         msg[msg_size] = 0x80;
  311.    
  312.         msg[56] = msg_size * 8;
  313.     }
  314.     else{
  315.         msg[56] = b * 8;
  316.     }
  317.  
  318.     // compute MD5 hash
  319.     md5_gpu((uint *)msg, a, b, c, d);
  320.  
  321.     // return the hash converted to number
  322.     // all this work around is because of different edian order
  323.     if(size_hash == 1){
  324.         hash[(linidx)] = ((a & 240) >> 4);
  325.     }
  326.     else{
  327.         if(size_hash == 2){
  328.             hash[(linidx)] = a & 255;
  329.         }
  330.         else{
  331.             if(size_hash == 3){
  332.                 hash[(linidx)] = ((a & 61440) >> 12) + ((a & 240) << 4) + ((a & 15) << 4);
  333.             }
  334.             else{
  335.                 if(size_hash == 4){
  336.                     hash[(linidx)] = ((a & 61440) >> 8) + ((a & 3840) >> 8) + ((a & 240) << 8) + ((a & 15) << 8);
  337.                 }
  338.             }
  339.         }
  340.     }
  341. }
  342.  
  343. void find_best_factorization(int *bx, int *by, int nblocks){
  344.     *bx = -1;
  345.     int best_r = 100000;
  346.     int bytmp;
  347.     int r; 
  348.  
  349.     for(bytmp = 1; bytmp != 65536; bytmp++){
  350.         r = nblocks % bytmp;
  351.  
  352.         if(r < best_r && nblocks / bytmp < 65535){
  353.             *by = bytmp;
  354.             *bx = nblocks / bytmp;
  355.             best_r = r;
  356.            
  357.             if(r == 0){
  358.                 break;
  359.             }
  360.  
  361.             *bx++;
  362.         }
  363.     }
  364.  
  365.     if(*bx == -1){
  366.         printf("Unfactorizable?!\n");
  367.         exit(-1);
  368.     }
  369. }
  370.  
  371. void HandleError(cudaError_t err, const char *file, int line){
  372.     if(err != cudaSuccess){
  373.         printf("%s in %s at line %d.. err number %d\n", cudaGetErrorString(err), file, line, err);
  374.         exit(EXIT_FAILURE);
  375.     }
  376. }
  377.  
  378. #define HANDLE_ERROR(err)(HandleError(err, __FILE__, __LINE__))
  379.  
  380. // Given a total number of threads, their memory requirements, and the
  381. // number of threadsPerBlock, compute the optimal allowable grid dimensions.
  382. // Returns false if the requested number of threads are impossible to fit to
  383. // shared memory.
  384. int calculate_grid_parameters(int gridDim[2], int threadsPerBlock, int neededthreads, int dynShmemPerThread, int staticShmemPerBlock){
  385.     const int shmemPerMP = 1102484;
  386.  
  387.     int dyn_shared_mem_required = dynShmemPerThread * threadsPerBlock;
  388.     int shared_mem_required = staticShmemPerBlock + dyn_shared_mem_required;
  389.     int nblocks;
  390.     int nthreads;
  391.     int over;
  392.  
  393.     if(shared_mem_required > shmemPerMP){
  394.         return 0;
  395.     }
  396.  
  397.     // calculate the total number of threads
  398.     nthreads = neededthreads;
  399.     over = neededthreads % threadsPerBlock;
  400.  
  401.     // round up to multiple of threadsPerBlock
  402.     if(over){
  403.         nthreads += threadsPerBlock - over;
  404.     }
  405.  
  406.     // calculate the number of blocks
  407.     nblocks = nthreads / threadsPerBlock;
  408.  
  409.     if(nthreads % threadsPerBlock){
  410.         nblocks++;
  411.     }
  412.  
  413.     // calculate block dimensions so that there are as close to nblocks blocks as possible
  414.     find_best_factorization(&gridDim[0], &gridDim[1], nblocks);
  415.  
  416.     printf("block size = %d grid %d grid %d\n\n", nblocks, gridDim[0], gridDim[1]);
  417.  
  418.     return 1;
  419. }
  420.  
  421. int main(){
  422.     unsigned short *gwords = NULL;
  423.     uint realthreads = 1;
  424.     uint msg_size = 1;
  425.     uint size_hash = 1;
  426.     uint launch = 0;
  427.  
  428.     if((gwords = (unsigned short *)calloc(GPU_MAX_PW, sizeof(unsigned short))) == NULL){
  429.         exit(-1);
  430.     }
  431.  
  432.     int gridDim[2];
  433.  
  434.     calculate_grid_parameters(gridDim, 1024, GPU_MAX_PW, 0, 49152);
  435.  
  436.     dim3 grid;
  437.     grid.x = gridDim[0];
  438.     grid.y = gridDim[1];
  439.  
  440.     HANDLE_ERROR(cudaMemcpyToSymbol(k, k_cpu, sizeof(k)));
  441.     HANDLE_ERROR(cudaMemcpyToSymbol(rconst, rconst_cpu, sizeof(rconst)));
  442.  
  443.     HANDLE_ERROR(cudaMalloc((void **)&gpuHashes, sizeof(unsigned short) * GPU_MAX_PW));
  444.  
  445.     md5_calc<<<grid, 1024>>>(gpuHashes, realthreads, msg_size, size_hash, launch);
  446.  
  447.     HANDLE_ERROR(cudaMemcpy(gwords, gpuHashes, sizeof(unsigned short) * GPU_MAX_PW, cudaMemcpyDeviceToHost));
  448. }
Advertisement
Add Comment
Please, Sign In to add comment