// The kernel (this is the entry point of the GPU code).
//
// Each thread handles one dictionary word: it copies the word from global
// memory into a private 64-byte buffer, applies MD5 padding (0x80 marker plus
// the little-endian bit length at byte 56), exports the padded block to
// paddedWords3, runs md5_gpu on it and stores a number derived from the first
// state word in hash[].
//
// Launch layout: one thread per word; the code assumes blockDim.y == 1 and
// threadIdx.y == 0. No shared memory is used.
//
// Parameters:
//   gwords       - input words, msg_size bytes per thread, laid out back to
//                  back; a word shorter than msg_size ends at its first 0 byte.
//   paddedWords3 - output, 64 bytes per thread: the padded block, with bytes
//                  61..63 overwritten by debug tags (see below).
//   hash         - output, one uint per thread.
//   realthreads  - number of valid words; threads with a larger index exit.
//   msg_size     - stride (and maximum length) of a word in gwords. Only the
//                  first 55 bytes of a word are hashed, because a single
//                  64-byte MD5 block has room for at most 55 message bytes.
//   size_hash    - how many leading hex digits of the digest to return (1..4);
//                  for any other value hash[] is left untouched.
//   launch       - launch counter, only used for the debug tags.
__global__ void md5_calc(char *gwords, char *paddedWords3, uint *hash, int realthreads, int msg_size, int size_hash, uint launch){
    // assuming blockDim.y = 1 and threadIdx.y = 0, always
    int iblock = blockIdx.x + blockIdx.y * gridDim.x;
    int linidx = threadIdx.x + iblock * blockDim.x;

    // this check slows down the code by ~0.4% (measured)
    if(linidx >= realthreads){
        return;
    }

    // Per-thread message block. It is handed to md5_gpu as uint*, so force an
    // alignment that is valid for word (and wider) accesses.
    __align__(16) char msg[64] = {0};

    // Never read more than 55 bytes: byte 55 is the last slot that can hold
    // the 0x80 marker without colliding with the length field at byte 56, and
    // an unchecked msg_size >= 64 would write past the end of msg.
    uint max_len = 0;
    if(msg_size > 0){
        max_len = (msg_size > 55) ? 55u : (uint)msg_size;
    }

    // 64-bit offsets so that large grids cannot overflow the index math.
    const size_t word_base = (size_t)linidx * (size_t)(msg_size > 0 ? msg_size : 0);
    const size_t block_base = (size_t)linidx * 64;

    // load the dictionary word for this thread; the length is tracked
    // separately from the "terminator found" condition so that an empty word
    // (first byte is 0) is padded as a zero-length message.
    uint len = max_len;
    for(uint i = 0; i < max_len; i++){
        char ch = gwords[word_base + i];
        // end of the password
        if(ch == 0){
            len = i;
            break;
        }
        msg[i] = ch;
    }

    // MD5 padding: a single 0x80 byte right after the message, zeros (already
    // there from the initializer), then the message length in bits as a
    // little-endian integer starting at byte 56. Up to 55 * 8 = 440 bits are
    // possible, so two bytes are needed.
    msg[len] = (char)0x80;
    const uint bits = len * 8;
    msg[56] = (char)(bits & 0xFF);
    msg[57] = (char)(bits >> 8);

    // export the padded block
    for(uint i = 0; i < 64; i++){
        paddedWords3[block_base + i] = msg[i];
    }
    // Debug tags, truncated to one byte each. They replace bytes 61..63 of
    // the exported copy only; msg itself (what gets hashed) is not modified.
    paddedWords3[block_base + 63] = linidx;
    paddedWords3[block_base + 62] = launch;
    paddedWords3[block_base + 61] = (launch * realthreads) + linidx + 1;

    // compute MD5 hash
    // NOTE(review): assumes md5_gpu writes the four state words into a..d
    // (it is defined elsewhere) -- confirm against its definition.
    uint a = 0;
    uint b = 0;
    uint c = 0;
    uint d = 0;
    md5_gpu((uint *)msg, a, b, c, d);

    // return the hash converted to number
    // all this work around is because of different endian order: presumably
    // the low byte of a is the first digest byte, so its high nibble is the
    // first hex digit, its low nibble the second, and so on.
    switch(size_hash){
        case 1:
            // first hex digit
            hash[linidx] = ((a & 0xF0) >> 4);
            break;
        case 2:
            // first two hex digits
            hash[linidx] = a & 0xFF;
            break;
        case 3:
            // first three hex digits: d1 * 256 + d2 * 16 + d3
            hash[linidx] = ((a & 0xF000) >> 12) + ((a & 0xF0) << 4) + ((a & 0x0F) << 4);
            break;
        case 4:
            // first four hex digits: d1 * 4096 + d2 * 256 + d3 * 16 + d4
            hash[linidx] = ((a & 0xF000) >> 8) + ((a & 0x0F00) >> 8) + ((a & 0xF0) << 8) + ((a & 0x0F) << 8);
            break;
        default:
            // unsupported size: leave hash[linidx] as it was
            break;
    }
}