// The kernel (this is the entrypoint of GPU code)
// Loads the word to be hashed from global memory into a per-thread 64-byte buffer and calls the calculation routine
// md5_calc: each thread pads one dictionary word into a single 64-byte MD5
// block, stores that block for inspection, hashes it with md5_gpu() and writes
// a truncated form of the result.
//
// Launch layout: the linear thread index is computed from blockIdx.x,
// blockIdx.y, gridDim.x, blockDim.x and threadIdx.x only (blockDim.y is
// assumed to be 1). Threads whose index is >= realthreads do nothing.
//
// Parameters:
//   gwords       input words, msg_size bytes per thread, thread i reading from
//                byte offset i * msg_size. A word ends at its first 0 byte or
//                after msg_size bytes, whichever comes first.
//   paddedWords3 output, 64 bytes per thread: the padded block, with bytes
//                61..63 overwritten by tagging values (see below).
//   hash         output, one uint per thread; only written when size_hash is
//                1, 2, 3 or 4.
//   realthreads  number of threads that have a word to process.
//   msg_size     byte stride between words and maximum word length. Only the
//                first 55 bytes of a word can be hashed, because the 0x80
//                marker and the 8-byte length field must fit in the same
//                64-byte block; longer words are truncated to 55 bytes.
//                Values <= 0 are treated as "every word is empty".
//   size_hash    how many leading hex digits of the digest to return (1..4).
//   launch       launch counter, used only for the tagging bytes.
__global__ void md5_calc(char *gwords, char *paddedWords3, uint *hash, int realthreads, int msg_size, int size_hash, uint launch){
    // Longest message that still leaves room for the 0x80 marker (1 byte)
    // and the length field (8 bytes) inside one 64-byte block.
    const int MAX_WORD_BYTES = 55;

    // assuming blockDim.y = 1 and threadIdx.y = 0, always
    const int iblock = blockIdx.x + blockIdx.y * gridDim.x;
    const int linidx = threadIdx.x + iblock * blockDim.x;

    // Grid tail guard (the original author measured its cost at ~0.4%).
    if(linidx >= realthreads){
        return;
    }

    // Sanitize msg_size once: a negative value would otherwise turn into a
    // huge unsigned loop bound, and anything above 55 would overrun the block
    // or clobber the length field.
    const size_t stride = (msg_size > 0) ? (size_t)msg_size : 0;
    int maxLen = (msg_size > 0) ? msg_size : 0;
    if(maxLen > MAX_WORD_BYTES){
        maxLen = MAX_WORD_BYTES;
    }

    // The block is declared as 16 words so that handing it to md5_gpu() as a
    // uint* is correctly aligned; msg is a byte view used to fill it.
    uint block[16] = {0};
    char *msg = (char *)block;

    // load the dictionary word for this thread (size_t avoids int overflow
    // of linidx * msg_size on large launches)
    const char *word = gwords + (size_t)linidx * stride;
    int len = 0;
    for(; len < maxLen; ++len){
        const char ch = word[len];
        if(ch == 0){
            break;  // end of the password
        }
        msg[len] = ch;
    }

    // MD5 padding: a single 0x80 byte right after the message, zeros up to
    // byte 55, then the message length in bits, least significant byte first.
    // Tracking len explicitly keeps an empty word (terminator at index 0)
    // distinct from a word that fills all msg_size bytes.
    msg[len] = (char)0x80;
    const uint bits = (uint)len * 8u;
    msg[56] = (char)(bits & 0xFFu);
    msg[57] = (char)(bits >> 8);  // needed once the word reaches 32 bytes (256 bits)

    // Publish the padded block, then overwrite its last three bytes with
    // tagging values (each truncated to one byte). Only the copy is tagged;
    // the block that gets hashed is left untouched.
    char *out = paddedWords3 + (size_t)linidx * 64;
    for(int i = 0; i < 64; ++i){
        out[i] = msg[i];
    }
    out[63] = (char)linidx;
    out[62] = (char)launch;
    out[61] = (char)((launch * realthreads) + linidx + 1);

    // compute MD5 hash
    // NOTE(review): md5_gpu is defined outside this block; it is assumed to
    // treat a..d purely as outputs (reference parameters) - confirm.
    uint a = 0;
    uint b = 0;
    uint c = 0;
    uint d = 0;
    md5_gpu(block, a, b, c, d);

    // Return the leading hex digits of the digest as a number. The byte and
    // nibble shuffling is needed because, presumably, the first digest byte
    // sits in the low 8 bits of `a` (little-endian) while the hex string
    // prints each byte high nibble first - verify against md5_gpu.
    switch(size_hash){
        case 1:
            // high nibble of byte 0
            hash[linidx] = (a & 0xF0u) >> 4;
            break;
        case 2:
            // byte 0
            hash[linidx] = a & 0xFFu;
            break;
        case 3:
            // byte 0 followed by the high nibble of byte 1
            hash[linidx] = ((a & 0xFFu) << 4) | ((a >> 12) & 0xFu);
            break;
        case 4:
            // byte 0 followed by byte 1 (16-bit byte swap)
            hash[linidx] = ((a & 0xFFu) << 8) | ((a >> 8) & 0xFFu);
            break;
        default:
            // other sizes are not supported; hash[linidx] is left untouched
            break;
    }
}