// The kernel (this is the entrypoint of GPU code)
// Loads the word to be hashed from global memory into a 64-byte per-thread buffer and calls the calculation routine
// Builds one padded 64-byte MD5 block per thread from a dictionary word, exports
// that block, hashes it and stores a numeric prefix of the digest.
//
// Launch layout: the linear thread index is derived from blockIdx.x, blockIdx.y,
// gridDim.x, blockDim.x and threadIdx.x only (assumes blockDim.y = 1).
//
// Parameters:
//   gwords       input words; thread i reads up to msg_size bytes starting at
//                gwords[i * msg_size]. A zero byte ends the word early.
//   paddedWords3 output; thread i writes 64 bytes starting at paddedWords3[i * 64].
//                The last three bytes (61..63) are overwritten with trace values
//                (see below), so they do not hold the real padding bytes.
//   hash         output; one uint per thread (only written when size_hash is 1..4).
//   realthreads  number of threads that have a word to process; threads with a
//                linear index >= realthreads exit without touching memory.
//   msg_size     stride, in bytes, between consecutive words in gwords and the
//                maximum word length. A single MD5 block only has room for 55
//                message bytes, so at most 55 bytes are read per word; negative
//                values are treated as 0.
//   size_hash    number of leading hex digits of the digest to return (1..4).
//                Any other value leaves hash[] untouched.
//   launch       launch counter, only used for the trace bytes in paddedWords3.
__global__ void md5_calc(char *gwords, char *paddedWords3, uint *hash, int realthreads, int msg_size, int size_hash, uint launch){
	// Digest state words handed to md5_gpu.
	// NOTE(review): md5_gpu is defined outside this block; it is assumed to take
	// these by reference and to overwrite them with the MD5 state — confirm.
	uint a = 0;
	uint b = 0;
	uint c = 0;
	uint d = 0;

	// The block is stored as 16 uints so that the pointer passed to md5_gpu is
	// guaranteed to be 4-byte aligned; msg is a byte view of the same storage.
	uint block[16] = {0};
	char *msg = (char *)block;

	// assuming blockDim.y = 1 and threadIdx.y = 0, always
	int iblock = blockIdx.x + blockIdx.y * gridDim.x;
	int linidx = threadIdx.x + iblock * blockDim.x;

	// this check slows down the code by ~0.4% (measured)
	if(linidx >= realthreads){
		return;
	}

	// Clamp the number of bytes read: 55 message bytes + the 0x80 marker + the
	// 8-byte length field is all that fits in one 64-byte block.
	int max_len = msg_size;
	if(max_len < 0){
		max_len = 0;
	}
	if(max_len > 55){
		max_len = 55;
	}

	// 64-bit offsets so large launches do not overflow 32-bit index arithmetic.
	const size_t word_base = (size_t)linidx * (size_t)msg_size;
	const size_t out_base = (size_t)linidx * 64;

	// load the dictionary word for this thread, stopping at the first zero byte
	int len = 0;
	while(len < max_len){
		const char ch = gwords[word_base + len];
		if(ch == 0){
			break;
		}
		msg[len] = ch;
		len++;
	}

	// MD5 padding: a single 0x80 byte right after the message (also for an empty
	// word, where it lands at index 0), zeros up to byte 56, then the message
	// length in bits, little-endian. Lengths of 32 bytes or more need the second
	// length byte; the remaining length bytes stay zero.
	msg[len] = (char)0x80;
	const uint bits = (uint)len * 8u;
	msg[56] = (char)(bits & 0xffu);
	msg[57] = (char)(bits >> 8);

	// export the padded block
	for(int i = 0; i < 64; i++){
		paddedWords3[out_base + i] = msg[i];
	}

	// Trace values (truncated to one byte each) written over the tail of the
	// exported copy only; the block that gets hashed is not affected.
	paddedWords3[out_base + 63] = (char)linidx;
	paddedWords3[out_base + 62] = (char)launch;
	paddedWords3[out_base + 61] = (char)((launch * realthreads) + linidx + 1);

	// compute MD5 hash
	md5_gpu(block, a, b, c, d);

	// Return the leading size_hash hex digits of the digest as a number.
	// The first digest byte is the low byte of a, so the digits have to be
	// picked byte by byte instead of taking the top bits of a.
	switch(size_hash){
		case 1:
			// high nibble of byte 0
			hash[linidx] = (a & 0xf0u) >> 4;
			break;
		case 2:
			// byte 0
			hash[linidx] = a & 0xffu;
			break;
		case 3:
			// byte 0 followed by the high nibble of byte 1
			hash[linidx] = ((a & 0xffu) << 4) + ((a & 0xf000u) >> 12);
			break;
		case 4:
			// byte 0 followed by byte 1
			hash[linidx] = ((a & 0xffu) << 8) + ((a & 0xff00u) >> 8);
			break;
		default:
			// unsupported prefix length: leave hash[] untouched
			break;
	}
}