# PoC Captcha Recognizer - BKHCM

# you might need to install tesseract or tesseract-ocr from your os pkg manager
from urllib import request

from PIL import Image
from PIL import ImageFilter
from pytesseract import pytesseract


def extract(file):
    """Run OCR on a captcha image and return the recognized text.

    The image is converted to grayscale, binarized, passed through a max
    filter and then handed to Tesseract as a single word.

    Args:
        file: Anything ``PIL.Image.open`` accepts (a filename or a binary
            file object).

    Returns:
        The text exactly as returned by ``pytesseract.image_to_string``.
    """
    # Close the source file as soon as the grayscale copy has been made.
    with Image.open(file) as source:
        gray = source.convert('L')

    # Binarize: very dark (< 30) and very bright (> 230) pixels become white,
    # everything in between becomes black. point() applies the mapping
    # through a lookup table instead of a per-pixel Python loop.
    binary = gray.point(lambda level: 255 if level < 30 or level > 230 else 0)

    # Max filter (default 3x3 window): each pixel takes the brightest value
    # in its neighbourhood, which removes isolated black specks.
    cleaned = binary.filter(ImageFilter.MaxFilter)

    # Page segmentation mode 8: treat the image as a single word.
    # The double-dash spelling is required by current Tesseract releases;
    # the old single-dash '-psm' form is no longer accepted.
    return pytesseract.image_to_string(cleaned, config='--psm 8')


def main():
    """Download a fresh captcha image and print the text recognized in it."""
    captcha_url = 'http://aao.hcmut.edu.vn/image/data/Tra_cuu/phpcaptcha/captcha.php'
    local_path = 'captcha.jpg'

    # Save the captcha to the current working directory so extract() can
    # read it back from disk.
    request.urlretrieve(captcha_url, local_path)

    recognized = extract(local_path)
    print(recognized)


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()