Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # you might need to install tesseract or tesseract-ocr from your os pkg manager
- from urllib import request
- from PIL import Image
- from PIL import ImageFilter
- from pytesseract import pytesseract
def extract(file):
    """Run OCR on a captcha image and return the recognised text.

    The image is opened, converted to grayscale (mode ``'L'``), thresholded
    to pure black/white, passed through ``ImageFilter.MaxFilter`` and then
    handed to Tesseract via ``pytesseract``.

    Args:
        file: Whatever ``Image.open`` accepts; the caller in this file
            passes a filename.

    Returns:
        The value produced by ``pytesseract.image_to_string`` for the
        pre-processed image.
    """
    def to_black_or_white(level):
        # Levels below 30 or above 230 become white (255); everything in
        # the 30..230 band becomes black (0).
        return 0 if 30 <= level <= 230 else 255

    grayscale = Image.open(file).convert('L')
    # Apply the threshold to every pixel in one pass.
    binary = grayscale.point(to_black_or_white)
    cleaned = binary.filter(ImageFilter.MaxFilter)
    # Page segmentation mode 8: treat the image as a single word.
    # NOTE(review): recent Tesseract documentation spells this option
    # ``--psm``; confirm the installed version still accepts ``-psm``.
    return pytesseract.image_to_string(cleaned, config='-psm 8')
def main():
    """Download one captcha image, OCR it, and print the recognised text.

    The image is saved as ``captcha.jpg`` in the current working directory
    and then passed to ``extract``.
    """
    captcha_url = 'http://aao.hcmut.edu.vn/image/data/Tra_cuu/phpcaptcha/captcha.php'
    local_path = 'captcha.jpg'

    # Fetch the captcha and store it on disk so it can be opened by path.
    request.urlretrieve(captcha_url, local_path)

    recognised = extract(local_path)
    print(recognised)


# Only run the download/OCR when executed as a script, not on import.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement