Advertisement
Guest User

PoC Captcha Recognizer - BKHCM

a guest
Oct 13th, 2016
303
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.00 KB | None | 0 0
  1. # you might need to install tesseract or tesseract-ocr from your os pkg manager
  2. from urllib import request
  3.  
  4. from PIL import Image
  5. from PIL import ImageFilter
  6. from pytesseract import pytesseract
  7.  
  8.  
  9. def extract(file):
  10.     # load img and convert it to grayscale
  11.     img = Image.open(file).convert('L')
  12.     # load px matrix
  13.     px = img.load()
  14.  
  15.     # binarize img
  16.     for y in range(img.size[1]):
  17.         for x in range(img.size[0]):
  18.             if px[x, y] < 30 or px[x, y] > 230:
  19.                 px[x, y] = 255
  20.             else:
  21.                 px[x, y] = 0
  22.  
  23.     img = img.filter(ImageFilter.MaxFilter)
  24.  
  25.     # tesseract -psm 8: treat the image as a single word
  26.     text = pytesseract.image_to_string(img, config='-psm 8')
  27.  
  28.     # result
  29.     return text
  30.  
  31.  
  32. def main():
  33.     # download captcha image
  34.     request.urlretrieve('http://aao.hcmut.edu.vn/image/data/Tra_cuu/phpcaptcha/captcha.php', 'captcha.jpg')
  35.     # print result
  36.     print(extract('captcha.jpg'))
  37.  
  38.  
  39. if __name__ == '__main__':
  40. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement