Guest User

Untitled

a guest
Apr 19th, 2018
85
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.97 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Thu Apr 5 17:23:53 2018
  4. @author: ADubey4
  5. """
  6.  
  7. import cv2
  8. from pytesseract import pytesseract as pt
  9. import pandas as pd
  10. from PIL import Image
  11. import sys
  12. import pdf2image
  13. import numpy as np
  14. from signatureExtractor import getSignatureFromPage, getSignature
  15. #from matplotlib import pyplot as plt
  16.  
  17. if sys.version_info[0] < 3:
  18. from StringIO import StringIO
  19. else:
  20. from io import StringIO
  21.  
  22.  
  23.  
  24. pt.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
  25. file = r"C:\Users\file\path"
  26.  
  27.  
  28. #search_word = "sincerely"
  29. search_word = "signature"
  30.  
  31. signature_margin = (400, 400, 400, 400) # (left, top, right, bottom)
  32. is_signature_margin_percentage = True # if true its percentage of the box dimensions, else the exact pixels
  33.  
  34.  
  35. if(is_signature_margin_percentage):
  36. signature_margin = [x/100.0 for x in signature_margin ]
  37.  
  38. if file.endswith(".pdf"):
  39. doc_page_images = pdf2image.convert_from_path(file, dpi = 300)
  40. else:
  41. img = cv2.imread(file,cv2.COLOR_RGB2BGR)
  42. doc_page_images=[img]
  43. # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  44.  
  45. def get_ocr_df(doc):
  46. ret = pt.image_to_data(doc).lower()
  47. TESTDATA = StringIO(ret)
  48. return pd.read_csv(TESTDATA, sep=r"\t", engine='python').dropna(axis=0, how='any')
  49.  
  50. def clean_printed(im):
  51. df = get_ocr_df(im)
  52. print(df)
  53. for i, row in df.iterrows():
  54. im[row.top:row.top + row.height+1, row.left:row.left + row.width+1] = 255
  55. return im
  56.  
  57. for page_no, doc in enumerate(doc_page_images):
  58. df = get_ocr_df(doc)
  59. signature_df = df[df['text'].str.contains(search_word)]
  60.  
  61. if type(doc) is np.ndarray:
  62. x2_max = doc.shape[0]
  63. y2_max = doc.shape[1]
  64. doc_type = "np"
  65. else:
  66. x2_max = doc.size[0]
  67. y2_max = doc.size[1]
  68. doc_type = "pil"
  69.  
  70. for i, row in signature_df.iterrows():
  71. print(page_no, i)
  72. if is_signature_margin_percentage :
  73. x1 = row.left - int(signature_margin[0]*row.width)
  74. y1 = row.top - int(signature_margin[1]*row.height)
  75. x2 = row.left + row.width + int(signature_margin[2]*row.width)
  76. y2 = row.top + row.height + int(signature_margin[3]*row.height)
  77. else:
  78. x1 = row.left - int(signature_margin[0])
  79. y1 = row.top - int(signature_margin[1])
  80. x2 = row.left + row.width + int(signature_margin[2])
  81. y2 = row.top + row.height + int(signature_margin[3])
  82.  
  83. if x1 < 0: x1=0
  84. if y1 < 0: y1=0
  85. if x2 > x2_max: x2 = x2_max
  86. if y2 > y2_max: x2 = y2_max
  87.  
  88. print(x1,y1, x2,y2)
  89. if type(doc) is np.ndarray:
  90. im = doc[y1:y2+1, x1:x2+1]
  91. Image.fromarray(im).show()
  92. else:
  93. im = doc.crop((x1, y1, x2, y2))
  94. im.show()
  95.  
  96. im = clean_printed(np.array(im))
  97. Image.fromarray(im).show()
  98. # signature = getSignatureFromPage(img = np.array(im))
  99. # signature = getSignature(img = signature)
  100. # cv2.imshow('Signature'+str(i), signature)
Add Comment
Please, Sign In to add comment