Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """
- Created on Thu Apr 5 17:23:53 2018
- @author: ADubey4
- """
- import cv2
- from pytesseract import pytesseract as pt
- import pandas as pd
- from PIL import Image
- import sys
- import pdf2image
- import numpy as np
- from signatureExtractor import getSignatureFromPage, getSignature
- #from matplotlib import pyplot as plt
- if sys.version_info[0] < 3:
- from StringIO import StringIO
- else:
- from io import StringIO
# Path to the Tesseract executable that pytesseract should launch.
pt.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

# Input document: a PDF (rendered page by page) or a single image file.
file = r"C:\Users\file\path"

# Keyword looked up in the lower-cased OCR text; each hit anchors one crop.
# An alternative anchor that was tried previously: "sincerely".
search_word = "signature"

# Padding added around the matched word box, as (left, top, right, bottom).
signature_margin = (400, 400, 400, 400)
# True  -> margins are percentages of the matched box's width / height.
# False -> margins are absolute pixel counts.
is_signature_margin_percentage = True
if is_signature_margin_percentage:
    # Convert percentages to multipliers once (400 -> 4.0).
    signature_margin = [x / 100.0 for x in signature_margin]

# Case-insensitive extension check so "scan.PDF" is also treated as a PDF.
if file.lower().endswith(".pdf"):
    doc_page_images = pdf2image.convert_from_path(file, dpi=300)
else:
    # cv2.imread takes an IMREAD_* flag; the original passed a COLOR_*
    # conversion code, which is not a read mode.
    img = cv2.imread(file, cv2.IMREAD_COLOR)
    if img is None:
        # imread signals failure by returning None instead of raising.
        raise IOError("Could not read image file: {}".format(file))
    doc_page_images = [img]
def get_ocr_df(doc):
    """Run Tesseract on *doc* and return its tabular output as a DataFrame.

    The text produced by ``pt.image_to_data`` is lower-cased as a whole,
    parsed as tab-separated values, and every row that has a missing value
    in any column is discarded.
    """
    raw_tsv = pt.image_to_data(doc)
    tsv_buffer = StringIO(raw_tsv.lower())
    ocr_table = pd.read_csv(tsv_buffer, sep=r"\t", engine='python')
    return ocr_table.dropna(axis=0, how='any')
def clean_printed(im):
    """Overwrite every OCR-detected box in *im* with the value 255.

    The array is modified in place and also returned. The OCR table is
    printed before any pixels are changed.
    """
    detected = get_ocr_df(im)
    print(detected)
    for _, box in detected.iterrows():
        # End indices are one past the box's far edge plus one extra pixel,
        # exactly as in the slice bounds used originally.
        row_stop = box.top + box.height + 1
        col_stop = box.left + box.width + 1
        im[box.top:row_stop, box.left:col_stop] = 255
    return im
# For every page: OCR it, find each word containing `search_word`, crop a
# padded region around that word, show it, then blank out the OCR-detected
# boxes inside the crop and show the result again.
for page_no, doc in enumerate(doc_page_images):
    df = get_ocr_df(doc)
    signature_df = df[df['text'].str.contains(search_word)]

    # Page extent along x (width) and y (height).
    if isinstance(doc, np.ndarray):
        # ndarray images are indexed [row, column], so shape is
        # (height, width[, channels]): width is shape[1], height is shape[0].
        x2_max = doc.shape[1]
        y2_max = doc.shape[0]
        doc_type = "np"
    else:
        # PIL images expose size as (width, height).
        x2_max = doc.size[0]
        y2_max = doc.size[1]
        doc_type = "pil"

    for i, row in signature_df.iterrows():
        print(page_no, i)

        # Resolve the four paddings in pixels.
        if is_signature_margin_percentage:
            # Margins are multipliers of the matched box's own dimensions.
            pad_left = int(signature_margin[0] * row.width)
            pad_top = int(signature_margin[1] * row.height)
            pad_right = int(signature_margin[2] * row.width)
            pad_bottom = int(signature_margin[3] * row.height)
        else:
            pad_left = int(signature_margin[0])
            pad_top = int(signature_margin[1])
            pad_right = int(signature_margin[2])
            pad_bottom = int(signature_margin[3])

        # Expand the word box and clamp it to the page. The original clamped
        # the bottom edge by assigning to x2, leaving y2 unbounded.
        x1 = max(row.left - pad_left, 0)
        y1 = max(row.top - pad_top, 0)
        x2 = min(row.left + row.width + pad_right, x2_max)
        y2 = min(row.top + row.height + pad_bottom, y2_max)
        print(x1, y1, x2, y2)

        if doc_type == "np":
            im = doc[y1:y2 + 1, x1:x2 + 1]
            Image.fromarray(im).show()
        else:
            im = doc.crop((x1, y1, x2, y2))
            im.show()

        # np.array() copies the crop, so blanking does not touch `doc`.
        im = clean_printed(np.array(im))
        Image.fromarray(im).show()
        # TODO(review): a follow-up step using the imported
        # getSignatureFromPage / getSignature helpers on `im` was sketched
        # here but left disabled; wire it in if it is still wanted.
Add Comment
Please, Sign In to add comment