Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import fitz
- import re
- import matplotlib.pyplot as plt
- from common_elements import loop_statements, show_image, line_by_line
# NOTE(review): the original paste had lost all indentation; the nesting used
# below is a reconstruction from the statement order -- confirm against the
# author's copy before relying on edge-case behaviour.

# x-offsets, measured from the left edge of the table rectangle, at which each
# of the four columns starts/ends.
_T_DATE_END = 47
_P_DATE_START, _P_DATE_END = 48, 91
_DESC_START, _DESC_END = 92, 255
_AMOUNT_START = 256

# Leading description lines of a "previous statement balance" row, which has
# one amount but no dates and therefore must be dropped to keep columns aligned.
_PREVIOUS_BALANCE_HEADER = ["PREVIOUS", "STATEMENT", "BALANCE"]

# (pattern, canonical name) pairs, tried in order; the first match wins.
_MERCHANT_ALIASES = (
    (re.compile(r".*(amz\*amazon|amz\*ware|amazon.ca\*|amazon.*downtown).*", re.IGNORECASE), "Amazon.ca"),
    (re.compile(r".*(AMZN Mktp CA|Amazon \*Mark).*", re.IGNORECASE), "AMZN Mktp CA"),
    (re.compile(r".*(AMZN Mktp US).*", re.IGNORECASE), "AMZN Mktp US"),
    (re.compile(r".*(audible).*", re.IGNORECASE), "Audible CA"),
)

# File names containing "<yyyy>_01_<dd>" are treated as January statements.
_JANUARY_STATEMENT = re.compile(r".*(\d{4}_01_\d{2}).*", re.IGNORECASE)


def _find_table_bounds(page, debug):
    """Return the (top, bottom) rectangles delimiting the table, or None.

    Only drawings wider than 200 that lie between x=40 and x=350 are
    considered; the second and third of those are used as the bounds.
    """
    # NOTE(review): assumes the counter advances only for qualifying
    # rectangles (not for every drawing on the page) -- confirm.
    matches = 0
    top = None
    for path in page.get_drawings():
        rect = path["rect"]
        if not (rect.width > 200 and rect.x0 > 40 and rect.x1 < 350):
            continue
        if matches == 1:
            if debug > 1:
                page.draw_rect(rect, color=fitz.pdfcolor["blue"], width=2)
            top = rect
        elif matches == 2:
            if debug > 1:
                page.draw_rect(rect, color=fitz.pdfcolor["red"], width=2)
            # Both bounds are known; later drawings are irrelevant.
            return top, rect
        matches += 1
    return None


def _merge_wrapped_descriptions(lines):
    """Join short continuation lines onto the long line that precedes them.

    A line of 2-13 characters that follows a line longer than 19 characters
    is appended to that line (space separated) and removed. The final line is
    never treated as a continuation, matching the original loop bounds.
    """
    lines = list(lines)
    absorbed = set()
    # Start at 1: index 0 has no predecessor. The original started at 0 and
    # therefore compared the first line against the *last* one (index -1).
    for idx in range(1, len(lines) - 1):
        current, previous = lines[idx], lines[idx - 1]
        if 1 < len(current) <= 13 and len(previous) > 19:
            lines[idx - 1] = previous + " " + current
            # Blank it so the following line cannot chain onto this one.
            lines[idx] = ""
            absorbed.add(idx)
    return [line for idx, line in enumerate(lines) if idx not in absorbed]


def _extract_columns(page, top, bottom, debug):
    """Read the four table columns of *page* as lists of text lines.

    Returns a tuple ``(t_dates, p_dates, descriptions, amounts)``.
    """
    left, upper, lower = top.x0, top.y0, bottom.y0
    t_d_rect = [left, upper, left + _T_DATE_END, lower]
    p_d_rect = [left + _P_DATE_START, upper, left + _P_DATE_END, lower]
    d_rect = [left + _DESC_START, upper, left + _DESC_END, lower]
    a_rect = [left + _AMOUNT_START, upper, top.x1, lower]

    if debug > 10:
        page.draw_rect(t_d_rect, color=fitz.pdfcolor["yellow"], width=2)
        page.draw_rect(p_d_rect, color=fitz.pdfcolor["green"], width=2)
        page.draw_rect(d_rect, color=fitz.pdfcolor["cyan"], width=2)
        page.draw_rect(a_rect, color=fitz.pdfcolor["magenta"], width=2)
    if debug > 1:
        show_image(page, "rectangles!")

    t_dates = page.get_text("text", clip=t_d_rect).split("\n")
    p_dates = page.get_text("text", clip=p_d_rect).split("\n")
    # The original passed "block", which is not a documented option name; the
    # result is split as a string, so plain-text extraction is what is needed.
    descriptions = page.get_text("text", clip=d_rect).split("\n")
    amounts = (
        page.get_text("text", clip=a_rect)
        .replace("$", "")
        .replace(",", "")
        .split("\n")
    )

    # Slice comparison avoids an IndexError on pages with fewer than 3 lines.
    if descriptions[:3] == _PREVIOUS_BALANCE_HEADER:
        descriptions = descriptions[3:]
        amounts = amounts[1:]

    return t_dates, p_dates, _merge_wrapped_descriptions(descriptions), amounts


def _normalize_description(text):
    """Strip a trailing " NET" and map known merchant variants to one name."""
    if text.endswith(" NET"):
        text = text[:-4]
    for pattern, canonical in _MERCHANT_ALIASES:
        if pattern.match(text) is not None:
            return canonical
    return text


def _year_for(date_text, year, january_statement):
    """Return the year to attach to *date_text*.

    On a January statement, dates starting with "DEC" get the previous year.
    """
    if january_statement and date_text[0:3] == "DEC":
        return str(int(year) - 1)
    return year


def read_pdf(fName, debug, year):
    """Extract table rows from the PDF statement *fName*.

    Args:
        fName: Path of the PDF; also inspected for a ``<yyyy>_01_<dd>``
            fragment to detect January statements.
        debug: Verbosity level. Above 1 the detected bounds are drawn on the
            page and passed to ``show_image``; above 10 the column rectangles
            are drawn as well.
        year: Statement year as a string (it is concatenated to the dates).

    Returns:
        A list of rows ``[year, t_date + " " + year, p_date + " " + year,
        description, amount]``. The two date columns are presumably the
        transaction and posting dates -- confirm against a real statement.
        Rows with an empty first date or the description
        "PAYMENT - THANK YOU" are skipped.

    Raises:
        IndexError: If a page's columns do not contain matching line counts.
    """
    doc = fitz.open(fName)
    try:
        pages = []
        for page in doc:
            bounds = _find_table_bounds(page, debug)
            if bounds is not None:
                pages.append(_extract_columns(page, bounds[0], bounds[1], debug))
    finally:
        # Release the document even when extraction fails part-way.
        doc.close()

    january_statement = _JANUARY_STATEMENT.match(fName) is not None
    all_data = []
    for t_dates, p_dates, descriptions, amounts in pages:
        for j, t_date in enumerate(t_dates):
            if t_date == "" or descriptions[j] == "PAYMENT - THANK YOU":
                continue
            description = _normalize_description(descriptions[j])
            p_date = p_dates[j]
            t_year = _year_for(t_date, year, january_statement)
            p_year = _year_for(p_date, year, january_statement)
            all_data.append([
                year,
                t_date + " " + t_year,
                p_date + " " + p_year,
                description,
                amounts[j],
            ])

    plt.show()
    return all_data
# Folder scanned for statements; named so it does not shadow the builtin dir().
statements_dir = ".\\td\\"
loop_statements(statements_dir, read_pdf, "td")
Advertisement
Add Comment
Please, Sign In to add comment