# Untitled

import fitz
import re
import matplotlib.pyplot as plt
from common_elements import loop_statements, show_image, line_by_line

# Merchant spellings collapsed to one canonical label. The patterns are applied
# in order, each one tested against the result of the previous replacement.
_MERCHANT_ALIASES = (
    (re.compile(r".*(amz\*amazon|amz\*ware|amazon.ca\*|amazon.*downtown).*", re.IGNORECASE), "Amazon.ca"),
    (re.compile(r".*(AMZN Mktp CA|Amazon \*Mark).*", re.IGNORECASE), "AMZN Mktp CA"),
    (re.compile(r".*(AMZN Mktp US).*", re.IGNORECASE), "AMZN Mktp US"),
    (re.compile(r".*(audible).*", re.IGNORECASE), "Audible CA"),
)


def _find_table_bounds(paths):
    """Return the 2nd and 3rd qualifying drawing rectangles, or None.

    A rectangle qualifies when it is wider than 200 and lies between x=40 and
    x=350.  The first qualifying rectangle is skipped; the next two are used
    as the upper and lower bounds of the transaction table.
    """
    candidates = []
    for path in paths:
        rect = path["rect"]
        if rect.width > 200 and rect.x0 > 40 and rect.x1 < 350:
            candidates.append(rect)
            if len(candidates) == 3:
                return candidates[1], candidates[2]
    return None


def _merge_wrapped_descriptions(lines):
    """Join short continuation lines onto the long line that precedes them.

    A line of 2..13 characters that follows a line longer than 19 characters
    is treated as the wrapped tail of that line.  The final element is never
    considered a continuation.  Scanning starts at index 1 so that the first
    line is never compared against the last one through a negative index.
    """
    merged = list(lines)
    absorbed = set()
    for idx in range(1, len(merged) - 1):
        current = merged[idx]
        if 1 < len(current) <= 13 and len(merged[idx - 1]) > 19:
            merged[idx - 1] = merged[idx - 1] + " " + current
            # Blank the slot so it cannot itself act as a "long" predecessor.
            merged[idx] = ""
            absorbed.add(idx)
    return [line for pos, line in enumerate(merged) if pos not in absorbed]


def _normalize_description(text):
    """Strip a trailing " NET" and map known merchant spellings to one label."""
    if text.endswith(" NET"):
        text = text[:-4]
    for pattern, label in _MERCHANT_ALIASES:
        if pattern.match(text) is not None:
            text = label
    return text


def read_pdf(fName, debug, year):
    """Extract transaction rows from a statement PDF.

    For every page, the table area is located from the page's vector drawings
    (see ``_find_table_bounds``) and cut into four fixed-width columns whose
    text is read line by line.

    Args:
        fName: Path of the PDF to open.  If it contains a ``dddd_01_dd`` stamp
            (presumably a January statement), dates starting with "DEC" are
            assigned to the previous year.
        debug: Verbosity level.  Above 1 the bounding rectangles are drawn and
            the page is passed to ``show_image``; above 10 the four column
            rectangles are drawn as well.
        year: Statement year as a string; it is concatenated onto the dates.

    Returns:
        A list of rows ``[year, first date + " " + year,
        second date + " " + year, description, amount]``.  Rows with an empty
        first date or the description "PAYMENT - THANK YOU" are skipped.
        Amounts are strings with "$" and "," removed.

    Raises:
        IndexError: If a page yields fewer description, second-date or amount
            lines than first-date lines.
    """
    t_date, p_date, desc, amount = [], [], [], []

    doc = fitz.open(fName)
    try:
        for page in doc:
            bounds = _find_table_bounds(page.get_drawings())
            if bounds is None:
                continue
            top, bottom = bounds

            if debug > 1:
                page.draw_rect(top, color=fitz.pdfcolor["blue"], width=2)
                page.draw_rect(bottom, color=fitz.pdfcolor["red"], width=2)

            # Column boundaries are fixed offsets from the table's left edge;
            # every column spans from the top of `top` to the top of `bottom`.
            left, upper, lower = top.x0, top.y0, bottom.y0
            t_d_rect = [left, upper, left + 47, lower]
            p_d_rect = [left + 48, upper, left + 91, lower]
            d_rect = [left + 92, upper, left + 255, lower]
            a_rect = [left + 256, upper, top.x1, lower]

            if debug > 10:
                page.draw_rect(t_d_rect, color=fitz.pdfcolor["yellow"], width=2)
                page.draw_rect(p_d_rect, color=fitz.pdfcolor["green"], width=2)
                page.draw_rect(d_rect, color=fitz.pdfcolor["cyan"], width=2)
                page.draw_rect(a_rect, color=fitz.pdfcolor["magenta"], width=2)
            if debug > 1:
                show_image(page, "rectangles!")

            t_date.append(page.get_text("text", clip=t_d_rect).split("\n"))
            p_date.append(page.get_text("text", clip=p_d_rect).split("\n"))
            # Plain-text extraction: the result is split on newlines.
            desc_text = page.get_text("text", clip=d_rect).split("\n")
            amount_text = (
                page.get_text("text", clip=a_rect)
                .replace("$", "")
                .replace(",", "")
                .split("\n")
            )

            # The opening-balance row has a wrapped three-line description and
            # one amount; drop both so the columns stay aligned.
            if desc_text[:3] == ["PREVIOUS", "STATEMENT", "BALANCE"]:
                desc_text = desc_text[3:]
                amount_text = amount_text[1:]

            desc.append(_merge_wrapped_descriptions(desc_text))
            amount.append(amount_text)
    finally:
        # Release the document even if extraction fails part-way through.
        doc.close()

    is_january_statement = (
        re.match(r".*(\d{4}_01_\d{2}).*", fName, re.IGNORECASE) is not None
    )

    def year_for(date_text):
        # Only "DEC" dates on a ``_01_`` statement roll back one year.
        if is_january_statement and date_text[0:3] == "DEC":
            return str(int(year) - 1)
        return year

    all_data = []
    for page_idx, first_dates in enumerate(t_date):
        second_dates = p_date[page_idx]
        descriptions = desc[page_idx]
        amounts = amount[page_idx]
        for row_idx, first_date in enumerate(first_dates):
            if first_date == "":
                continue
            description = descriptions[row_idx]
            if description == "PAYMENT - THANK YOU":
                continue
            second_date = second_dates[row_idx]
            all_data.append([
                year,
                first_date + " " + year_for(first_date),
                second_date + " " + year_for(second_date),
                _normalize_description(description),
                amounts[row_idx],
            ])

    # NOTE(review): called unconditionally; presumably displays any figures
    # created by show_image in debug mode -- confirm in common_elements.
    plt.show()
    return all_data

# Windows-style relative path passed to the statement driver.
# NOTE(review): presumably the folder holding the statement PDFs -- confirm.
# NOTE(review): this name shadows the builtin `dir`; it is left unchanged here
# because it is a module-level name.
dir = ".\\td\\"
# Runs at import time: hands the path, the read_pdf parser and the tag "td" to
# the shared driver.
# NOTE(review): presumably loop_statements calls read_pdf for each statement
# it finds under the path -- confirm in common_elements.
loop_statements(dir, read_pdf, "td")