Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import xlsxwriter
- from datetime import datetime
- import numpy as np
- import time
- import matplotlib.pyplot as plt
- import os
def loop_statements(dir, reader_f, act_type, debug=0):
    """Parse every PDF statement in a directory and export the result to xlsx.

    Each file in ``dir`` whose name ends in ``.pdf`` (case-insensitive) is
    handed to ``reader_f``; the collected results are labelled by
    ``filter_list`` and written to ``<act_type>.xlsx`` by ``make_xlsx``.

    Args:
        dir: Directory containing the statements. A trailing path separator
            is accepted but no longer required.
        reader_f: Callable invoked as ``reader_f(path, debug, year)`` where
            ``year`` is the first four characters of the file name. Its
            return value is appended as one statement.
        act_type: Account type tag; forwarded to ``filter_list`` and used as
            the output file stem.
        debug: When greater than 0, only the first four PDFs are parsed.

    Returns:
        None. Prints the elapsed time and the number of files parsed.
    """
    # In debug mode the original skipped every PDF from the fifth onwards.
    debug_file_limit = 4

    started_ms = round(time.time() * 1000)
    everything = []
    parsed = 0
    for entry in os.listdir(dir):
        # Lower-case only for the extension test: the real name must be kept
        # intact so the path still resolves on case-sensitive filesystems.
        if not entry.lower().endswith(".pdf"):
            continue
        if parsed >= debug_file_limit and debug > 0:
            break
        year = entry[0:4]
        path = os.path.join(dir, entry)
        everything.append(reader_f(path, debug, year))
        parsed += 1

    everything = filter_list(everything, act_type)
    make_xlsx(everything, act_type + ".xlsx")
    elapsed_ms = round(time.time() * 1000) - started_ms
    # Report the files actually parsed, not the PDFs merely seen.
    print("took", elapsed_ms, "ms to parse", parsed, "files")
def _ci(pattern):
    """Compile ``pattern`` case-insensitively."""
    return re.compile(pattern, re.IGNORECASE)


# Description normalisations, applied in order; each one sees the result of
# the previous rewrite.
_DESCRIPTION_REWRITES = (
    (_ci(r".*(amz\*amazon|amz\*ware|amazon.ca\*|amazon.*downtown).*"), "Amazon.ca"),
    (_ci(r".*(AMZN Mktp CA|Amazon \*Mark).*"), "AMZN Mktp CA"),
    (_ci(r".*(AMZN Mktp US).*"), "AMZN Mktp US"),
    (_ci(r".*(audible).*"), "Audible CA"),
    (_ci(r".*(MEC|Mountain Equipment).*"), "MEC - Mountain Equipment"),
    (_ci(r".*(Steamgames|steam.*seattle).*"), "Steam Games"),
)

_GROCERY_RE = _ci(r".*(foods|super a|IGA|Safeway|Buy-low|buy low|freson|nofrills|pc express|superstore|save on|no frills).*")
_FAST_FOOD_RE = _ci(r".*(restaur|kura|pastry|red beard|pho a pho|mr mikes|tandoori|sushi|the shack|bakery|pizza|spicy gre|A&W|subway|dairy queen|wendys|taco de|tim horton|pizzeria|gus' pizza|panago|DQ Grill).*")
_GAS_RE = _ci(r".*(chv[0-9]{5}|chevron|mohawk|canco|gti pet|co-op|7-eleven|Shell|esso|husky|petrocan|macs|7 eleven|petro canada).*")
_INTERNET_RE = _ci(r".*(Telus comm|tsi).*")
_FUEL_RE = _ci(r".*(Shell easypay|shell ep).*")
_PHONE_RE = _ci(r".*(fido mobile|freedom|koodo|virgin|public mob|telus mob).*")
_AMAZON_RE = _ci(r".*(audible|amazon|amzn).*")
_RENT_RE = _ci(r".*(cheque withdrawal).*")
_PAYPAL_RE = _ci(r".*(paypal).*")
_POWER_RE = _ci(r".*(BC HYDRO).*")
_PAY_RE = _ci(r".*(from canada|province of b\.c).*")
_INVEST_RE = _ci(r".*(INTERAC e-Transfer).*")


def _pick_label(desc, amount, act_type, cc_re):
    """Return the category label for one (already normalised) description."""

    def hit(rx):
        return rx.match(desc) is not None

    if hit(_GROCERY_RE):
        return "grocery"
    if hit(_FAST_FOOD_RE):
        return "fast food"
    if hit(_FUEL_RE):
        return "fuel"
    if hit(_GAS_RE):
        # Purchases under 20 at a gas station are labelled as the station
        # itself rather than as fuel.
        return "gas station" if amount < 20 else "fuel"
    if hit(_INTERNET_RE):
        return "internet"
    if hit(_PHONE_RE):
        return "cell phone"
    if hit(_AMAZON_RE):
        return "amazon"
    if hit(_PAYPAL_RE):
        return "paypal"
    if hit(_POWER_RE):
        return "power"
    if hit(_RENT_RE) and 470 < amount < 600:
        return "rent"
    if hit(_PAY_RE) and amount > 1000:
        return "payroll"
    if hit(_PAY_RE) and amount < 1000:
        return "expenses"
    if cc_re is not None and cc_re.search(desc) is not None:
        return "credit cards"
    if hit(_INVEST_RE) and 2800 < amount < 3000:
        return "investing"
    if act_type == "tngch":
        return "junk"
    return "misc"


def filter_list(everything, act_type, cc_pattern=None):
    """Normalise merchant descriptions and tag every transaction with a label.

    ``everything`` is a list of statements, each a list of transactions; a
    transaction is a mutable list whose index 3 is the description and whose
    index 4 is the amount. Transactions are modified in place: the
    description may be rewritten to a canonical merchant name, and two items
    are appended — the category label and ``act_type``.

    Args:
        everything: List of statements (lists of transaction lists).
        act_type: Account type tag appended to every transaction. The value
            ``"tngch"`` turns the fallback label into ``"junk"`` instead of
            ``"misc"``.
        cc_pattern: Optional regular expression (string, matched
            case-insensitively, or a pre-compiled pattern) identifying
            credit-card payment descriptions. Matching transactions are
            labelled ``"credit cards"`` and their amount is sign-flipped and
            stored back as a string. The original code tested an undefined
            ``check_cc`` variable here, raising NameError for every
            transaction that reached this branch; the pattern it was meant
            to use is not recoverable from the file, so with the default
            ``None`` the branch simply never matches.

    Returns:
        The same ``everything`` list, after in-place modification.
    """
    if cc_pattern is None:
        cc_re = None
    elif isinstance(cc_pattern, str):
        cc_re = re.compile(cc_pattern, re.IGNORECASE)
    else:
        cc_re = cc_pattern

    for statement in everything:
        for txn in statement:
            # An unparsable or missing amount is treated as 0 so labelling
            # can still proceed.
            try:
                amount = float(txn[4])
            except (IndexError, TypeError, ValueError):
                amount = 0

            if txn[3] == "NET" and amount == 120:
                txn[3] = "ANNUAL FEE"
            for rx, canonical in _DESCRIPTION_REWRITES:
                if rx.match(txn[3]) is not None:
                    txn[3] = canonical

            label = _pick_label(txn[3], amount, act_type, cc_re)
            if label == "credit cards":
                txn[4] = str(amount * -1)

            txn.append(label)
            txn.append(act_type)
    return everything
def make_xlsx(everything, fName):
    """Write all parsed statements to a single-sheet xlsx workbook.

    Args:
        everything: List of statements, each a list of transactions indexed
            as ``[year, t_date, p_date, description, amount, label,
            act type]``.
        fName: Output file name. It also selects how ``t_date`` is parsed:
            ``"tngch.xlsx"`` uses ``%d %b %Y`` and ``"td.xlsx"`` uses
            ``%b %d %Y``. For any other name the date is written as text.

    Returns:
        None. The workbook is written to ``fName``.
    """
    headers = ("year", "t_date", "p_date", "description", "amount", "label", "act type")
    date_layouts = {"tngch.xlsx": "%d %b %Y", "td.xlsx": "%b %d %Y"}
    layout = date_layouts.get(fName)

    # The context manager closes the workbook even if a write raises
    # (e.g. a non-numeric year), which the original did not guarantee.
    with xlsxwriter.Workbook(fName) as wb:
        ws = wb.add_worksheet("statements")
        date_format = wb.add_format({'num_format': 'd mmm yyyy'})
        for col, title in enumerate(headers):
            ws.write(0, col, title)

        row = 1
        for statement in everything:
            for txn in statement:
                # Rows without a numeric amount are reported and skipped.
                try:
                    amount = float(txn[4])
                except (IndexError, TypeError, ValueError) as e:
                    print(e)
                    continue

                ws.write_number(row, 0, int(txn[0]))

                parsed_date = None
                if layout is not None:
                    try:
                        parsed_date = datetime.strptime(txn[1], layout).date()
                    except (TypeError, ValueError) as e:
                        print(e)
                if parsed_date is not None:
                    ws.write_datetime(row, 1, parsed_date, date_format)
                else:
                    # Fall back to the raw text when the date is unparsable
                    # or no layout is known for this file name.
                    ws.write(row, 1, str(txn[1]))

                ws.write(row, 2, str(txn[2]))
                ws.write(row, 3, str(txn[3]))
                ws.write_number(row, 4, amount)
                ws.write(row, 5, str(txn[5]))
                ws.write(row, 6, str(txn[6]))
                row += 1
            # Leave one empty spacer row after each statement.
            row += 1

        ws.set_column(0, 0, 4.3)
        ws.set_column(1, 1, 11.45)
        ws.set_column(2, 2, 11.45)
        ws.set_column(3, 3, 71.5)
        ws.set_column(4, 4, 9.7)
def show_image(item, title=""):
    """Render ``item`` to a pixmap and display it with matplotlib.

    Args:
        item: Any object exposing ``get_pixmap(dpi=...)`` (e.g. a PyMuPDF
            page). The returned pixmap is read as 3 bytes per pixel —
            assumes RGB without an alpha channel.
        title: Text shown as the figure title.

    The pixmap is rendered at a fixed DPI and the figure is created at the
    same DPI; the image extent is expressed in points (1/72 inch).
    """
    DPI = 220  # fixed rendering resolution
    pixmap = item.get_pixmap(dpi=DPI)
    height, width = pixmap.h, pixmap.w

    # View the pixmap's sample buffer as a (rows, cols, 3) uint8 array.
    pixels = np.ndarray([height, width, 3], dtype=np.uint8, buffer=pixmap.samples_mv)

    plt.figure(dpi=DPI)
    plt.title(title)
    # Convert pixel dimensions to points; the y axis runs top-down.
    extent_pts = (0, width * 72 / DPI, height * 72 / DPI, 0)
    _ = plt.imshow(pixels, extent=extent_pts)
def line_by_line(stuff):
    """Print each item of ``stuff`` preceded by its zero-based position."""
    for position, entry in enumerate(stuff):
        print(position, entry)
Advertisement
Add Comment
Please, Sign In to add comment