Guest User

Untitled

a guest
Jan 4th, 2025
34
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.75 KB | None | 0 0
  1. import fitz
  2. import re
  3. import matplotlib.pyplot as plt
  4. from common_elements import loop_statements, show_image, line_by_line
  5.  
  6. def read_pdf(fName, debug,year):
  7. doc = fitz.open(fName)
  8. t_date = []
  9. p_date = []
  10. desc = []
  11. amount = []
  12. count_pages = 0
  13. last_page = 0
  14. for page in doc:
  15. paths = page.get_drawings()
  16. dup_p = page
  17. count_rect = 0
  18. t_d_rect, p_d_rect, d_rect, a_rect, rect_1, rect_2 = None, None, None, None, None, None
  19. count_pages+=1
  20. for path in paths:
  21. p = path['rect']
  22. if p.width > 200 and p.x0 > 40 and p.x1 < 350:
  23. if count_rect == 1:
  24. if debug > 1:
  25. dup_p.draw_rect(p,color=fitz.pdfcolor["blue"],width=2)
  26. rect_1 = p
  27. if count_rect == 2:
  28. if debug > 1:
  29. dup_p.draw_rect(p,color=fitz.pdfcolor["red"],width=2)
  30. rect_2 = p
  31.  
  32. if rect_1 != None and rect_2 != None and last_page != count_pages:
  33. t_d_rect = [rect_1.x0,rect_1.y0,rect_1.x0+47,rect_2.y0]
  34. p_d_rect = [rect_1.x0+48,rect_1.y0,rect_1.x0+91,rect_2.y0]
  35. d_rect = [rect_1.x0+92,rect_1.y0,rect_1.x0+255,rect_2.y0]
  36. a_rect = [rect_1.x0+256,rect_1.y0,rect_1.x1,rect_2.y0]
  37.  
  38. if debug > 10:
  39. dup_p.draw_rect(t_d_rect,color=fitz.pdfcolor["yellow"],width=2)
  40. dup_p.draw_rect(p_d_rect,color=fitz.pdfcolor["green"],width=2)
  41. dup_p.draw_rect(d_rect,color=fitz.pdfcolor["cyan"],width=2)
  42. dup_p.draw_rect(a_rect,color=fitz.pdfcolor["magenta"],width=2)
  43. if debug > 1:
  44. show_image(dup_p,"rectangles!")
  45.  
  46. t_date.append(page.get_text("text", clip=t_d_rect).split("\n"))
  47. p_date.append(page.get_text("text", clip=p_d_rect).split("\n"))
  48. desc_text = page.get_text("block", clip=d_rect)
  49. desc_text = desc_text.split("\n")
  50.  
  51. amount_text = page.get_text("text", clip=a_rect).replace("$","").replace(",","").split("\n")
  52. if desc_text[0] == "PREVIOUS" and desc_text[1] == "STATEMENT" and desc_text[2] == "BALANCE":
  53. desc_text = desc_text[3:]
  54. amount_text = amount_text[1:]
  55.  
  56. rem = []
  57. for d in range(0,len(desc_text)-1):
  58. if len(desc_text[d]) <= 13 and len(desc_text[d]) > 1 and len(desc_text[d-1]) > 19:
  59. desc_text[d-1] = desc_text[d-1] + " " + desc_text[d]
  60. desc_text[d] = ""
  61. rem.append(d)
  62.  
  63. remd = 0
  64. for r in rem:
  65. if r < len(desc_text)+remd-1:
  66. desc_text.pop(r-remd)
  67. remd+=1
  68. desc.append(desc_text)
  69. amount.append(amount_text)
  70. last_page = count_pages
  71. count_rect+=1
  72.  
  73. all_data = []
  74. t_year, p_year = year, year
  75. check_date = re.match(r".*(\d{4}_01_\d{2}).*", fName, re.IGNORECASE)
  76. for i in range(0,len(t_date)):
  77. for j in range(0,len(t_date[i])):
  78. if t_date[i][j] == "" or desc[i][j] == "PAYMENT - THANK YOU":
  79. continue
  80. if desc[i][j][-4:] == " NET":
  81. desc[i][j] = desc[i][j][0:-4]
  82. if re.match(r".*(amz\*amazon|amz\*ware|amazon.ca\*|amazon.*downtown).*", desc[i][j], re.IGNORECASE) != None:
  83. desc[i][j] = "Amazon.ca"
  84. if re.match(r".*(AMZN Mktp CA|Amazon \*Mark).*", desc[i][j], re.IGNORECASE) != None:
  85. desc[i][j] = "AMZN Mktp CA"
  86. if re.match(r".*(AMZN Mktp US).*", desc[i][j], re.IGNORECASE) != None:
  87. desc[i][j] = "AMZN Mktp US"
  88. if re.match(r".*(audible).*", desc[i][j], re.IGNORECASE) != None:
  89. desc[i][j] = "Audible CA"
  90. if check_date != None:
  91. if t_date[i][j][0:3] == "DEC":
  92. t_year = str(int(year)-1)
  93. else:
  94. t_year = year
  95. if p_date[i][j][0:3] == "DEC":
  96. p_year = str(int(year)-1)
  97. else:
  98. p_year = year
  99. all_data.append([year,t_date[i][j]+ " " + t_year, p_date[i][j]+ " " + p_year, desc[i][j], amount[i][j]])
  100. doc.close()
  101. plt.show()
  102. return all_data
  103.  
  104. dir = ".\\td\\"
  105. loop_statements(dir, read_pdf, "td")
Advertisement
Add Comment
Please, Sign In to add comment