Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
from datetime import datetime
from typing import Callable, Dict, Optional, Tuple

import fitz
# Directory containing this script; the PDFs to rename are expected next to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Plain-text report listing every PDF that was not renamed, and why.
LOG_FILE = os.path.join(BASE_DIR, "skipped_files_log.txt")

# Keyword -> (sort prefix, label).
#
# Keywords are compared against the PDF text after normalize_text() has
# lower-cased it and removed whitespace and dashes, so they must be written
# without spaces or hyphens.  Dict order is the priority order: the first
# keyword found in a document decides its name.
rename_mapping = {
    # "w2g" must come before "formw2": "Form W-2G" normalizes to "formw2g",
    # which also contains "formw2" and would otherwise be filed as a W-2.
    "w2g": ("0015", "W-2G"),
    "formw2": ("0010", "W-2"),
    "w2statement": ("0010", "W-2"),
    "ssa1099": ("0030", "SSA"),
    "form1099r": ("0020", "1099-R"),
    "1099r": ("0020", "1099-R"),
    # Was "k-1", which could never match because hyphens are stripped from the
    # text.  A bare "k1" would be too greedy (e.g. "bank 1099" -> "bank1099"),
    # so the keyword is anchored on "Schedule K-1".
    "schedulek1": ("0050", "K-1"),
    "form1095c": ("0065", "HEALTH"),
    "1095c": ("0065", "HEALTH"),
    "form1095a": ("0065", "HEALTH"),
    "1095a": ("0065", "HEALTH"),
    "form1099int": ("0040", "INT DIV SCH D"),
    "form1099div": ("0040", "INT DIV SCH D"),
    "form1099b": ("0040", "INT DIV SCH D"),
    "interestincome": ("0040", "INT DIV SCH D"),
    "ordinarydividends": ("0040", "INT DIV SCH D"),
    "proceedsfrombroker": ("0040", "INT DIV SCH D"),
    "healthequity": ("0060", "HSA"),
    "hsastatement": ("0060", "HSA"),
    "1099sa": ("0060", "HSA"),
    "form1099sa": ("0060", "HSA"),
    "form5498sa": ("0060", "HSA"),
    "1098t": ("0070", "EDUCATION"),
    "1099q": ("0070", "EDUCATION"),
    "1098e": ("0070", "EDUCATION"),
    "mortgageinterest": ("1000", "SCH A - MORTGAGE"),
    "formptc": ("0080", "NE PTC"),
    "registration": ("1010", "SCH A - VEHICLE"),
    "ptclookup": ("0080", "NE PTC"),
    "donationreceipt": ("1020", "SCH A - DONATIONS"),
    "giftstatement": ("1020", "SCH A - DONATIONS"),
    "donation": ("1020", "SCH A - DONATIONS"),
    "charitable": ("1020", "SCH A - DONATIONS"),
    "501c": ("1020", "SCH A - DONATIONS"),
}

# Translation table deleting the ASCII hyphen and the Unicode dashes that PDF
# text extraction commonly yields in form names such as "1099-INT".
_DASHES = dict.fromkeys(map(ord, "-\u2010\u2011\u2012\u2013\u2014\u2212"))


def normalize_text(text: str) -> str:
    """Return *text* lower-cased with all whitespace and dashes removed.

    Both the extracted PDF text and the keywords are passed through this
    function so that line breaks, tabs or hyphenation in the PDF cannot hide
    a form name.
    """
    return "".join(text.lower().split()).translate(_DASHES)


def classify_text(
    text: str,
    mapping: Optional[Dict[str, Tuple[str, str]]] = None,
) -> Optional[Tuple[str, str]]:
    """Return the ``(prefix, label)`` of the first keyword found in *text*.

    Args:
        text: Raw text of a document.
        mapping: Keyword table to search, in priority order.  Defaults to
            the module-level ``rename_mapping``.

    Returns:
        The matching ``(prefix, label)`` pair, or ``None`` if no keyword
        occurs in the normalized text.
    """
    if mapping is None:
        mapping = rename_mapping
    haystack = normalize_text(text)
    for keyword, target in mapping.items():
        needle = normalize_text(keyword)
        # An empty needle is a substring of everything; never let it match.
        if needle and needle in haystack:
            return target
    return None


def extract_pdf_text(path: str) -> str:
    """Return the concatenated text of every page of the PDF at *path*."""
    doc = fitz.open(path)
    try:
        return "".join(page.get_text() for page in doc)
    finally:
        # Close the document even when text extraction raises.
        doc.close()


def _rename_one(
    filename: str,
    base_dir: str,
    counters: Dict[str, int],
    extract_text: Callable[[str], str],
) -> Optional[str]:
    """Try to rename one PDF; return the skip reason, or ``None`` on success.

    *counters* maps each prefix to the last sequence number handed out and is
    updated in place.
    """
    # NOTE(review): the non-ASCII prefixes in the messages below look like
    # mis-decoded emoji; they are kept exactly as found in the source.
    full_path = os.path.join(base_dir, filename)
    try:
        text = extract_text(full_path)
    except Exception as e:  # per-file boundary: one bad PDF must not stop the run
        reason = f"Error reading PDF: {e}"
        print(f"ā {reason}")
        return reason

    target = classify_text(text)
    if target is None:
        reason = "No keyword match"
        print(f"ā ļø Skipped: {filename} - {reason}")
        return reason

    prefix, label = target
    # The number is consumed even if the rename is skipped below, so the next
    # file with the same prefix does not hit the same existing target again.
    counters[prefix] = counters.get(prefix, 0) + 1
    new_filename = f"{prefix}.{counters[prefix]} {label}.pdf"
    new_path = os.path.join(base_dir, new_filename)

    if os.path.exists(new_path):
        reason = "Target file already exists"
        print(f"ā ļø Skipped {filename}: {reason}")
        return reason

    try:
        os.rename(full_path, new_path)
    except OSError as e:
        # E.g. the file is open elsewhere or not writable: record and move on.
        reason = f"Rename failed: {e}"
        print(f"ā ļø Skipped {filename}: {reason}")
        return reason

    print(f"ā Renamed: {filename} ā {new_filename}")
    return None


def rename_pdfs(
    base_dir: str = BASE_DIR,
    log_file: str = LOG_FILE,
    extract_text: Optional[Callable[[str], str]] = None,
) -> Tuple[int, int]:
    """Rename every recognised PDF in *base_dir* and log the ones skipped.

    Each ``.pdf`` file is read, matched against ``rename_mapping`` and renamed
    to ``"<prefix>.<n> <label>.pdf"``, where ``n`` counts files per prefix.
    Files that cannot be read, match no keyword, or whose target name is
    already taken are left in place and listed in *log_file*.

    Args:
        base_dir: Directory to scan (not recursive).
        log_file: Path of the skip log; it is overwritten on every run.
        extract_text: Callable returning the text of a PDF path.  Defaults to
            ``extract_pdf_text``.

    Returns:
        ``(renamed_total, skipped_total)``.
    """
    if extract_text is None:
        extract_text = extract_pdf_text

    counters: Dict[str, int] = {}
    renamed_total = 0
    skipped_total = 0

    # UTF-8 so that file names with non-ASCII characters can always be logged.
    with open(log_file, "w", encoding="utf-8") as log:
        log.write(f"Skipped Files Log - {datetime.now()}\n")
        log.write("=" * 40 + "\n")

        # Sorted snapshot: numbering is reproducible and renames performed
        # during the loop cannot affect what is iterated.
        for filename in sorted(os.listdir(base_dir)):
            if not filename.lower().endswith(".pdf"):
                continue
            print(f"\nš Checking: {filename}")
            reason = _rename_one(filename, base_dir, counters, extract_text)
            if reason is None:
                renamed_total += 1
            else:
                skipped_total += 1
                log.write(f"{filename}: {reason}\n")

    print(f"\nā Done. Renamed: {renamed_total} | Skipped: {skipped_total}")
    print(f"š Log saved to: {log_file}")
    return renamed_total, skipped_total


if __name__ == "__main__":
    rename_pdfs()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement