Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import subprocess
- import pandas as pd
- import os
def convert_pdf_to_text(pdf_path, text_path):
    """Extract the text of a PDF into a text file by running Ghostscript.

    Invokes the ``gs`` executable with the ``txtwrite`` device, directing its
    output to ``text_path``.

    Args:
        pdf_path: Path of the PDF file passed to Ghostscript as input.
        text_path: Path given to Ghostscript as ``-sOutputFile``.

    Raises:
        subprocess.CalledProcessError: If Ghostscript exits with a non-zero
            status.
    """
    command = [
        "gs",
        "-q",
        "-dNOPAUSE",
        "-dBATCH",
        "-sDEVICE=txtwrite",
        f"-sOutputFile={text_path}",
        pdf_path,
    ]
    # check=True makes a failed conversion raise instead of being silently
    # ignored, so callers do not go on to report or consume a missing file.
    subprocess.run(command, check=True)
def convert_text_to_xlsx(text_path, xlsx_path):
    """Convert a whitespace-separated text file into an XLSX workbook.

    Every line of the text file becomes one spreadsheet row, and every
    whitespace-separated token on that line becomes one cell. The sheet is
    written without an index column and without a header row.

    If the text file does not exist, a message is printed and no workbook is
    written.

    Args:
        text_path: Path of the text file to read.
        xlsx_path: Path of the XLSX file to create.
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding.
        # NOTE(review): assumes the input is UTF-8 (what Ghostscript's
        # txtwrite device is expected to emit by default) - confirm.
        with open(text_path, "r", encoding="utf-8") as file:
            data = [tuple(line.split()) for line in file]
    except FileNotFoundError:
        print(f"Plik tekstowy {text_path} nie istnieje.")
        return

    df = pd.DataFrame(data)
    df.to_excel(xlsx_path, index=False, header=False)
    print(f"Plik XLSX {xlsx_path} utworzony.")
if __name__ == "__main__":
    # Script entry point: convert one fixed PDF to text, then to XLSX.
    # NOTE: despite its name, `hostname` is used as the directory name under
    # /home in the paths below.
    hostname = "michael"
    pdf_file = f"/home/{hostname}/SIWB/TEST_XLSX/inf-s1.pdf"
    text_file = f"/home/{hostname}/SIWB/TEST_XLSX/plik.txt"
    xlsx_file = f"/home/{hostname}/SIWB/TEST_XLSX/plik.xlsx"

    convert_pdf_to_text(pdf_file, text_file)
    # Report success only when the text file is actually present; previously
    # the message was printed even if no file had been produced.
    if os.path.exists(text_file):
        print(f"Plik tekstowy {text_file} utworzony.")
    convert_text_to_xlsx(text_file, xlsx_file)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement