Advertisement
Guest User

Untitled

a guest
Dec 23rd, 2023
397
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.10 KB | None | 0 0
  1. import subprocess
  2. import pandas as pd
  3. import os
  4.  
  5. def convert_pdf_to_text(pdf_path, text_path):
  6.     # use Ghostscript for PDF in text file
  7.     subprocess.run(["gs", "-q", "-dNOPAUSE", "-dBATCH", "-sDEVICE=txtwrite", f"-sOutputFile={text_path}", pdf_path])
  8.  
  9. def convert_text_to_xlsx(text_path, xlsx_path):
  10.     if os.path.exists(text_path):
  11.         with open(text_path, 'r') as file:
  12.             # Read lines of text
  13.             lines = file.readlines()
  14.             data = [tuple(line.split()) for line in lines]
  15.  
  16.         df = pd.DataFrame(data)
  17.         df.to_excel(xlsx_path, index=False, header=False)
  18.         print(f"Plik XLSX {xlsx_path} utworzony.")
  19.     else:
  20.         print(f"Plik tekstowy {text_path} nie istnieje.")
  21.  
  22. if __name__ == "__main__":
  23.     hostname = "michael"
  24.     pdf_file = f"/home/{hostname}/SIWB/TEST_XLSX/inf-s1.pdf"
  25.     text_file = f"/home/{hostname}/SIWB/TEST_XLSX/plik.txt"
  26.     xlsx_file = f"/home/{hostname}/SIWB/TEST_XLSX/plik.xlsx"
  27.  
  28.     convert_pdf_to_text(pdf_file, text_file)
  29.     print(f"Plik tekstowy {text_file} utworzony.")
  30.     convert_text_to_xlsx(text_file, xlsx_file)
  31.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement