Th3NiKo

PDF - Table of Contents

Dec 9th, 2018
93
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.51 KB | None | 0 0
  1. import re     #REGEX
  2. import glob   #Search files
  3. import fitz   #PyMuPDF 1.14.2
  4.  
  5. #REGEX
  6. findEN = r"(^[a-z]{2})\s\D*?\s*([0-9]+)"
  7. #findHR = r"hr\D*\s*([0-9]+)\D*([0-9]*)"
  8.  
  9. #Search all pdfs from this directory
  10. pdfList = glob.glob('./*.pdf')
  11.  
  12.  
  13. #Go through each pdf
  14. for pdf in pdfList:
  15.     doc = fitz.open(pdf)
  16.     text = doc[0].getText("text")
  17.     numbero = doc.pageCount
  18.    
  19.     #Spis tresci
  20.     engTest = re.findall(findEN, text ,re.MULTILINE)
  21.     engTest = sorted(engTest, key=lambda tup: int(tup[1]))
  22.     for i,item in enumerate(engTest):
  23.         if "en" in item:
  24.             engStart = item[1]
  25.             if(len(engTest)>i+1):
  26.                 engEnd = engTest[i+1][1]
  27.             else:
  28.                 engEnd = doc.pageCount()
  29.         if "hr" in item:
  30.             hrStart = item[1]
  31.             if(len(engTest)>i+1):
  32.                 hrEnd = engTest[i+1][1]
  33.             else:
  34.                 hrEnd = doc.pageCount()
  35.  
  36.    
  37.  
  38.     #Save whole text to files
  39.     #EN
  40.     with open("EN", "a+", encoding="utf-8") as f:
  41.         for i in range(int(engStart)-1, int(engEnd)-1):
  42.             if i < numbero:
  43.                 actualPage = doc[i].getText("text")
  44.                 f.write(actualPage)
  45.                 f.truncate()
  46.     #HR
  47.     with open("HR", "a+", encoding="utf-8") as f:
  48.         for i in range(int(hrStart)-1, int(hrEnd)-1):
  49.             if i < numbero:
  50.                 actualPage = doc[i].getText("text")
  51.                 f.write(actualPage)
  52.                 f.truncate()
  53.  
  54.     doc.close()
Add Comment
Please, Sign In to add comment