Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import glob  # Search files
import re  # REGEX

import fitz  # PyMuPDF 1.14.2
# Table-of-contents entry: a two-letter lowercase language code at the start
# of a line, followed by whitespace, any non-digit text, and a page number.
findEN = r"(^[a-z]{2})\s\D*?\s*([0-9]+)"

# Language code found in the table of contents -> output file it is appended to.
OUTPUT_FILES = (("en", "EN"), ("hr", "HR"))


def section_ranges(toc_text, page_count):
    """Return the page range of every language section listed in *toc_text*.

    Args:
        toc_text: Text of the table-of-contents page.
        page_count: Number of pages in the document.

    Returns:
        A dict mapping each two-letter language code to a ``(first, stop)``
        pair of 0-based page indices, suitable for ``range(first, stop)``.
        A section runs from its own listed page up to (not including) the
        listed page of the next section in page order; the last section runs
        through the final page of the document. If a code is listed more than
        once, the entry with the highest page number wins.
    """
    entries = sorted(
        re.findall(findEN, toc_text, re.MULTILINE),
        key=lambda entry: int(entry[1]),
    )
    ranges = {}
    for position, (lang, page) in enumerate(entries):
        if position + 1 < len(entries):
            # 1-based start page of the following section.
            next_start = int(entries[position + 1][1])
        else:
            # No following section: one past the last 1-based page, so the
            # final page of the document is included.
            next_start = page_count + 1
        # Convert 1-based page numbers to 0-based indices and clamp them to
        # the document, so a bogus page number can neither index past the end
        # nor turn into a negative index.
        first = max(int(page) - 1, 0)
        stop = min(next_start - 1, page_count)
        ranges[lang] = (first, stop)
    return ranges


def extract_sections(pdf_path):
    """Append the English and Croatian sections of one PDF to "EN" and "HR".

    The first page is read as the table of contents. A language that is not
    listed there is skipped for this PDF instead of raising or reusing the
    bounds computed for a previous file.
    """
    doc = fitz.open(pdf_path)
    try:
        # pageCount is read as an integer attribute, not called as a method.
        page_count = doc.pageCount
        if page_count == 0:
            return
        # Table of contents
        ranges = section_ranges(doc[0].getText("text"), page_count)
        # Save whole text to files
        for lang, out_name in OUTPUT_FILES:
            if lang not in ranges:
                continue
            first, stop = ranges[lang]
            with open(out_name, "a", encoding="utf-8") as out:
                for index in range(first, stop):
                    out.write(doc[index].getText("text"))
    finally:
        # Release the document even when extraction fails part-way.
        doc.close()


def main():
    """Process every PDF in the current directory."""
    # Sorted so the order in which files are appended is deterministic.
    for pdf in sorted(glob.glob('./*.pdf')):
        extract_sections(pdf)


if __name__ == "__main__":
    main()
Add Comment
Please, Sign In to add comment