Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import PyPDF2
- import re
- import glob
# Matches every character that is NOT an ASCII letter, a space, a newline,
# or a period. Compiled once so the page loop does not redo the work.
_DISALLOWED_CHARS = re.compile(r"[^a-zA-Z \n.]")


def clean_text(text):
    """Return *text* with each disallowed character replaced by a space.

    Only ASCII letters, spaces, newlines, and periods are kept; every other
    character (digits, punctuation, accented letters, ...) becomes a single
    space, so the length of the string is preserved.
    """
    return _DISALLOWED_CHARS.sub(" ", text)


# Walk every PDF in the local "pdf" directory and print the cleaned text of
# each page, followed by the same text in lowercase.
for pdf_path in glob.glob("pdf/*.pdf"):
    print(pdf_path)
    if pdf_path.endswith(".pdf"):
        # The context manager guarantees the file handle is closed even if
        # PyPDF2 raises while parsing a page.
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfFileReader(pdf_file)
            # Pages are visited last-to-first, as in the original loop, but
            # the range stops at index 0 so the loop never asks for page -1
            # (which made the last page get processed a second time).
            for page_number in reversed(range(reader.numPages)):
                page_text = reader.getPage(page_number).extractText()
                cleaned = clean_text(page_text)
                print(cleaned)
                # str.lower() returns a new string; the result has to be
                # used, otherwise the text is printed unchanged.
                print(cleaned.lower())
    else:
        # NOTE(review): the glob pattern already restricts matches to
        # "*.pdf", so this branch is presumably only reachable where glob
        # matching is case-insensitive (e.g. "X.PDF") - confirm.
        print("Não tem esse formato!")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement