Advertisement
skip420

pdfHighlight_word_grabber

Sep 5th, 2021
1,174
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.01 KB | None | 0 0
  1. #grab_Highlighted_texts_and_its_page_Number_from_PDF
  2. #python3
  3. #python3 highlight.py example.pdf
  4.  
  5. #skip420@skip420:~/Desktop/pdfhighlight$ python3 highlight.py example.pdf
  6. challenge  11
  7. (nutritional  11
  8. legumes,  16
  9. Pritikin  17
  10. 4 annotation(s) found
  11.  
  12.  
  13.  
  14.  
  15.  
  16. #Author:Skip420
  17.  
  18. # python3 highlight.py example.pdf
  19. #Note: Replace "example.pdf" with the name of your own PDF file.
  20.  
  21.  
  22. import popplerqt5 # sudo apt-get install -y python3-poppler-qt5
  23. import sys
  24. import PyQt5
  25. import urllib
  26. import os
  27. import PyPDF2
  28.  
  29.  
  30. def main():
  31.  
  32.     doc = popplerqt5.Poppler.Document.load(sys.argv[1])
  33.     total_annotations = 0
  34.     for x in range(doc.numPages()):
  35.         #print("========= PAGE {} =========".format(i+1))
  36.         page = doc.page(x)
  37.         annotations = page.annotations()
  38.         (pwidth, pheight) = (page.pageSize().width(), page.pageSize().height())
  39.         if len(annotations) > 0:
  40.             for annotation in annotations:
  41.                 if  isinstance(annotation, popplerqt5.Poppler.Annotation):
  42.                     total_annotations += 1
  43.                     if(isinstance(annotation, popplerqt5.Poppler.HighlightAnnotation)):
  44.                         quads = annotation.highlightQuads()
  45.                         txt = ""
  46.                         for quad in quads:
  47.                             rect = (quad.points[0].x() * pwidth,
  48.                                     quad.points[0].y() * pheight,
  49.                                     quad.points[2].x() * pwidth,
  50.                                     quad.points[2].y() * pheight)
  51.                             bdy = PyQt5.QtCore.QRectF()
  52.                             bdy.setCoords(*rect)
  53.                             txt = txt + str(page.text(bdy)) + ' '
  54.  
  55.                         #print("========= ANNOTATION =========")
  56.                         print (txt, x)
  57.  
  58.     if total_annotations > 0:
  59.         print (str(total_annotations) + " annotation(s) found")
  60.     else:
  61.         print ("no annotations found")
  62.  
  63. if __name__ == "__main__":
  64.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement