TroubleShooter78a

Pdftitle

Apr 8th, 2020
296
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.57 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
  3. """
  4. Extract title from PDF file.
  5. Depends on: pyPdf, PDFMiner.
  6. Usage:
  7.    find . -name "*.pdf" |  xargs -I{} pdftitle -d tmp --rename {}
  8. """
  9.  
  10. import cStringIO
  11. import getopt
  12. import os
  13. import re
  14. import string
  15. import sys
  16.  
  17. from pyPdf import PdfFileReader
  18. from pyPdf.utils import PdfReadError
  19.  
  20. from pdfminer.converter import TextConverter
  21. from pdfminer.layout import LAParams
  22. from pdfminer.pdfinterp import PDFResourceManager, process_pdf, PDFTextExtractionNotAllowed
  23. from pdfminer.pdfparser import PDFSyntaxError
  24.  
  25. __all__ = ['pdf_title']
  26.  
  27. def sanitize(filename):
  28.     """Turn string to valid file name.
  29.    """
  30.     valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
  31.     return ''.join([c for c in filename if c in valid_chars])
  32.  
  33. def meta_title(filename):
  34.     """Title from pdf metadata.
  35.    """
  36.     try:
  37.         docinfo = PdfFileReader(file(filename, 'rb')).getDocumentInfo()
  38.         return docinfo.title if docinfo.title else ""
  39.     except PdfReadError:
  40.         return ""
  41.  
  42. def copyright_line(line):
  43.     """Judge if a line is copyright info.
  44.    """
  45.     return re.search(r'technical\s+report|proceedings|preprint|to\s+appear|submission', line.lower())
  46.  
  47. def empty_str(s):
  48.     return len(s.strip()) == 0
  49.  
  50. def pdf_text(filename):
  51.     try:
  52.         text = cStringIO.StringIO()
  53.         rsrc = PDFResourceManager()
  54.         device = TextConverter(rsrc, text, codec='utf-8', laparams=LAParams())
  55.         process_pdf(rsrc, device, file(filename, 'rb'), None, maxpages=1, password='')
  56.         device.close()
  57.         return text.getvalue()
  58.     except (PDFSyntaxError, PDFTextExtractionNotAllowed):
  59.         return ""
  60.  
  61. def title_start(lines):
  62.     for i, line in enumerate(lines):
  63.         if not empty_str(line) and not copyright_line(line):
  64.             return i;
  65.     return 0
  66.  
  67. def title_end(lines, start, max_lines=2):
  68.     for i, line in enumerate(lines[start+1:start+max_lines+1], start+1):
  69.         if empty_str(line):
  70.             return i
  71.     return start + 1
  72.  
  73. def text_title(filename):
  74.     """Extract title from PDF's text.
  75.    """
  76.     lines = pdf_text(filename).strip().split('\n')
  77.  
  78.     i = title_start(lines)
  79.     j = title_end(lines, i)
  80.  
  81.     return ' '.join(line.strip() for line in lines[i:j])
  82.  
  83. def valid_title(title):
  84.     return not empty_str(title) and empty_str(os.path.splitext(title)[1])
  85.  
  86. def pdf_title(filename):
  87.     title = meta_title(filename)
  88.     if valid_title(title):
  89.         return title
  90.  
  91.     title = text_title(filename)
  92.     if valid_title(title):
  93.         return title
  94.  
  95.     return os.path.basename(os.path.splitext(filename)[0])
  96.  
  97. if __name__ == "__main__":
  98.     opts, args = getopt.getopt(sys.argv[1:], 'nd:', ['dry-run', 'rename'])
  99.  
  100.     dry_run = False
  101.     rename = False
  102.     dir = "."
  103.  
  104.     for opt, arg in opts:
  105.         if opt in ['-n', '--dry-run']:
  106.             dry_run = True
  107.         elif opt in ['--rename']:
  108.             rename = True
  109.         elif opt in ['-d']:
  110.             dir = arg
  111.  
  112.     if len(args) == 0:
  113.         print "Usage: %s [-d output] [--dry-run] [--rename] filenames" % sys.argv[0]
  114.         sys.exit(1)
  115.  
  116.     for filename in args:
  117.         title = pdf_title(filename)
  118.         if rename:
  119.             new_name = os.path.join(dir, sanitize(' '.join(title.split())) + ".pdf")
  120.             print "%s => %s" % (filename, new_name)
  121.             if not dry_run:
  122.                 if os.path.exists(new_name):
  123.                     print "*** Target %s already exists! ***" % new_name
  124.                 else:
  125.                     os.rename(filename, new_name)
  126.         else:
  127.             print title
Add Comment
Please, Sign In to add comment