Advertisement
Guest User

Untitled

a guest
Jul 9th, 2017
107
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.25 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. # This is ugly, so I (Cley Faye) release it as Public Domain.
  3. from subprocess import call
  4. import re
  5.  
  6.  
  7. def splitIntoPages(source, template):
  8.     args = ['pdftk', source, 'burst', 'output', template, 'compress']
  9.     call(args);
  10.  
  11.  
  12. def pdfToText(source):
  13.     args = ['pdftotext', source]
  14.     call(args)
  15.  
  16.  
  17. def grabAllTitles(sources):
  18.     titles = []
  19.  
  20.     REGEXES = [
  21.             (1, re.compile('^Livre [0-9]+(er)? : (?P<TITLE>.*)')),
  22.             (2, re.compile('^Titre [0-9]+(er)? : (?P<TITLE>.*)')),
  23.             (3, re.compile('^(?P<TITLE>Article L.*)'))
  24.             ]
  25.  
  26.     page = 0
  27.     for filename in sources:
  28.         page += 1
  29.         with open(filename, 'r') as infile:
  30.             lines = infile.read().split('\n')
  31.         for line in lines:
  32.             line = line.strip()
  33.             for level, regex in REGEXES:
  34.                 match = regex.match(line)
  35.                 if match:
  36.                     titles.append((page, level, match.group('TITLE')))
  37.                     break
  38.     return titles
  39.  
  40.  
  41. def extractMeta(pdf):
  42.     args = ['pdftk', pdf, 'dump_data', 'output', 'temp.txt']
  43.     call(args)
  44.     meta = []
  45.     with open('temp.txt', 'r') as infile:
  46.         meta += infile.read().split('\n')
  47.     return meta
  48.  
  49.  
  50. def addTitleToMeta(meta, titles):
  51.     for page, level, title in titles:
  52.         meta += [
  53.                 'BookmarkBegin',
  54.                 'BookmarkTitle: %s' % title,
  55.                 'BookmarkLevel: %s' % level,
  56.                 'BookmarkPageNumber: %s' % page
  57.                 ]
  58.  
  59.  
  60. def putMetaIn(pdf, meta, out):
  61.     with open('temp.txt', 'w') as outfile:
  62.         for metaline in meta:
  63.             outfile.write('%s\n' % metaline)
  64.     args = ['pdftk', pdf, 'update_info_utf8', 'temp.txt', 'output', out]
  65.     call(args)
  66.  
  67.  
  68. def main():
  69.     SOURCE="in.pdf"
  70.     OUT="out.pdf"
  71.     TEMPLATE="out_%03d.pdf"
  72.     TEXTTEMPLATE="out_%03d.txt"
  73.     PAGECOUNT=444
  74.     splitIntoPages(SOURCE, TEMPLATE)
  75.     sourceTextFiles = []
  76.     for x in range(PAGECOUNT):
  77.         sourceTextFiles.append(TEXTTEMPLATE % (x + 1))
  78.         pdfToText(TEMPLATE % (x + 1))
  79.     titles = grabAllTitles(sourceTextFiles)
  80.     meta = extractMeta(SOURCE)
  81.     addTitleToMeta(meta, titles)
  82.     putMetaIn(SOURCE, meta, OUT)
  83.  
  84.  
# Entry point: the whole pipeline runs as soon as this file is executed.
# There is no __name__ guard, so importing the module runs it as well.
main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement