Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- # This is ugly, so I (Cley Faye) release it as Public Domain.
- from subprocess import call
- import re
def splitIntoPages(source, template):
    """Burst a PDF into separate files by shelling out to ``pdftk``.

    Runs ``pdftk <source> burst output <template> compress``.

    Args:
        source: Path of the PDF passed to pdftk as its input.
        template: File-name pattern passed to pdftk after ``output``
            (main() supplies a printf-style pattern, ``out_%03d.pdf``).

    Returns:
        None. The exit status returned by ``call`` is discarded.
    """
    call([
        'pdftk', source,
        'burst',
        'output', template,
        'compress',
    ])
def pdfToText(source):
    """Convert one PDF to text by running ``pdftotext <source>``.

    No output name is given on the command line. main() afterwards reads
    ``out_NNN.txt`` for each ``out_NNN.pdf``, so pdftotext is presumably
    expected to write the text next to the PDF with a ``.txt`` extension.

    Args:
        source: Path of the PDF file to convert.

    Returns:
        None. The exit status returned by ``call`` is discarded.
    """
    command = ['pdftotext']
    command.append(source)
    call(command)
def grabAllTitles(sources, encoding='utf-8'):
    """Scan per-page text files and collect the headings found in them.

    Every line is stripped of surrounding whitespace and tried against
    three patterns, in this order; the first one that matches wins:

    * level 1: ``Livre <n>[er] : <title>`` -- title is the text after " : "
    * level 2: ``Titre <n>[er] : <title>`` -- title is the text after " : "
    * level 3: a line starting with ``Article L`` -- the whole line is the
      title

    Args:
        sources: Paths of the text files, one per page, in page order. The
            1-based position of a file in this sequence is used as its
            page number.
        encoding: Encoding used to decode the text files. Defaults to
            UTF-8 instead of the platform locale so accented titles decode
            the same way everywhere. NOTE(review): this assumes the
            pdftotext in use emits UTF-8 (the poppler default); pass
            another encoding if your build is configured differently.

    Returns:
        A list of ``(page, level, title)`` tuples in the order the
        headings were encountered.
    """
    patterns = (
        (1, re.compile(r'^Livre [0-9]+(er)? : (?P<TITLE>.*)')),
        (2, re.compile(r'^Titre [0-9]+(er)? : (?P<TITLE>.*)')),
        (3, re.compile(r'^(?P<TITLE>Article L.*)')),
    )
    titles = []
    for page, filename in enumerate(sources, start=1):
        with open(filename, 'r', encoding=encoding) as infile:
            for raw_line in infile:
                line = raw_line.strip()
                for level, regex in patterns:
                    match = regex.match(line)
                    if match:
                        titles.append((page, level, match.group('TITLE')))
                        # A line yields at most one heading.
                        break
    return titles
def extractMeta(pdf):
    """Dump the metadata of *pdf* with pdftk and return it as text lines.

    Runs ``pdftk <pdf> dump_data_utf8 output temp.txt`` and then reads
    ``temp.txt`` (a relative path, so it lives in the current working
    directory) back in.

    Args:
        pdf: Path of the PDF whose metadata is dumped.

    Returns:
        A list of strings obtained by splitting the dump on ``'\\n'``.
        When the dump ends with a newline the last element is an empty
        string.

    The exit status of pdftk is not checked.
    """
    # dump_data_utf8 rather than dump_data: plain dump_data writes non-ASCII
    # characters as XML numeric entities, while putMetaIn() feeds the lines
    # back through update_info_utf8, which expects raw UTF-8 text.
    call(['pdftk', pdf, 'dump_data_utf8', 'output', 'temp.txt'])
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open('temp.txt', 'r', encoding='utf-8') as infile:
        return infile.read().split('\n')
def addTitleToMeta(meta, titles):
    """Append a four-line bookmark record to *meta* for every title.

    For each ``(page, level, title)`` entry the lines ``BookmarkBegin``,
    ``BookmarkTitle: <title>``, ``BookmarkLevel: <level>`` and
    ``BookmarkPageNumber: <page>`` are added, in that order.

    Args:
        meta: List of metadata lines; it is extended in place.
        titles: Iterable of ``(page, level, title)`` tuples.

    Returns:
        None. The result is the mutation of *meta*.
    """
    for page_number, depth, heading in titles:
        meta.extend((
            'BookmarkBegin',
            'BookmarkTitle: %s' % heading,
            'BookmarkLevel: %s' % depth,
            'BookmarkPageNumber: %s' % page_number,
        ))
def putMetaIn(pdf, meta, out):
    """Write *meta* to ``temp.txt`` and apply it to *pdf* with pdftk.

    Each item of *meta* is written on its own line to ``temp.txt`` (a
    relative path, so it lives in the current working directory), then
    ``pdftk <pdf> update_info_utf8 temp.txt output <out>`` is run.

    Args:
        pdf: Path of the input PDF.
        meta: Iterable of metadata lines, without trailing newlines.
        out: Path of the PDF that pdftk is asked to write.

    Returns:
        None. The exit status of pdftk is not checked.
    """
    # The file is handed to update_info_utf8, so it must really be UTF-8;
    # encode explicitly instead of relying on the platform's default.
    with open('temp.txt', 'w', encoding='utf-8') as outfile:
        outfile.writelines('%s\n' % metaline for metaline in meta)
    call(['pdftk', pdf, 'update_info_utf8', 'temp.txt', 'output', out])
def main():
    """Add bookmarks to ``in.pdf`` and write the result to ``out.pdf``.

    All inputs are hard-coded. Steps:

    1. Burst ``in.pdf`` into files named after ``out_%03d.pdf``.
    2. Run pdfToText on pages 1..444 and record the matching
       ``out_%03d.txt`` names.
    3. Collect headings from those text files.
    4. Dump the metadata of ``in.pdf``, append one bookmark per heading,
       and write ``out.pdf`` with the combined metadata.
    """
    source_pdf = "in.pdf"
    output_pdf = "out.pdf"
    page_pdf_pattern = "out_%03d.pdf"
    page_text_pattern = "out_%03d.txt"
    page_count = 444  # number of pages expected in the source PDF

    splitIntoPages(source_pdf, page_pdf_pattern)

    text_files = []
    for number in range(1, page_count + 1):
        text_files.append(page_text_pattern % number)
        pdfToText(page_pdf_pattern % number)

    headings = grabAllTitles(text_files)
    metadata = extractMeta(source_pdf)
    addTitleToMeta(metadata, headings)
    putMetaIn(source_pdf, metadata, output_pdf)


# Executed unconditionally: there is no __main__ guard, so importing this
# module runs the whole pipeline as well.
main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement