Guest User

PDFParser - images replacer

a guest
Sep 2nd, 2018
73
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.37 KB | None | 0 0
  1. import re
  2. import os
  3.  
  4. class PdfParser(object):
  5.   r = re.compile(b'^(\d+) 0 obj', re.MULTILINE)
  6.   ro = re.compile(b'^(\d+) 0 obj\n.+?\nendobj\n', re.MULTILINE | re.DOTALL)
  7.   ri = re.compile(b'/Subtype\s*/Image', re.MULTILINE)
  8.    
  9.   def __init__(self, filename):
  10.     self.__filename = filename
  11.     self.__objects = []
  12.    
  13.   def __addObject(self, data, type=0, number=-1):
  14.     self.__objects.append( [data, type, number] )
  15.    
  16.   def __read(self):
  17.     with open(self.__filename, "rb") as fh:
  18.       return bytearray(fh.read())
  19.  
  20.   def parse(self):
  21.     p, data = 0, self.__read()
  22.     for m in re.finditer(__class__.ro, data):
  23.       if p != m.start(): self.__addObject( data[p:m.start()] )
  24.       p = m.end()
  25.       t = data[m.start():m.end()]
  26.       type = 1 if __class__.ri.search(t) else 2
  27.       self.__addObject( t, type, int(m.group(1)) )
  28.     if len(data) != p: self.__addObject(data[p:len(data)] )
  29.  
  30.   def getImageIndexes(self):
  31.     idxs = []
  32.     for i, r in enumerate(self.__objects):
  33.       if r[1] == 1: idxs.append(i)
  34.     return idxs
  35.  
  36.   def setData(self, idx, data):
  37.     m = __class__.r.search(data)
  38.     if m:
  39.       number = self.__objects[idx][2]
  40.       data = bytearray(str(number).encode() + data[m.end(1):])
  41.     self.__objects[idx][0] = data
  42.  
  43.   def getData(self, idx):
  44.     return self.__objects[idx][0]
  45.  
  46.   def write(self, filename):
  47.     with open(filename, "wb") as fh:
  48.       for r in self.__objects:
  49.         fh.write(r[0])
  50.  
  51.   def log(self, filename):
  52.     cnt = 0
  53.     with open(filename, "wt") as fh:
  54.       for r in self.__objects:
  55.         fh.write( "{} {} {}\n".format(r[1], r[2], len(r[0])) )
  56.         if r[1] == 1: cnt += 1
  57.     return(cnt)
  58.  
  59. # ------------------------------------------------------------------------------
  60. if __name__ == '__main__':
  61.   fn_img = r"xxx.pdf"
  62.   fn_txt = r"xxx + text.pdf"
  63.   fn_rep = os.path.splitext(fn_txt)[0] + ' - replaced.pdf'
  64.   fn_log = os.path.splitext(fn_rep)[0] + '.log'
  65.  
  66.   pdf_img = PdfParser(fn_img)
  67.   pdf_img.parse()
  68.   idxs_img = pdf_img.getImageIndexes()
  69.  
  70.   pdf_txt = PdfParser(fn_txt)
  71.   pdf_txt.parse()
  72.   idxs_txt = pdf_txt.getImageIndexes()
  73.  
  74.   if len(idxs_img) != len(idxs_txt):
  75.     raise Exception("Images number does not match!")
  76.  
  77.   for i,j in zip(idxs_txt, idxs_img):
  78.     pdf_txt.setData(i, pdf_img.getData(j))
  79.  
  80.   pdf_txt.write(fn_rep)
  81.   cnt = pdf_txt.log(fn_log)
  82.   print("images replaced:", cnt)
Advertisement
Add Comment
Please, Sign In to add comment