Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- import os
class PdfParser(object):
    """Split a PDF file into an ordered list of byte chunks.

    The file is cut into ``N 0 obj ... endobj`` objects (as matched by
    ``ro``) and the raw byte runs found between them. Each stored entry is a
    three-item list ``[data, kind, number]``:

    * ``data``   -- the bytes of the chunk,
    * ``kind``   -- ``KIND_RAW``, ``KIND_IMAGE`` or ``KIND_OBJECT``,
    * ``number`` -- the object number, or ``-1`` for raw chunks.

    Writing all chunks back in order reproduces the file, with any chunk
    replaced through ``setData`` substituted in place.

    Limitations visible in the patterns: only objects with generation
    number ``0`` and ``\\n`` line endings are recognised.
    """

    # Raw byte literals (rb'...') so that \d and \s are passed to the regex
    # engine verbatim instead of being invalid string escape sequences.
    r = re.compile(rb'^(\d+) 0 obj', re.MULTILINE)
    ro = re.compile(rb'^(\d+) 0 obj\n.+?\nendobj\n', re.MULTILINE | re.DOTALL)
    ri = re.compile(rb'/Subtype\s*/Image', re.MULTILINE)

    # Chunk kinds; these integer values are also what log() writes out.
    KIND_RAW = 0      # bytes outside any matched object
    KIND_IMAGE = 1    # object whose bytes match ``ri``
    KIND_OBJECT = 2   # any other matched object

    def __init__(self, filename):
        """Remember *filename*; nothing is read until ``parse`` is called."""
        self.__filename = filename
        self.__objects = []

    def __addObject(self, data, kind=KIND_RAW, number=-1):
        """Append one ``[data, kind, number]`` entry to the chunk list."""
        self.__objects.append([data, kind, number])

    def __read(self):
        """Return the whole input file as a ``bytearray``."""
        with open(self.__filename, "rb") as fh:
            return bytearray(fh.read())

    def parse(self):
        """Read the file and rebuild the chunk list from scratch.

        Raises whatever ``open`` raises when the file cannot be read; in
        that case the previously parsed chunks are left untouched.
        """
        data = self.__read()
        # Start from an empty list so that calling parse() more than once
        # does not append a second copy of every chunk.
        self.__objects = []
        pos = 0
        for m in self.ro.finditer(data):
            # Bytes between the previous object and this one are kept as a
            # raw chunk so the file can be written back without loss.
            if pos != m.start():
                self.__addObject(data[pos:m.start()])
            pos = m.end()
            chunk = data[m.start():m.end()]
            kind = self.KIND_IMAGE if self.ri.search(chunk) else self.KIND_OBJECT
            self.__addObject(chunk, kind, int(m.group(1)))
        # Whatever follows the last matched object.
        if len(data) != pos:
            self.__addObject(data[pos:])

    def getImageIndexes(self):
        """Return the chunk-list indexes of all image objects, in file order."""
        return [i for i, entry in enumerate(self.__objects)
                if entry[1] == self.KIND_IMAGE]

    def setData(self, idx, data):
        """Replace the bytes of chunk *idx* with *data*.

        If *data* contains an ``N 0 obj`` header, everything up to and
        including its object number is replaced by the number already
        stored for chunk *idx*, so the chunk keeps its own object number.
        Data without such a header is stored unchanged.

        Raises ``IndexError`` if *idx* is out of range.
        """
        m = self.r.search(data)
        if m:
            number = self.__objects[idx][2]
            data = bytearray(str(number).encode() + data[m.end(1):])
        self.__objects[idx][0] = data

    def getData(self, idx):
        """Return the bytes currently stored for chunk *idx*."""
        return self.__objects[idx][0]

    def write(self, filename):
        """Write all chunks, in order, to *filename* as one binary file."""
        with open(filename, "wb") as fh:
            for entry in self.__objects:
                fh.write(entry[0])

    def log(self, filename):
        """Write one ``kind number length`` line per chunk to *filename*.

        Returns the number of image chunks in the list.
        """
        images = 0
        with open(filename, "wt") as fh:
            for data, kind, number in self.__objects:
                fh.write("{} {} {}\n".format(kind, number, len(data)))
                if kind == self.KIND_IMAGE:
                    images += 1
        return images
- # ------------------------------------------------------------------------------
if __name__ == '__main__':
    # Image objects are taken from source_path and copied into the image
    # objects of target_path; the result and a chunk log are written next
    # to the target under derived names.
    source_path = r"xxx.pdf"
    target_path = r"xxx + text.pdf"
    target_stem, _ = os.path.splitext(target_path)
    output_path = target_stem + ' - replaced.pdf'
    output_stem, _ = os.path.splitext(output_path)
    log_path = output_stem + '.log'

    source = PdfParser(source_path)
    source.parse()
    source_images = source.getImageIndexes()

    target = PdfParser(target_path)
    target.parse()
    target_images = target.getImageIndexes()

    # Images are paired purely by position, so both files must contain the
    # same number of image objects.
    if len(source_images) != len(target_images):
        raise Exception("Images number does not match!")

    for dst_idx, src_idx in zip(target_images, source_images):
        target.setData(dst_idx, source.getData(src_idx))

    target.write(output_path)
    replaced = target.log(log_path)
    print("images replaced:", replaced)
Advertisement
Add Comment
Please, Sign In to add comment