Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- '''
- Typical workflow:
- 1 - Convert the PDF to PNG files
- 2 - Run Tesseract on the PNG files to create hOCR files
- 3 - Run this program on each individual PNG/hOCR file pair
- 4 - Combine the resulting PDF pages
- '''
- import logging
- import math
- import codecs
- from reportlab.pdfgen.canvas import Canvas
- from reportlab.lib.units import inch
- from reportlab.pdfbase.pdfmetrics import getDescent, getFont
- import xml.etree.cElementTree as ET
- import Image, re, sys
# Module-level rendering switches read by HocrConverter.to_pdf().
draw_image = False      # when true, the page image is drawn onto the PDF page
hide_text = False       # when true, text is emitted with text render mode 3 (invisible)
draw_line_rect = False  # when true, a rectangle is drawn around each OCR line box
draw_word_rect = False  # when true, a rounded rectangle is drawn around each OCR word box
bottom_by_line = True   # when true, word origins are derived from the enclosing
                        # line's box instead of each word's own bottom edge
class HocrConverter(object):
    """
    A class for converting documents to/from the hOCR format.

    For details of the hOCR format, see:
    http://docs.google.com/View?docid=dfxcv4vc_67g844kf

    See also:
    http://code.google.com/p/hocr-tools/

    Basic usage:

    Create a PDF from an hOCR file and an image:

        hocr = HocrConverter("path/to/hOCR/file")
        hocr.to_pdf("path/to/image/file", "path/to/output/file")
    """

    # hOCR span classes that to_pdf() treats as text lines and as words.
    _LINE_CLASSES = ('ocr_line', 'ocrx_line')
    _WORD_CLASSES = ('ocrx_word', 'ocr_word')

    def __init__(self, hocrFileName = None):
        """
        Create a converter, optionally parsing hocrFileName straight away.

        With no file name, self.hocr stays None and to_pdf() produces a
        page without any embedded text.
        """
        self.hocr = None
        self.xmlns = ''
        # Matches the "bbox x0 y0 x1 y1" property inside an hOCR title attribute.
        self.boxPattern = re.compile(r'bbox((\s+\d+){4})')
        if hocrFileName is not None:
            self.parse_hocr(hocrFileName)

    def __str__(self):
        """
        Return the textual content of the HTML body.

        Returns '' when no hOCR document is loaded or it has no body element.
        """
        if self.hocr is None:
            return ''
        body = self.hocr.find(".//%sbody" % (self.xmlns))
        # Compare with None explicitly: an Element without children is falsy,
        # so "if body:" would discard a body that only contains text.
        if body is None:
            return ''
        text = self._get_element_text(body)
        # Python 2 hands back unicode for non-ASCII content, which must be
        # encoded for __str__; a native str is returned untouched.
        if not isinstance(text, str):
            text = text.encode('utf-8')
        return text

    def _get_element_text(self, element):
        """
        Return the textual content of the element and its children,
        including the tail text that follows the element itself.
        """
        parts = []
        if element.text is not None:
            parts.append(element.text)
        # Iterate the element directly; Element.getchildren() is deprecated
        # and no longer exists in recent Python versions.
        for child in element:
            parts.append(self._get_element_text(child))
        if element.tail is not None:
            parts.append(element.tail)
        return ''.join(parts)

    def element_coordinates(self, element):
        """
        Returns a tuple (x0, y0, x1, y1) of ints containing the coordinates
        of the bounding box around an element, read from the bbox property
        of its title attribute. Returns None when the element has no title
        or the title carries no bbox.
        """
        matches = self.boxPattern.search(element.attrib.get('title', ''))
        if matches is None:
            return None
        return tuple(int(value) for value in matches.group(1).split())

    def parse_hocr(self, hocrFileName):
        """
        Reads an XML/XHTML file into an ElementTree element (self.hocr) and
        records the document namespace prefix, if any, in self.xmlns.
        """
        # Read bytes so the XML parser applies the document's own encoding
        # declaration, and close the file deterministically.
        with open(hocrFileName, 'rb') as hocr_file:
            self.hocr = ET.fromstring(hocr_file.read())
        # if the hOCR file has a namespace, ElementTree requires its use to find elements
        matches = re.match('({.*})html', self.hocr.tag)
        if matches:
            self.xmlns = matches.group(1)
        else:
            self.xmlns = ''

    def _line_metrics(self, coords, height, ocr_dpi, fontname):
        """
        Return (top, bottom, fontsize) in points for the bounding box coords.

        The font size is first guessed from the box height (never below 8),
        the bottom edge is then shifted by the font's descent at that size,
        and the font size is recomputed from the adjusted height.
        """
        bottom = inch*(height - float(coords[3])/ocr_dpi[1])
        top = inch*(height - float(coords[1])/ocr_dpi[1])
        fontsize = max(8, top - bottom)
        bottom -= getDescent(fontname, fontsize)
        fontsize = max(8, top - bottom)
        return top, bottom, fontsize

    def to_pdf(self, imageFileName, outFileName, fontname="Times-Roman", fontsize=8):
        """
        Creates a single-page PDF from the image and the parsed hOCR text.

        Text is positioned according to the bounding boxes in the hOCR file;
        each word is horizontally scaled to fill its own box. Whether the
        image is drawn, the text is invisible, and debugging rectangles are
        drawn is controlled by the module-level flags draw_image, hide_text,
        draw_line_rect, draw_word_rect and bottom_by_line.

        The image need not be identical to the image used to create the hOCR file.
        It can be scaled, have a lower resolution, different color mode, etc.

        imageFileName -- path of the page image
        outFileName   -- path of the PDF to write
        fontname      -- name of the font used for the text
        fontsize      -- kept for interface compatibility; the size actually
                         used is derived from the box heights (minimum 8)
        """
        if self.hocr is None:
            # warn that no text will be embedded in the output PDF
            print("Warning: No hOCR file specified. PDF will be image-only.")

        im = Image.open(imageFileName)
        if 'dpi' in im.info:
            width = float(im.size[0])/im.info['dpi'][0]
            height = float(im.size[1])/im.info['dpi'][1]
        else:
            # we have to make a reasonable guess
            # set to None for now and try again using info from hOCR file
            logging.info("No Image DPI Info, get from hOCR File")
            width = height = None

        ocr_dpi = (300, 300) # a default, in case we can't find it

        # get dimensions of the OCR, which may not match the image
        if self.hocr is not None:
            for div in self.hocr.findall(".//%sdiv" % (self.xmlns)):
                # .get(): divs without a class attribute are simply skipped
                if div.attrib.get('class') != 'ocr_page':
                    continue
                coords = self.element_coordinates(div)
                if coords:
                    ocrwidth = coords[2]-coords[0]
                    ocrheight = coords[3]-coords[1]
                    if width is None:
                        # no dpi info with the image
                        # assume OCR was done at 300 dpi
                        width = ocrwidth/300.0
                        height = ocrheight/300.0
                    ocr_dpi = (ocrwidth/width, ocrheight/height)
                else:
                    # a page div without a bbox gives no size information;
                    # fall through to the defaults instead of crashing
                    logging.warning("ocr_page element has no bbox; using default DPI")
                break # there shouldn't be more than one, and if there is, we don't want it

        if width is None:
            # no dpi info with the image, and no help from the hOCR file either
            # this will probably end up looking awful, so issue a warning
            logging.error("DPI unavailable for image %s. Assuming 96 DPI.", imageFileName)
            width = float(im.size[0])/96
            height = float(im.size[1])/96

        # create the PDF file
        pdf = Canvas(outFileName, pagesize=(width*inch, height*inch), pageCompression=1) # page size in points (1/72 in.)
        logging.info((width, height))

        # put the image on the page, scaled to fill the page
        if draw_image:
            pdf.drawInlineImage(im, 0, 0, width=width*inch, height=height*inch)

        if self.hocr is not None:
            # Metrics of the most recent line span; the word spans that
            # follow it in document order are laid out against them.
            line_metrics = None
            for line in self.hocr.findall(".//%sspan" % (self.xmlns)):
                span_class = line.attrib.get('class')

                if span_class in self._LINE_CLASSES:
                    coords = self.element_coordinates(line)
                    if not coords:
                        continue
                    line_metrics = self._line_metrics(coords, height, ocr_dpi, fontname)
                    if draw_line_rect:
                        top, bottom, _ = line_metrics
                        right_line = inch*float(coords[2])/ocr_dpi[0]
                        left_line = inch*float(coords[0])/ocr_dpi[0]
                        pdf.rect(left_line, bottom, right_line-left_line, top-bottom)

                elif span_class in self._WORD_CLASSES:
                    coords = self.element_coordinates(line)
                    if not coords:
                        continue
                    # expand the fi / fl ligatures into plain letters
                    line_text = (u"".join(line.itertext())
                                 ).replace(u"\uFB01", u"fi").replace(u"\uFB02", u"fl")

                    # A word that appears before any line span has no line
                    # metrics yet; treat its own box as the line.
                    top, bottom, fontsize = (
                        line_metrics
                        or self._line_metrics(coords, height, ocr_dpi, fontname))

                    text = pdf.beginText()
                    if hide_text:
                        text.setTextRenderMode(3) # invisible

                    bot_text = inch*(height - float(coords[3])/ocr_dpi[1])
                    top_text = inch*(height - float(coords[1])/ocr_dpi[1])
                    right_text = inch*float(coords[2])/ocr_dpi[0]
                    left_text = inch*float(coords[0])/ocr_dpi[0]

                    # set cursor to bottom left corner of line bbox (adjust for dpi)
                    if bottom_by_line:
                        text.setTextOrigin(left_text, bottom-(top-bottom-fontsize))
                    else:
                        fontsize = max(8, math.ceil(top-bottom))
                        text.setTextOrigin(left_text, bot_text)
                    text.setFont(fontname, fontsize)

                    # scale the width of the text to fill the width of the word's bbox
                    stringwidth = pdf.stringWidth(line_text, fontname, fontsize)
                    if not stringwidth:
                        stringwidth = 1 # avoid dividing by zero for empty text
                    horiz_scale = 100.0*(right_text-left_text)/stringwidth
                    text.setHorizScale(horiz_scale)

                    # write the text to the page
                    text.textLine(line_text)
                    pdf.drawText(text)

                    if draw_word_rect:
                        pdf.setStrokeColorRGB(0.6, 0.4, 0.9)
                        # the Canvas method is roundRect (lower-case r)
                        pdf.roundRect(left_text, bot_text, right_text-left_text, top_text-bot_text, radius=3)
                        pdf.setStrokeColorRGB(0.0, 0.0, 0.0)

        # finish up the page and save it
        pdf.showPage()
        pdf.save()

    def to_text(self, outFileName):
        """
        Writes the textual content of the hOCR body to a file.
        """
        with open(outFileName, "w") as out_file:
            out_file.write(self.__str__())
if __name__ == "__main__":
    # Script entry point: HocrConverter.py inputHocrFile inputImageFile outputPdfFile
    logging.basicConfig(
        format='%(asctime)s %(levelname)s %(message)s',
        level=logging.DEBUG,
    )

    # Script name plus three positional arguments are required;
    # anything after the third argument is ignored.
    if not len(sys.argv) >= 4:
        print('Usage: python HocrConverter.py inputHocrFile inputImageFile outputPdfFile')
        sys.exit(1)

    hocr = HocrConverter(sys.argv[1])
    hocr.to_pdf(*sys.argv[2:4])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement