# Untitled

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Typical workflow:

1 - Convert the PDF to PNG files
2 - Run Tesseract on the PNG files to create hOCR files
3 - Run this program on each individual PNG/hOCR pair
4 - Combine the resulting PDF pages

'''

import logging
import math
import codecs
from reportlab.pdfgen.canvas import Canvas
from reportlab.lib.units import inch
from reportlab.pdfbase.pdfmetrics import getDescent, getFont
import xml.etree.cElementTree as ET
import Image, re, sys

# Rendering switches consulted by to_pdf() each time it runs.

# Outline the bounding box of every OCR line / every OCR word on the page.
draw_line_rect = False
draw_word_rect = False
# Take the text baseline from the enclosing line's box instead of the word's.
bottom_by_line = True

# Paint the page image onto the PDF page.
draw_image = False
# Emit the OCR text in the invisible text render mode.
hide_text = False

class HocrConverter(object):
    """
    A class for converting documents to/from the hOCR format.

    For details of the hOCR format, see:

        http://docs.google.com/View?docid=dfxcv4vc_67g844kf

    See also:

        http://code.google.com/p/hocr-tools/

    Basic usage:

    Create a PDF from an hOCR file and an image:

        hocr = HocrConverter("path/to/hOCR/file")
        hocr.to_pdf("path/to/image/file", "path/to/output/file")
    """

    # hOCR class names that mark a text line / a single word.
    _LINE_CLASSES = frozenset(('ocr_line', 'ocrx_line'))
    _WORD_CLASSES = frozenset(('ocrx_word', 'ocr_word'))

    def __init__(self, hocrFileName=None):
        """
        Create a converter, optionally loading an hOCR file right away.

        hocrFileName -- path of the hOCR file to parse, or None to start
                        without any OCR data.
        """
        self.hocr = None
        self.xmlns = ''
        # Captures the four integers that follow "bbox" in a title attribute.
        self.boxPattern = re.compile(r'bbox((\s+\d+){4})')

        if hocrFileName is not None:
            self.parse_hocr(hocrFileName)

    def __str__(self):
        """
        Return the textual content of the HTML body.

        Returns '' when no hOCR document is loaded or it has no body.
        """
        if self.hocr is None:
            return ''
        body = self.hocr.find(".//%sbody" % (self.xmlns))
        # Compare with None: an Element without children is falsy, so a
        # plain truth test would discard a body that only contains text.
        if body is None:
            return ''
        text = self._get_element_text(body)
        if str is bytes:
            # Python 2: __str__ must return a byte string; XML gives unicode.
            return text.encode('utf-8')
        return text

    def _get_element_text(self, element):
        """
        Return the textual content of the element and its children,
        followed by the element's own tail text.
        """
        # itertext() yields the element's text plus the text and tail of
        # every descendant in document order; only the element's own tail
        # has to be appended.
        return ''.join(element.itertext()) + (element.tail or '')

    def element_coordinates(self, element):
        """
        Returns a tuple containing the coordinates of the bounding box around
        an element, as the four integers following "bbox" in its title
        attribute.

        Returns None when the element has no title attribute or the title
        holds no bbox.
        """
        title = element.attrib.get('title')
        if title is None:
            return None
        matches = self.boxPattern.search(title)
        if matches is None:
            return None
        return tuple(int(value) for value in matches.group(1).split())

    def parse_hocr(self, hocrFileName):
        """
        Reads an XML/XHTML file into an ElementTree element (self.hocr) and
        records the namespace prefix of its root in self.xmlns.
        """
        # Hand the parser raw bytes so it applies the encoding declared in
        # the document itself; the with-block closes the file promptly.
        with open(hocrFileName, 'rb') as hocr_file:
            self.hocr = ET.fromstring(hocr_file.read())

        # if the hOCR file has a namespace, ElementTree requires its use to find elements
        matches = re.match(r'(\{.*\})html', self.hocr.tag)
        if matches:
            self.xmlns = matches.group(1)
        else:
            self.xmlns = ''

    def to_pdf(self, imageFileName, outFileName, fontname="Times-Roman", fontsize=8):
        """
        Creates a single-page PDF file holding the OCR text and, when the
        module flag draw_image is set, the page image.

        Text is positioned according to the bounding boxes in the hOCR file.

        The image need not be identical to the image used to create the hOCR file.
        It can be scaled, have a lower resolution, different color mode, etc.

        imageFileName -- image opened to obtain the page size (and drawn
                         when draw_image is set).
        outFileName   -- path of the PDF to write.
        fontname      -- font used for the text.
        fontsize      -- kept for interface compatibility; the size actually
                         used is derived from the bounding-box height, with
                         a minimum of 8.
        """
        if self.hocr is None:
            # warn that no text will be embedded in the output PDF
            print("Warning: No hOCR file specified. PDF will be image-only.")

        im = Image.open(imageFileName)
        width, height, ocr_dpi = self._page_geometry(im, imageFileName)

        # create the PDF file; page size in points (1/72 in.)
        pdf = Canvas(outFileName, pagesize=(width * inch, height * inch), pageCompression=1)
        logging.info((width, height))

        # put the image on the page, scaled to fill the page
        if draw_image:
            pdf.drawInlineImage(im, 0, 0, width=width * inch, height=height * inch)

        if self.hocr is not None:
            self._draw_text_layer(pdf, height, ocr_dpi, fontname)

        # finish up the page and save it, even for an image-only PDF
        pdf.showPage()
        pdf.save()

    def _page_geometry(self, im, imageFileName):
        """
        Return (width, height, ocr_dpi): the page size in inches and the
        (x, y) resolution used to convert hOCR pixel coordinates to inches.
        """
        dpi = im.info.get('dpi')
        # A zero DPI in the image metadata is treated like a missing one
        # rather than dividing by it.
        if dpi and dpi[0] and dpi[1]:
            width = float(im.size[0]) / dpi[0]
            height = float(im.size[1]) / dpi[1]
        else:
            # we have to make a reasonable guess
            # set to None for now and try again using info from hOCR file
            logging.info("No Image DPI Info, get from hOCR File")
            width = height = None

        ocr_dpi = (300, 300)  # a default, in case we can't find it

        # get dimensions of the OCR, which may not match the image
        if self.hocr is not None:
            for div in self.hocr.findall(".//%sdiv" % (self.xmlns)):
                if div.attrib.get('class') != 'ocr_page':
                    continue
                coords = self.element_coordinates(div)
                if coords is None:
                    continue
                ocrwidth = coords[2] - coords[0]
                ocrheight = coords[3] - coords[1]
                if ocrwidth <= 0 or ocrheight <= 0:
                    # an empty page box cannot be used to derive a resolution
                    continue
                if width is None:
                    # no dpi info with the image
                    # assume OCR was done at 300 dpi
                    width = ocrwidth / 300.0
                    height = ocrheight / 300.0
                ocr_dpi = (ocrwidth / width, ocrheight / height)
                break  # there shouldn't be more than one, and if there is, we don't want it

        if width is None:
            # no dpi info with the image, and no help from the hOCR file either
            # this will probably end up looking awful, so issue a warning
            logging.error("DPI unavailable for image %s. Assuming 96 DPI." % (imageFileName))
            width = float(im.size[0]) / 96
            height = float(im.size[1]) / 96

        return width, height, ocr_dpi

    def _draw_text_layer(self, pdf, height, ocr_dpi, fontname):
        """
        Write every hOCR word onto the canvas, stretched horizontally to
        fill its bounding box.
        """
        line_top = line_bottom = None
        fontsize = 8

        for span in self.hocr.findall(".//%sspan" % (self.xmlns)):
            css_class = span.attrib.get('class')

            if css_class in self._LINE_CLASSES:
                # Set the top and bottom of the bounding box for each line
                coords = self.element_coordinates(span)
                if not coords:
                    continue

                line_bottom = inch * (height - float(coords[3]) / ocr_dpi[1])
                line_top = inch * (height - float(coords[1]) / ocr_dpi[1])
                # First guess the fontsize based on box height
                fontsize = max(8, line_top - line_bottom)
                # Adjust the bottom of the text by the descent amount
                line_bottom -= getDescent(fontname, fontsize)
                # Now get a more accurate fontsize taking into account the descent
                fontsize = max(8, line_top - line_bottom)

                if draw_line_rect:
                    right_line = inch * float(coords[2]) / ocr_dpi[0]
                    left_line = inch * float(coords[0]) / ocr_dpi[0]
                    pdf.rect(left_line, line_bottom, right_line - left_line, line_top - line_bottom)

            elif css_class in self._WORD_CLASSES:
                coords = self.element_coordinates(span)
                if not coords:
                    continue

                # Replace the fi/fl ligature characters with plain letters.
                line_text = (u"".join(span.itertext())
                             ).replace(u"\uFB01", u"fi").replace(u"\uFB02", u"fl")

                bot_text = inch * (height - float(coords[3]) / ocr_dpi[1])
                top_text = inch * (height - float(coords[1]) / ocr_dpi[1])
                right_text = inch * float(coords[2]) / ocr_dpi[0]
                left_text = inch * float(coords[0]) / ocr_dpi[0]

                if line_bottom is None:
                    # Word seen before any usable line: fall back to the
                    # word's own box for the baseline and the font size.
                    ref_top, ref_bottom = top_text, bot_text
                    fontsize = max(8, ref_top - ref_bottom)
                else:
                    ref_top, ref_bottom = line_top, line_bottom

                text = pdf.beginText()
                if hide_text:
                    text.setTextRenderMode(3)  # invisible

                # set cursor to bottom left corner of the bbox (adjust for dpi)
                if bottom_by_line:
                    text.setTextOrigin(left_text, ref_bottom - (ref_top - ref_bottom - fontsize))
                else:
                    fontsize = max(8, math.ceil(ref_top - ref_bottom))
                    text.setTextOrigin(left_text, bot_text)

                text.setFont(fontname, fontsize)

                # scale the width of the text to fill the width of the word's bbox;
                # a zero width (e.g. empty text) is replaced by 1 to avoid dividing by zero
                stringwidth = pdf.stringWidth(line_text, fontname, fontsize) or 1
                text.setHorizScale(100.0 * (right_text - left_text) / stringwidth)

                # write the text to the page
                text.textLine(line_text)
                pdf.drawText(text)

                if draw_word_rect:
                    pdf.setStrokeColorRGB(0.6, 0.4, 0.9)
                    pdf.roundRect(left_text, bot_text, right_text - left_text, top_text - bot_text, radius=3)
                    # restore the default stroke colour for later drawing
                    pdf.setStrokeColorRGB(0.0, 0.0, 0.0)

    def to_text(self, outFileName):
        """
        Writes the textual content of the hOCR body to a file, encoded as
        UTF-8.
        """
        content = self.__str__()
        if not isinstance(content, bytes):
            content = content.encode('utf-8')
        # Binary mode: the bytes are written exactly as encoded.
        with open(outFileName, "wb") as out_file:
            out_file.write(content)


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(message)s',
    )

    if len(sys.argv) < 4:
        print('Usage: python HocrConverter.py inputHocrFile inputImageFile outputPdfFile')
        sys.exit(1)

    hocr = HocrConverter(sys.argv[1])
    hocr.to_pdf(sys.argv[2], sys.argv[3])