Advertisement
Guest User

Untitled

a guest
Jul 19th, 2013
370
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 7.85 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. '''
  4. 1 - Convert PDF to PNG files
  5. 2 - Tesseract PNG files to create HOCR
  6. 3 - Run this program on each individual PNG/HOCR file
  7. 4 - Combine PDF pages
  8.  
  9. '''
  10.  
  11. import logging
  12. import math
  13. import codecs
  14. from reportlab.pdfgen.canvas import Canvas
  15. from reportlab.lib.units import inch
  16. from reportlab.pdfbase.pdfmetrics import getDescent, getFont
  17. import xml.etree.cElementTree as ET
  18. import Image, re, sys
  19.  
  20. draw_image = False  # when true, to_pdf paints the page image onto the PDF canvas
  21. hide_text = False  # when true, to_pdf writes the text with render mode 3 (invisible)
  22.  
  23. draw_line_rect = False  # debug aid: outline the bounding box of every OCR line
  24. draw_word_rect = False  # debug aid: outline the bounding box of every OCR word
  25. bottom_by_line = True  # place word text using the enclosing line's bottom instead of the word's own bbox bottom
  26.  
  27. class HocrConverter():
  28.  """
  29. A class for converting documents to/from the hOCR format.
  30.  
  31. For details of the hOCR format, see:
  32.  
  33. http://docs.google.com/View?docid=dfxcv4vc_67g844kf
  34.  
  35. See also:
  36.  
  37. http://code.google.com/p/hocr-tools/
  38.  
  39. Basic usage:
  40.  
  41. Create a PDF from an hOCR file and an image:
  42.  
  43. hocr = HocrConverter("path/to/hOCR/file")
  44. hocr.to_pdf("path/to/image/file", "path/to/output/file")
  45. """
  46.  
  47. def __init__(self, hocrFileName = None):
  48.  self.hocr = None
  49.  self.xmlns = ''
  50.  self.boxPattern = re.compile('bbox((\s+\d+){4})')
  51.  
  52.  if hocrFileName is not None:
  53.   self.parse_hocr(hocrFileName)
  54.  
  55. def __str__(self):
  56.  """
  57. Return the textual content of the HTML body
  58. """
  59.  if self.hocr is None:
  60.   return ''
  61.  body = self.hocr.find(".//%sbody"%(self.xmlns))
  62.  if body:
  63.   return self._get_element_text(body).encode('utf-8') # XML gives unicode
  64.  else:
  65.   return ''
  66.  
  67. def _get_element_text(self, element):
  68.  """
  69. Return the textual content of the element and its children
  70. """
  71.  text = ''
  72.  if element.text is not None:
  73.   text = text + element.text
  74.  for child in element.getchildren():
  75.   text = text + self._get_element_text(child)
  76.  if element.tail is not None:
  77.   text = text + element.tail
  78.  return text
  79.  
  80. def element_coordinates(self, element):
  81.  """
  82. Returns a tuple containing the coordinates of the bounding box around
  83. an element
  84. """
  85.  if 'title' in element.attrib:
  86.   matches = self.boxPattern.search(element.attrib['title'])
  87.  
  88.  if matches:
  89.   coords = matches.group(1).split()
  90.   return (int(coords[0]),int(coords[1]),int(coords[2]),int(coords[3]))
  91.  return None
  92.  
  93. def parse_hocr(self, hocrFileName):
  94.  """
  95. Reads an XML/XHTML file into an ElementTree object
  96. """
  97.  self.hocr = ET.fromstring(open(hocrFileName, 'r').read())
  98.  #self.hocr.parse(hocrFileName)
  99.  
  100.  # if the hOCR file has a namespace, ElementTree requires its use to find elements
  101.  matches = re.match('({.*})html', self.hocr.tag)
  102.  if matches:
  103.   self.xmlns = matches.group(1)
  104.  else:
  105.   self.xmlns = ''
  106.  
  107. def to_pdf(self, imageFileName, outFileName, fontname="Times-Roman", fontsize=8):
  108.  """
  109. Creates a PDF file with an image superimposed on top of the text.
  110.  
  111. Text is positioned according to the bounding box of the lines in
  112. the hOCR file.
  113.  
  114. The image need not be identical to the image used to create the hOCR file.
  115. It can be scaled, have a lower resolution, different color mode, etc.
  116. """
  117.  if self.hocr is None:
  118.   # warn that no text will be embedded in the output PDF
  119.   print "Warning: No hOCR file specified. PDF will be image-only."
  120.  
  121.  im = Image.open(imageFileName)
  122.  imwidthpx, imheightpx = im.size
  123.  if 'dpi' in im.info:
  124.   width = float(im.size[0])/im.info['dpi'][0]
  125.   height = float(im.size[1])/im.info['dpi'][1]
  126.  else:
  127.   # we have to make a reasonable guess
  128.   # set to None for now and try again using info from hOCR file
  129.   logging.info("No Image DPI Info, get from hOCR File")
  130.   width = height = None
  131.   ocr_dpi = (300, 300) # a default, in case we can't find it
  132.  
  133.  # get dimensions of the OCR, which may not match the image
  134.  if self.hocr is not None:
  135.   for div in self.hocr.findall(".//%sdiv"%(self.xmlns)):
  136.    if div.attrib['class'] == 'ocr_page':
  137.     coords = self.element_coordinates(div)
  138.     ocrwidth = coords[2]-coords[0]
  139.     ocrheight = coords[3]-coords[1]
  140.  
  141.    if width is None:
  142.    # no dpi info with the image
  143.    # assume OCR was done at 300 dpi
  144.     width = ocrwidth/300.0
  145.     height = ocrheight/300.0
  146.     ocr_dpi = (ocrwidth/width, ocrheight/height)
  147.     break # there shouldn't be more than one, and if there is, we don't want it
  148.  
  149.    if width is None:
  150.     # no dpi info with the image, and no help from the hOCR file either
  151.     # this will probably end up looking awful, so issue a warning
  152.     logging.error("DPI unavailable for image %s. Assuming 96 DPI."%(imageFileName))
  153.     width = float(im.size[0])/96
  154.     height = float(im.size[1])/96
  155.  
  156.    # create the PDF file
  157.    pdf = Canvas(outFileName, pagesize=(width*inch, height*inch), pageCompression=1) # page size in points (1/72 in.)
  158.  
  159.    logging.info((width, height))
  160.    # put the image on the page, scaled to fill the page
  161.  
  162.    if draw_image:
  163.     pdf.drawInlineImage(im, 0, 0, width=width*inch, height=height*inch)
  164.  
  165.    if self.hocr is not None:
  166.     for line in self.hocr.findall(".//%sspan"%(self.xmlns)):
  167.      if line.attrib['class'] in {'ocr_line', 'ocrx_line'}:
  168.       # Set the top and bottom of the bounding box for each line
  169.       coords = self.element_coordinates(line)
  170.       if not coords:
  171.        continue
  172.  
  173.      bottom = inch*(height - float(coords[3])/ocr_dpi[1])
  174.      top = inch*(height - float(coords[1])/ocr_dpi[1])
  175.      box_height = top - bottom
  176.      metrics = getDescent(fontname)
  177.      # First guess the fontsize based on box height
  178.      fontsize = max(8, box_height)
  179.      # Adjust the bottom text but the descent amount
  180.      bottom -= getDescent(fontname, fontsize)
  181.      # Now get a more accurate fontsize taking into account the descent
  182.      fontsize = max(8, top - bottom)
  183.  
  184.      if draw_line_rect:
  185.       right_line = inch*float(coords[2])/ocr_dpi[0]
  186.       left_line = inch*float(coords[0])/ocr_dpi[0]
  187.       pdf.rect(left_line, bottom, right_line-left_line, top-bottom)
  188.  
  189.      if line.attrib['class'] in {'ocrx_word', 'ocr_word'}:
  190.       # print line_text.encode('ascii','ignore')
  191.       coords = self.element_coordinates(line)
  192.       if not coords:
  193.        continue
  194.  
  195.      line_text = (u"".join(list(line.itertext()))
  196. ).replace(u"\uFB01",u"fi").replace(u"\uFB02",u"fl")
  197.      #print "".join(list(line.itertext()))
  198.      text = pdf.beginText()
  199.      if hide_text:
  200.       text.setTextRenderMode(3) # invisible
  201.  
  202.      bot_text = inch*(height - float(coords[3])/ocr_dpi[1])
  203.      top_text = inch*(height - float(coords[1])/ocr_dpi[1])
  204.  
  205.      right_text = inch*float(coords[2])/ocr_dpi[0]
  206.      left_text = inch*float(coords[0])/ocr_dpi[0]
  207.  
  208.      # set cursor to bottom left corner of line bbox (adjust for dpi)
  209.      if bottom_by_line:
  210.       text.setTextOrigin(left_text, bottom-(top-bottom-fontsize))
  211.      else:
  212.       fontsize = max(8, math.ceil(top-bottom))
  213.       text.setTextOrigin(left_text, bot_text)
  214.  
  215.      text.setFont(fontname, fontsize)
  216.  
  217.      # scale the width of the text to fill the width of the line's bbox
  218.      stringwidth = pdf.stringWidth(line_text, fontname, fontsize)
  219.      if not stringwidth: stringwidth = 1
  220.  
  221.      horiz_scale= 100.0*(right_text-left_text)/stringwidth
  222.      # print horiz_scale, stringwidth, right_text-left_text
  223.      text.setHorizScale(horiz_scale)
  224.  
  225.      # write the text to the page
  226.      text.textLine(line_text)
  227.      pdf.drawText(text)
  228.  
  229.      if draw_word_rect:
  230.       pdf.setStrokeColorRGB(0.6,0.4,0.9)
  231.       pdf.RoundRect(left_text, bot_text, right_text-left_text, top_text-bot_text, radius=3)
  232.  
  233.      pdf.setStrokeColorRGB(0.0,0.0,0.0)
  234.      # finish up the page and save it
  235.      pdf.showPage()
  236.      pdf.save()
  237.  
  238. def to_text(self, outFileName):
  239.  """
  240. Writes the textual content of the hOCR body to a file.
  241. """
  242.  f = open(outFileName, "w")
  243.  f.write(self.__str__())
  244.  f.close()
  245.  
  246.  if __name__ == "__main__":
  247.   logging.basicConfig(
  248.   level = logging.DEBUG,
  249.   format = '%(asctime)s %(levelname)s %(message)s',
  250.  )
  251.  
  252. if len(sys.argv) < 4:
  253.  print 'Usage: python HocrConverter.py inputHocrFile inputImageFile outputPdfFile'
  254.  sys.exit(1)
  255.  
  256. hocr = HocrConverter(sys.argv[1])
  257. hocr.to_pdf(sys.argv[2], sys.argv[3])
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement