zdenop

tesseract-ocr C-API with file via ctypes in python

Nov 21st, 2013
445
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3.  
  4. # Copyright 2012-2013 Zdenko Podobný
  5. # Author: Zdenko Podobný
  6. #
  7. # Licensed under the Apache License, Version 2.0 (the "License");
  8. # you may not use this file except in compliance with the License.
  9. # You may obtain a copy of the License at
  10. #
  11. #      http://www.apache.org/licenses/LICENSE-2.0
  12. #
  13. # Unless required by applicable law or agreed to in writing, software
  14. # distributed under the License is distributed on an "AS IS" BASIS,
  15. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. # See the License for the specific language governing permissions and
  17. # limitations under the License.
  18.  
  19. """
  20. Simple python demo script of tesseract-ocr 3.02 c-api and filehandle
  21. """
  22.  
  23. import os
  24. import sys
  25. import ctypes
  26. from ctypes import pythonapi, util, py_object
  27.  
  28. # Demo variables
  29. lang = "eng"
  30. output = "dump.config"
  31. filename = "../phototest.tif"
  32. libpath = "/usr/local/lib64/"
  33. libpath_w = "../vs2008/DLL_Release/"
  34. tessdata = "/usr/src/tesseract-ocr/"
  35.  
  36. if sys.platform == "win32":
  37.     libname = libpath_w + "libtesseract302.dll"
  38.     libname_alt = "libtesseract302.dll"
  39.     os.environ["PATH"] += os.pathsep + libpath_w
  40. else:
  41.     libname = libpath + "libtesseract.so.3.0.2"
  42.     libname_alt = "libtesseract.so.3"
  43.  
  44. try:
  45.     tesseract = ctypes.cdll.LoadLibrary(libname)
  46. except:
  47.     try:
  48.         tesseract = ctypes.cdll.LoadLibrary(libname_alt)
  49.     except WindowsError, err:
  50.         print("Trying to load '%s'..." % libname)
  51.         print("Trying to load '%s'..." % libname_alt)
  52.         print(err)
  53.         exit(1)
  54.  
  55. tesseract.TessVersion.restype = ctypes.c_char_p
  56. tesseract_version = tesseract.TessVersion()
  57.  
  58. # We need to check library version because libtesseract.so.3 is symlink
  59. # and can point to other version than 3.02
  60. if float(tesseract_version) < 3.02:
  61.     print("Found tesseract-ocr library version %s." % tesseract_version)
  62.     print("C-API is present only in version 3.02!")
  63.     exit(2)
  64.  
  65. api = tesseract.TessBaseAPICreate()
  66.  
  67. rc = tesseract.TessBaseAPIInit3(api, tessdata, lang);
  68. if (rc):
  69.     tesseract.TessBaseAPIDelete(api)
  70.     print("Could not initialize tesseract.\n")
  71.     exit(3)
  72.  
  73. fh = open(output,'wb')
  74. PyFile_AsFile = pythonapi.PyFile_AsFile
  75. PyFile_AsFile.argtypes = [ctypes.py_object]
  76. PyFile_AsFile.restype = ctypes.c_void_p
  77.  
  78. tesseract.TessBaseAPIPrintVariables(api, PyFile_AsFile(fh));
  79. fh.close()
RAW Paste Data