Guest User

Untitled

a guest
Jan 1st, 2018
1,437
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/bin/bash
  2. # (C) Copyright 2014, Google Inc.
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. # http://www.apache.org/licenses/LICENSE-2.0
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. # This script provides an easy way to execute various phases of training
  14. # Tesseract.  For a detailed description of the phases, see
  15. # https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
  16. #
  17. # USAGE:
  18. #
  19. # tesstrain.sh
  20. #    --fontlist FONTS           # A list of fontnames to train on.
  21. #    --fonts_dir FONTS_PATH     # Path to font files.
  22. #    --lang LANG_CODE           # ISO 639 code.
  23. #    --langdata_dir DATADIR     # Path to tesseract/training/langdata directory.
  24. #    --output_dir OUTPUTDIR     # Location of output traineddata file.
  25. #    --overwrite                # Safe to overwrite files in output_dir.
  26. #    --linedata_only            # Only generate training data for lstmtraining.
  27. #    --run_shape_clustering     # Run shape clustering (use for Indic langs).
  28. #    --exposures EXPOSURES      # A list of exposure levels to use (e.g. "-1 0 1").
  29. #
  30. # OPTIONAL flags for input data. If unspecified we will look for them in
  31. # the langdata_dir directory.
  32. #    --training_text TEXTFILE   # Text to render and use for training.
  33. #    --wordlist WORDFILE        # Word list for the language ordered by
  34. #                               # decreasing frequency.
  35. #    --textlist TEXTFILES       # A list of filenames for text to render.
  36. #                               # Works like fontlist. Overrides training_text.
  37. #
  38. # OPTIONAL flag to specify location of existing traineddata files, required
  39. # during feature extraction. If unspecified will use TESSDATA_PREFIX defined in
  40. # the current environment.
  41. #    --tessdata_dir TESSDATADIR     # Path to tesseract/tessdata directory.
  42. #
  43. # NOTE:
  44. # The font names specified in --fontlist need to be recognizable by Pango using
  45. # fontconfig. An easy way to list the canonical names of all fonts available on
  46. # your system is to run text2image with --list_available_fonts and the
  47. # appropriate --fonts_dir path.
  48.  
  49.  
  # ---------------------------------------------------------------------------
  # Training driver: loads the helper libraries that live in the same directory
  # as this script, parses the command-line flags, and runs the training phases
  # in order.
  #
  # Every expansion below is double-quoted so that paths or values containing
  # whitespace or glob characters are passed through as a single word.
  # ---------------------------------------------------------------------------

  # "$0" is quoted inside the command substitution; otherwise a script path
  # with spaces would be split into several arguments before reaching dirname.
  source "$(dirname "$0")/tesstrain_utils.sh"

  # parse_flags is called without arguments; presumably it reads the ARGV array
  # populated here (confirm against tesstrain_utils.sh).
  ARGV=("$@")
  parse_flags

  mkdir -p "${TRAINING_DIR}"
  tlog "\n=== Starting training for language '${LANG_CODE}'"

  source "$(dirname "$0")/language-specific.sh"
  set_lang_specific_parameters "${LANG_CODE}"

  initialize_fontconfig

  # Image generation and unicharset generation run for both training modes.
  phase_I_generate_image 8
  phase_UP_generate_unicharset

  # A non-zero LINEDATA selects the LSTM line-data path (presumably set by the
  # --linedata_only flag; confirm in parse_flags). Otherwise the full
  # traineddata pipeline runs.
  if (( LINEDATA )); then
    phase_E_extract_features "lstm.train" 8 "lstmf"
    make__lstmdata
  else
    phase_D_generate_dawg
    phase_E_extract_features "box.train" 8 "tr"
    phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto"
    # Shape clustering is optional and only runs when explicitly enabled.
    if [[ "${ENABLE_SHAPE_CLUSTERING}" == "y" ]]; then
      phase_S_cluster_shapes
    fi
    phase_M_cluster_microfeatures
    phase_B_generate_ambiguities
    make__traineddata
  fi

  tlog "\nCompleted training for language '${LANG_CODE}'\n"
RAW Paste Data