Guest User

Untitled

a guest
Jan 1st, 2018
1,015
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/bin/bash
  2. # (C) Copyright 2014, Google Inc.
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. # http://www.apache.org/licenses/LICENSE-2.0
  7. # Unless required by applicable law or agreed to in writing, software
  8. # distributed under the License is distributed on an "AS IS" BASIS,
  9. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. # See the License for the specific language governing permissions and
  11. # limitations under the License.
  12. #
  13. # This script defines functions that are used by tesstrain.sh
  14. # For a detailed description of the phases, see
  15. # https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
  16. #
  17. # USAGE: source tesstrain_utils.sh
  18.  
  19. if [ "$(uname)" == "Darwin" ];then
  20.     FONTS_DIR="/Library/Fonts/"
  21. else
  22.     FONTS_DIR="/usr/share/fonts/"
  23. fi
  24. OUTPUT_DIR="/tmp/tesstrain/tessdata"
  25. OVERWRITE=0
  26. LINEDATA=0
  27. RUN_SHAPE_CLUSTERING=0
  28. EXTRACT_FONT_PROPERTIES=1
  29. WORKSPACE_DIR=$(mktemp -d)
  30.  
  31. # Logging helper functions.
  32. tlog() {
  33.     echo -e $* 2>&1 1>&2 | tee -a ${LOG_FILE}
  34. }
  35.  
  36. err_exit() {
  37.     echo -e "ERROR: "$* 2>&1 1>&2 | tee -a ${LOG_FILE}
  38.     exit 1
  39. }
  40.  
  41. # Helper function to run a command and append its output to a log. Aborts early
  42. # if the program file is not found.
  43. # Usage: run_command CMD ARG1 ARG2...
  44. run_command() {
  45.     local cmd=$(which $1)
  46.     if [[ -z ${cmd} ]]; then
  47.       for d in api training; do
  48.         cmd=$(which $d/$1)
  49.         if [[ ! -z ${cmd} ]]; then
  50.           break
  51.         fi
  52.       done
  53.       if [[ -z ${cmd} ]]; then
  54.           err_exit "$1 not found"
  55.       fi
  56.     fi
  57.     shift
  58.     tlog "[$(date)] ${cmd} $@"
  59.     "${cmd}" "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
  60.     # check completion status
  61.     if [[ $? -gt 0 ]]; then
  62.         err_exit "Program $(basename ${cmd}) failed. Abort."
  63.     fi
  64. }
  65.  
  66. # Check if all the given files exist, or exit otherwise.
  67. # Used to check required input files and produced output files in each phase.
  68. # Usage: check_file_readable FILE1 FILE2...
  69. check_file_readable() {
  70.     for file in $@; do
  71.         if [[ ! -r ${file} ]]; then
  72.             err_exit "${file} does not exist or is not readable"
  73.         fi
  74.     done
  75. }
  76.  
  77. # Sets the named variable to given value. Aborts if the value is missing or
  78. # if it looks like a flag.
  79. # Usage: parse_value VAR_NAME VALUE
  80. parse_value() {
  81.     local val="$2"
  82.     if [[ -z $val ]]; then
  83.         err_exit "Missing value for variable $1"
  84.         exit
  85.     fi
  86.     if [[ ${val:0:2} == "--" ]]; then
  87.         err_exit "Invalid value $val passed for variable $1"
  88.         exit
  89.     fi
  90.     eval $1=\"$val\"
  91. }
  92.  
  93. # Does simple command-line parsing and initialization.
  94. parse_flags() {
  95.     local i=0
  96.     while test $i -lt ${#ARGV[@]}; do
  97.         local j=$((i+1))
  98.         case ${ARGV[$i]} in
  99.             --)
  100.                 break;;
  101.             --fontlist)
  102.                 fn=0
  103.                 FONTS=""
  104.                 while test $j -lt ${#ARGV[@]}; do
  105.                     test -z "${ARGV[$j]}" && break
  106.                     test $(echo ${ARGV[$j]} | cut -c -2) = "--" && break
  107.                     FONTS[$fn]="${ARGV[$j]}"
  108.                     fn=$((fn+1))
  109.                     j=$((j+1))
  110.                 done
  111.                 i=$((j-1)) ;;
  112.             --exposures)
  113.                 exp=""
  114.                 while test $j -lt ${#ARGV[@]}; do
  115.                     test -z "${ARGV[$j]}" && break
  116.                     test $(echo ${ARGV[$j]} | cut -c -2) = "--" && break
  117.                     exp="$exp ${ARGV[$j]}"
  118.                     j=$((j+1))
  119.                 done
  120.                 parse_value "EXPOSURES" "$exp"
  121.                 i=$((j-1)) ;;
  122.             --fonts_dir)
  123.                 parse_value "FONTS_DIR" ${ARGV[$j]}
  124.                 i=$j ;;
  125.             --lang)
  126.                 parse_value "LANG_CODE" ${ARGV[$j]}
  127.                 i=$j ;;
  128.             --langdata_dir)
  129.                 parse_value "LANGDATA_ROOT" ${ARGV[$j]}
  130.                 i=$j ;;
  131.             --output_dir)
  132.                 parse_value "OUTPUT_DIR" ${ARGV[$j]}
  133.                 i=$j ;;
  134.             --overwrite)
  135.                 OVERWRITE=1 ;;
  136.             --linedata_only)
  137.                 LINEDATA=1 ;;
  138.             --extract_font_properties)
  139.                 EXTRACT_FONT_PROPERTIES=1 ;;
  140.             --noextract_font_properties)
  141.                 EXTRACT_FONT_PROPERTIES=0 ;;
  142.             --tessdata_dir)
  143.                 parse_value "TESSDATA_DIR" ${ARGV[$j]}
  144.                 i=$j ;;
  145.             --training_text)
  146.                 parse_value "TRAINING_TEXT" "${ARGV[$j]}"
  147.                 i=$j ;;
  148.             --textlist)
  149.                 fn=0
  150.                 TEXTS=""
  151.                 while test $j -lt ${#ARGV[@]}; do
  152.                     test -z "${ARGV[$j]}" && break
  153.                     test $(echo ${ARGV[$j]} | cut -c -2) = "--" && break
  154.                     TEXTS[$fn]="${ARGV[$j]}"
  155.                     fn=$((fn+1))
  156.                     j=$((j+1))
  157.                 done
  158.                 i=$((j-1)) ;;
  159.             --wordlist)
  160.                 parse_value "WORDLIST_FILE" ${ARGV[$j]}
  161.                 i=$j ;;
  162.             *)
  163.                 err_exit "Unrecognized argument ${ARGV[$i]}" ;;
  164.         esac
  165.         i=$((i+1))
  166.     done
  167.     if [[ -z ${LANG_CODE} ]]; then
  168.         err_exit "Need to specify a language --lang"
  169.     fi
  170.     if [[ -z ${LANGDATA_ROOT} ]]; then
  171.         err_exit "Need to specify path to language files --langdata_dir"
  172.     fi
  173.     if [[ -z ${TESSDATA_DIR} ]]; then
  174.         if [[ -z ${TESSDATA_PREFIX} ]]; then
  175.             err_exit "Need to specify a --tessdata_dir or have a "\
  176.         "TESSDATA_PREFIX variable defined in your environment"
  177.         else
  178.             TESSDATA_DIR="${TESSDATA_PREFIX}"
  179.         fi
  180.     fi
  181.  
  182.     # Location where intermediate files will be created.
  183.     TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
  184.     # Location of log file for the whole run.
  185.     LOG_FILE=${TRAINING_DIR}/tesstrain.log
  186.  
  187.     # Take training text and wordlist from the langdata directory if not
  188.     # specified in the command-line.
  189.     if [[ -z ${TRAINING_TEXT} ]]; then
  190.         TRAINING_TEXT=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text
  191.     fi
  192.     if [[ -z ${WORDLIST_FILE} ]]; then
  193.         WORDLIST_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist
  194.     fi
  195.     WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams
  196.     NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers
  197.     PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc
  198.     BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs
  199.     UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs
  200.     TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams
  201.     GENERATE_DAWGS=1
  202. }
  203.  
  204. # Function initializes font config with a unique font cache dir.
  205. initialize_fontconfig() {
  206.     export FONT_CONFIG_CACHE="/c/users/wareya/bogus_fcfg/"
  207.     local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt
  208.     echo "Text" >${sample_path}
  209.     run_command text2image --fonts_dir=${FONTS_DIR} \
  210.         --font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \
  211.         --fontconfig_tmpdir=${FONT_CONFIG_CACHE}
  212. }
  213.  
  214. # Helper function for phaseI_generate_image. Generates the image for a single
  215. # language/font combination in a way that can be run in parallel.
  216. generate_font_image() {
  217.     local font="$1"
  218.     tlog "Rendering using ${font}"
  219.     local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
  220.     local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
  221.  
  222.     local common_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}"
  223.     common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words=false"
  224.     common_args+=" --leading=${LEADING}"
  225.     common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}"
  226.     if [[ ${#TEXTS[@]} -eq 0 ]]; then
  227.         common_args+=" --outputbase=${outbase} --max_pages=100"
  228.  
  229.         # add --writing_mode=vertical-upright to common_args if the font is
  230.         # specified to be rendered vertically.
  231.         for vfont in "${VERTICAL_FONTS[@]}"; do
  232.           if [[ "${font}" == "${vfont}" ]]; then
  233.             common_args+=" --writing_mode=vertical-upright "
  234.             break
  235.           fi
  236.         done
  237.  
  238.         run_command text2image ${common_args} --font="${font}" \
  239.             --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
  240.         check_file_readable ${outbase}.box ${outbase}.tif
  241.     else
  242.         for ((i = 0; i < ${#TEXTS[@]}; i++))
  243.         do
  244.             outbase=${TRAINING_DIR}/${LANG_CODE}.${i}.${fontname}.exp${EXPOSURE}
  245.  
  246.             local base_args="${common_args} --outputbase=${outbase} --max_pages=100"
  247.  
  248.             # add --writing_mode=vertical-upright to common_args if the font is
  249.             # specified to be rendered vertically.
  250.             for vfont in "${VERTICAL_FONTS[@]}"; do
  251.               if [[ "${font}" == "${vfont}" ]]; then
  252.                 base_args+=" --writing_mode=vertical-upright "
  253.                 break
  254.               fi
  255.             done
  256.  
  257.             run_command text2image ${base_args} --font="${font}" \
  258.                 --text=${TEXTS[$i]} ${TEXT2IMAGE_EXTRA_ARGS}
  259.             check_file_readable ${outbase}.box ${outbase}.tif
  260.         done
  261.     fi
  262.  
  263.     if ((EXTRACT_FONT_PROPERTIES)) &&
  264.         [[ -r ${TRAIN_NGRAMS_FILE} ]]; then
  265.         tlog "Extracting font properties of ${font}"
  266.         run_command text2image ${common_args} --font="${font}" \
  267.             --ligatures=false --text=${TRAIN_NGRAMS_FILE} \
  268.             --only_extract_font_properties --ptsize=32
  269.         check_file_readable ${outbase}.fontinfo
  270.     fi
  271. }
  272.  
  273.  
  274. # Phase I : Generate (I)mages from training text for each font.
  275. phase_I_generate_image() {
  276.     local par_factor=$1
  277.     if [[ -z ${par_factor} || ${par_factor} -le 0 ]]; then
  278.         par_factor=1
  279.     fi
  280.     tlog "\n=== Phase I: Generating training images ==="
  281.     if [[ -z ${TRAINING_TEXT} ]] || [[ ! -r ${TRAINING_TEXT} ]]; then
  282.         err_exit "Could not find training text file ${TRAINING_TEXT}"
  283.     fi
  284.     CHAR_SPACING="0.0"
  285.  
  286.     for EXPOSURE in $EXPOSURES; do
  287.         if ((EXTRACT_FONT_PROPERTIES)) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
  288.             # Parse .bigram_freqs file and compose a .train_ngrams file with text
  289.             # for tesseract to recognize during training. Take only the ngrams whose
  290.             # combined weight accounts for 95% of all the bigrams in the language.
  291.             NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
  292.                 | awk '{s=s+$2}; END {print (s/100)*p}' p=99)
  293.             cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
  294.                 | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
  295.                 x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
  296.             check_file_readable ${TRAIN_NGRAMS_FILE}
  297.         fi
  298.  
  299.         local counter=0
  300.         for font in "${FONTS[@]}"; do
  301.             generate_font_image "${font}" &
  302.             let counter=counter+1
  303.             let rem=counter%par_factor
  304.             if [[ "${rem}" -eq 0 ]]; then
  305.               wait
  306.             fi
  307.         done
  308.         wait
  309.         # Check that each process was successful.
  310.         for font in "${FONTS[@]}"; do
  311.             if [[ ${#TEXTS[@]} -eq 0 ]]; then
  312.                 local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
  313.                 local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
  314.                 check_file_readable ${outbase}.box ${outbase}.tif
  315.             else
  316.                 for ((i = 0; i < ${#TEXTS[@]}; i++))
  317.                 do
  318.                     local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
  319.                     local outbase=${TRAINING_DIR}/${LANG_CODE}.${i}.${fontname}.exp${EXPOSURE}
  320.                     check_file_readable ${outbase}.box ${outbase}.tif
  321.                 done
  322.             fi
  323.         done
  324.     done
  325. }
  326.  
  327. # Phase UP : Generate (U)nicharset and (P)roperties file.
  328. phase_UP_generate_unicharset() {
  329.     tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
  330.  
  331.     local box_files=$(ls ${TRAINING_DIR}/*.box)
  332.     UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
  333.     run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \
  334.       --norm_mode "${NORM_MODE}" ${box_files}
  335.     check_file_readable ${UNICHARSET_FILE}
  336.  
  337.     XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
  338.     run_command set_unicharset_properties \
  339.         -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
  340.         --script_dir=${LANGDATA_ROOT}
  341.     check_file_readable ${XHEIGHTS_FILE}
  342. }
  343.  
  344. # Phase D : Generate (D)awg files from unicharset file and wordlist files
  345. phase_D_generate_dawg() {
  346.     tlog "\n=== Phase D: Generating Dawg files ==="
  347.  
  348.     # Skip if requested
  349.     if [[ ${GENERATE_DAWGS} -eq 0 ]]; then
  350.       tlog "Skipping ${phase_name}"
  351.       return
  352.     fi
  353.  
  354.     # Output files
  355.     WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg
  356.     FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg
  357.     PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg
  358.     NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg
  359.     BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg
  360.  
  361.     # Word DAWG
  362.     local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq
  363.     if [[ -s ${WORDLIST_FILE} ]]; then
  364.         tlog "Generating word Dawg"
  365.         check_file_readable ${UNICHARSET_FILE}
  366.         run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
  367.             ${UNICHARSET_FILE}
  368.         check_file_readable ${WORD_DAWG}
  369.  
  370.         FREQ_DAWG_SIZE=100
  371.         head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file}
  372.     fi
  373.  
  374.     # Freq-word DAWG
  375.     if [[ -s ${freq_wordlist_file} ]]; then
  376.         check_file_readable ${UNICHARSET_FILE}
  377.         tlog "Generating frequent-word Dawg"
  378.         run_command wordlist2dawg  -r 1 ${freq_wordlist_file} \
  379.             ${FREQ_DAWG} ${UNICHARSET_FILE}
  380.         check_file_readable ${FREQ_DAWG}
  381.     fi
  382.  
  383.     # Punctuation DAWG
  384.     # -r arguments to wordlist2dawg denote RTL reverse policy
  385.     # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
  386.     # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
  387.     # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
  388.     # 2/RRP_FORCE_REVERSE for the punctuation DAWG.
  389.     local punc_reverse_policy=0;
  390.     if [[ "${LANG_IS_RTL}" == "1" ]]; then
  391.       punc_reverse_policy=2
  392.     fi
  393.     if [[ ! -s ${PUNC_FILE} ]]; then
  394.         PUNC_FILE="${LANGDATA_ROOT}/common.punc"
  395.     fi
  396.     check_file_readable ${PUNC_FILE}
  397.     run_command wordlist2dawg -r ${punc_reverse_policy} \
  398.         ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
  399.     check_file_readable ${PUNC_DAWG}
  400.  
  401.     # Numbers DAWG
  402.     if [[ -s ${NUMBERS_FILE} ]]; then
  403.         run_command wordlist2dawg -r 0 \
  404.             ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
  405.         check_file_readable ${NUMBER_DAWG}
  406.     fi
  407.  
  408.     # Bigram dawg
  409.     if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
  410.         run_command wordlist2dawg -r 1 \
  411.             ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
  412.         check_file_readable ${BIGRAM_DAWG}
  413.     fi
  414. }
  415.  
  416. # Phase E : (E)xtract .tr feature files from .tif/.box files
  417. phase_E_extract_features() {
  418.     local box_config=$1
  419.     local par_factor=$2
  420.     local ext=$3
  421.     if [[ -z ${par_factor} || ${par_factor} -le 0 ]]; then
  422.         par_factor=1
  423.     fi
  424.     tlog "\n=== Phase E: Generating ${ext} files ==="
  425.  
  426.     local img_files=""
  427.     for exposure in ${EXPOSURES}; do
  428.         img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
  429.     done
  430.  
  431.     # Use any available language-specific configs.
  432.     local config=""
  433.     if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then
  434.         config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config
  435.     fi
  436.  
  437.     OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX}
  438.     export TESSDATA_PREFIX=${TESSDATA_DIR}
  439.     tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
  440.     local counter=0
  441.     for img_file in ${img_files}; do
  442.         run_command tesseract ${img_file} ${img_file%.*} \
  443.             ${box_config} ${config} &
  444.       let counter=counter+1
  445.       let rem=counter%par_factor
  446.       if [[ "${rem}" -eq 0 ]]; then
  447.         wait
  448.       fi
  449.     done
  450.     wait
  451.     export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX}
  452.     # Check that all the output files were produced.
  453.     for img_file in ${img_files}; do
  454.         check_file_readable "${img_file%.*}.${ext}"
  455.     done
  456. }
  457.  
  458. # Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
  459. # phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
  460. phase_C_cluster_prototypes() {
  461.     tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
  462.     local out_normproto=$1
  463.  
  464.     run_command cntraining -D "${TRAINING_DIR}/" \
  465.         $(ls ${TRAINING_DIR}/*.tr)
  466.  
  467.     check_file_readable ${TRAINING_DIR}/normproto
  468.     mv ${TRAINING_DIR}/normproto ${out_normproto}
  469. }
  470.  
  471. # Phase S : (S)hape clustering
  472. phase_S_cluster_shapes() {
  473.     if ((! RUN_SHAPE_CLUSTERING)); then
  474.         tlog "\n=== Shape Clustering disabled ==="
  475.         return
  476.     fi
  477.     check_file_readable ${LANGDATA_ROOT}/font_properties
  478.     local font_props="-F ${LANGDATA_ROOT}/font_properties"
  479.     if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\
  480.        [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
  481.         font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
  482.     fi
  483.  
  484.     run_command shapeclustering \
  485.         -D "${TRAINING_DIR}/" \
  486.         -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
  487.         -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
  488.         ${font_props} \
  489.         $(ls ${TRAINING_DIR}/*.tr)
  490.     check_file_readable ${TRAINING_DIR}/shapetable \
  491.         ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
  492. }
  493.  
  494. # Phase M : Clustering microfeatures (mfTraining)
  495. phase_M_cluster_microfeatures() {
  496.     tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ==="
  497.  
  498.     check_file_readable ${LANGDATA_ROOT}/font_properties
  499.     font_props="-F ${LANGDATA_ROOT}/font_properties"
  500.     if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \
  501.        [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then
  502.         font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
  503.     fi
  504.  
  505.     run_command mftraining \
  506.         -D "${TRAINING_DIR}/" \
  507.         -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
  508.         -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
  509.         ${font_props} \
  510.         $(ls ${TRAINING_DIR}/*.tr)
  511.     check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \
  512.         ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset
  513.     mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp
  514.     mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable
  515.     mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable
  516.     mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset
  517. }
  518.  
  519. phase_B_generate_ambiguities() {
  520.   tlog "\n=== Phase B : ambiguities training ==="
  521.  
  522.   # Check for manually created ambiguities data.
  523.   if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then
  524.       tlog "Found file ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs"
  525.       cp ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs \
  526.           ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
  527.       # Make it writable, as it may be read-only in the client.
  528.       chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs
  529.       return
  530.   else
  531.       tlog "No unicharambigs file found!"
  532.   fi
  533.  
  534.   # TODO: Add support for generating ambiguities automatically.
  535. }
  536.  
  537. make__lstmdata() {
  538.   tlog "\n=== Constructing LSTM training data ==="
  539.   local lang_prefix="${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}"
  540.   if [[ ! -d "${OUTPUT_DIR}" ]]; then
  541.       tlog "Creating new directory ${OUTPUT_DIR}"
  542.       mkdir -p "${OUTPUT_DIR}"
  543.   fi
  544.   local lang_is_rtl=""
  545.   if [[ "${LANG_IS_RTL}" == "1" ]]; then
  546.     lang_is_rtl="--lang_is_rtl"
  547.   fi
  548.   local pass_through=""
  549.   if [[ "${NORM_MODE}" -ge "2" ]]; then
  550.     pass_through="--pass_through_recoder"
  551.   fi
  552.  
  553.   # Build the starter traineddata from the inputs.
  554.   run_command combine_lang_model \
  555.     --input_unicharset "${TRAINING_DIR}/${LANG_CODE}.unicharset" \
  556.     --script_dir "${LANGDATA_ROOT}" \
  557.     --words "${lang_prefix}.wordlist" \
  558.     --numbers "${lang_prefix}.numbers" \
  559.     --puncs "${lang_prefix}.punc" \
  560.     --output_dir "${OUTPUT_DIR}" --lang "${LANG_CODE}" \
  561.     "${pass_through}" "${lang_is_rtl}"
  562.   for f in "${TRAINING_DIR}/${LANG_CODE}".*.lstmf; do
  563.     tlog "Moving ${f} to ${OUTPUT_DIR}"
  564.     mv "${f}" "${OUTPUT_DIR}"
  565.   done
  566.   local lstm_list="${OUTPUT_DIR}/${LANG_CODE}.training_files.txt"
  567.   ls -1 "${OUTPUT_DIR}/${LANG_CODE}".*.lstmf > "${lstm_list}"
  568. }
  569.  
  570. make__traineddata() {
  571.   tlog "\n=== Making final traineddata file ==="
  572.   local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}
  573.  
  574.   # Combine available files for this language from the langdata dir.
  575.   if [[ -r ${lang_prefix}.config ]]; then
  576.     tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}"
  577.     cp ${lang_prefix}.config ${TRAINING_DIR}
  578.     chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config
  579.   fi
  580.   if [[ -r ${lang_prefix}.cube-unicharset ]]; then
  581.     tlog "Copying ${lang_prefix}.cube-unicharset to ${TRAINING_DIR}"
  582.     cp ${lang_prefix}.cube-unicharset ${TRAINING_DIR}
  583.     chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-unicharset
  584.   fi
  585.   if [[ -r ${lang_prefix}.cube-word-dawg ]]; then
  586.     tlog "Copying ${lang_prefix}.cube-word-dawg to ${TRAINING_DIR}"
  587.     cp ${lang_prefix}.cube-word-dawg ${TRAINING_DIR}
  588.     chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-word-dawg
  589.   fi
  590.   if [[ -r ${lang_prefix}.params-model ]]; then
  591.     tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}"
  592.     cp ${lang_prefix}.params-model ${TRAINING_DIR}
  593.     chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model
  594.   fi
  595.  
  596.   # Compose the traineddata file.
  597.   run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
  598.  
  599.   # Copy it to the output dir, overwriting only if allowed by the cmdline flag.
  600.   if [[ ! -d ${OUTPUT_DIR} ]]; then
  601.       tlog "Creating new directory ${OUTPUT_DIR}"
  602.       mkdir -p ${OUTPUT_DIR}
  603.   fi
  604.   local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata;
  605.   if [[ -f ${destfile} ]] && ((! OVERWRITE)); then
  606.       err_exit "File ${destfile} exists and no --overwrite specified";
  607.   fi
  608.   tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}"
  609.   cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile}
  610. }
RAW Paste Data