Advertisement
Guest User

Untitled

a guest
Dec 11th, 2017
3,660
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Bash 4.26 KB | None | 0 0
  1. #!/bin/bash
  2.  
  3. # based on https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00#fine-tuning-for--a-few-characters
  4.  
  5. # directory with training scripts - this is not the usual place
  6. #   because they are not installed by default
  7. tesstrain_dir=/usr/local/bin/tesseract-training/
  8.  
  9. # directory with the old 'best' training set
  10. tessdata_dir=./tessdata
  11.  
  12. # fonts directory for this system
  13. #fonts_dir=/Library/Fonts
  14.  
  15. # using macports pango in this case
  16. fonts_dir=/opt/local/share/fonts
  17.  
  18.  
  19. # fonts to use for training - not a huge set but we hope enough to
  20. #    add the extra chars
  21. fonts_for_training="'Helvetica' 'Helvetica Bold' 'Helvetica Medium Oblique' 'Helvetica Bold Oblique'  'Times'  'Times Bold' 'Times Medium Italic'  'Times Bold Italic' 'Courier' 'Courier Bold' 'Courier Italic' 'Utopia' 'Utopia Bold' 'Utopia Bold Italic' 'Terminal' 'Terminal Bold'  'New Century Schoolbook' 'New Century Schoolbook Bold' 'New Century Schoolbook Medium Italic' 'New Century Schoolbook Bold Italic' 'Lucida Medium' 'Lucida Medium Italic' 'Lucida Bold' 'Lucida Bold Italic'"
  22.  
  23. # a minimal set for fast tests
  24. #fonts_for_training="'Helvetica' 'Times' 'Courier Bold'"
  25.  
  26. # fonts for computing evals of best fit model
  27. fonts_for_eval="Helvetica"
  28.  
  29.  
  30. # downloaded directory with language data -
  31. #   like XXX.unicharset, eng directory, etc
  32. # IMPORTANT - ADD THE NEW CHARS TO langdata/eng/eng.training_text with
  33. #    about 15 instances per char
  34. langdata_dir=./langdata
  35.  
  36.  
  37.  
  38. # output directories for this run
  39. train_output_dir=./trained_plus_chars
  40. eval_output_dir=./eval_plus_chars
  41.  
  42. # the output trained data file to drop into tesseract
  43. final_trained_data_file=$train_output_dir/eng.traineddata.NEW
  44.  
  45. # fatal bug workaround for pango
  46. export  PANGOCAIRO_BACKEND=fc
  47.  
  48. ################################################################
  49. # variables to set tasks performed
  50. MakeTraining=no
  51. MakeEval=no
  52. MakeLSTM=no
  53. RunTraining=no
  54. BuildFinalTrainedFile=yes
  55. ################################################################
  56.  
  57.  
  58.  
  59.  
  60. if [ $MakeTraining = "yes" ]; then
  61.     echo "###### MAKING TRAINING DATA ######"
  62.  
  63. # the EVAL handles the quotes in the font list
  64. eval $tesstrain_dir/tesstrain.sh  \
  65.      --fonts_dir $fonts_dir \
  66.      --fontlist $fonts_for_training \
  67.      --lang eng \
  68.      --linedata_only\
  69.      --noextract_font_properties \
  70.      --langdata_dir $langdata_dir \
  71.      --tessdata_dir $tessdata_dir \
  72.      --output_dir $train_output_dir
  73. fi
  74.  
  75. # at this point, $train_output_dir should have eng.FontX.exp0.lstmf
  76. # and eng.training_files.txt
  77.  
  78.  
  79. # eval data
  80. if [ $MakeEval = "yes" ]; then
  81.  echo "###### MAKING EVAL DATA ######"
  82. eval $tesstrain_dir/tesstrain.sh \
  83.      --fonts_dir $fonts_dir\
  84.      --fontlist $fonts_for_eval \
  85.      --lang eng \
  86.      --linedata_only \
  87.      --noextract_font_properties \
  88.      --langdata_dir  $langdata_dir \
  89.      --tessdata_dir  $tessdata_dir \
  90.      --output_dir $eval_output_dir
  91.  
  92. fi
  93.  
  94. # at this point, $eval_output_dir should have similar viles as
  95. # $train_output_dir but for different font set
  96.  
  97. if [ $MakeLSTM = "yes" ]; then
  98.     echo "#### combine_testdata to make lstm from previous trained set ####"
  99.     $tesstrain_dir/combine_tessdata \
  100.       -e $tessdata_dir/eng.traineddata  \
  101.        $train_output_dir/eng.lstm
  102. fi
  103.  
  104. # at this point, we should have $train_output_dir/eng.lstm
  105.  
  106. if [ $RunTraining = "yes" ]; then
  107.     echo "#### training from previous optimum  #####"
  108.     $tesstrain_dir/lstmtraining\
  109.     --model_output    $train_output_dir/pluschars \
  110.     --continue_from   $train_output_dir/eng.lstm \
  111.     --traineddata     $train_output_dir/eng/eng.traineddata \
  112.     --old_traineddata $tessdata_dir/eng.traineddata \
  113.     --max_iterations 3600 \
  114.     --train_listfile $train_output_dir/eng.training_files.txt
  115. fi
  116.  
  117.  
  118. if [ $BuildFinalTrainedFile = "yes" ] ; then
  119.     echo "#### Building final trained file $final_trained_data_file d####"
  120.     $tesstrain_dir/lstmtraining \
  121.     --stop_training \
  122.     --continue_from $train_output_dir/pluschars_checkpoint \
  123.     --traineddata $train_output_dir/eng/eng.traineddata \
  124.     --U  $train_output_dir/eng/eng.unicharset \
  125.     --model_output $final_trained_data_file
  126.  
  127. fi
  128.  
  129.  
  130. # now  $final_trained_data_file is substituted for installed
  131. # trained file /usr/local/share/tessdata/eng.traineddata
  132. # (but new chars are not recognized)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement