Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
#
# Fine-tune the stock Tesseract 4 English LSTM model so that it learns a few
# extra characters. Based on
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00#fine-tuning-for--a-few-characters
#
# The run is split into stages, each switched on or off by a yes/no variable
# (see "stages to perform" below). The defaults can be overridden from the
# environment without editing this file, e.g.
#   RunTraining=yes BuildFinalTrainedFile=no bash <this script>
#
# A failing stage is reported on stderr but does not abort the run.
# Only arrays and [[ ]] are used, so bash 3.2 is sufficient.

set -u  # a misspelled variable name should fail loudly, not expand to ""

################################################################
# configuration

# Directory with the training scripts and tools - this is not the usual
# place, because they are not installed by default.
tesstrain_dir=/usr/local/bin/tesseract-training/

# Directory with the old 'best' training set.
tessdata_dir=./tessdata

# Fonts directory for this system (using MacPorts pango in this case).
#fonts_dir=/Library/Fonts
fonts_dir=/opt/local/share/fonts

# Fonts to use for training - not a huge set, but we hope enough to add the
# extra chars. One array element per font, so names containing spaces reach
# tesstrain.sh as single arguments without needing eval.
fonts_for_training=(
  'Helvetica'
  'Helvetica Bold'
  'Helvetica Medium Oblique'
  'Helvetica Bold Oblique'
  'Times'
  'Times Bold'
  'Times Medium Italic'
  'Times Bold Italic'
  'Courier'
  'Courier Bold'
  'Courier Italic'
  'Utopia'
  'Utopia Bold'
  'Utopia Bold Italic'
  'Terminal'
  'Terminal Bold'
  'New Century Schoolbook'
  'New Century Schoolbook Bold'
  'New Century Schoolbook Medium Italic'
  'New Century Schoolbook Bold Italic'
  'Lucida Medium'
  'Lucida Medium Italic'
  'Lucida Bold'
  'Lucida Bold Italic'
)
# A minimal set for fast tests:
#fonts_for_training=('Helvetica' 'Times' 'Courier Bold')

# Fonts for computing evals of the best-fit model.
fonts_for_eval=('Helvetica')

# Downloaded directory with language data - like XXX.unicharset, the eng
# directory, etc.
# IMPORTANT - ADD THE NEW CHARS TO langdata/eng/eng.training_text with
# about 15 instances per char.
langdata_dir=./langdata

# Output directories for this run.
train_output_dir=./trained_plus_chars
eval_output_dir=./eval_plus_chars

# The output trained data file to drop into tesseract.
final_trained_data_file=$train_output_dir/eng.traineddata.NEW

# Fatal bug workaround for pango.
export PANGOCAIRO_BACKEND=fc

################################################################
# stages to perform ("yes" runs the stage, anything else skips it)
MakeTraining=${MakeTraining:-no}
MakeEval=${MakeEval:-no}
MakeLSTM=${MakeLSTM:-no}
RunTraining=${RunTraining:-no}
BuildFinalTrainedFile=${BuildFinalTrainedFile:-yes}
################################################################

# Print a diagnostic on stderr.
warn() {
  printf 'WARNING: %s\n' "$*" >&2
}

# Succeed when a stage switch is set to "yes".
# Arguments: $1 - value of the switch
stage_enabled() {
  [[ "$1" == "yes" ]]
}

# Run tesstrain.sh in line-data-only mode for English.
# Globals:   tesstrain_dir, fonts_dir, langdata_dir, tessdata_dir (read)
# Arguments: $1  - output directory
#            $2+ - font names passed after --fontlist, one argument each
# Returns:   exit status of tesstrain.sh
run_tesstrain() {
  local out_dir=$1
  shift
  "$tesstrain_dir/tesstrain.sh" \
    --fonts_dir "$fonts_dir" \
    --fontlist "$@" \
    --lang eng \
    --linedata_only \
    --noextract_font_properties \
    --langdata_dir "$langdata_dir" \
    --tessdata_dir "$tessdata_dir" \
    --output_dir "$out_dir"
}

# Render the training text in every training font.
# Afterwards $train_output_dir should have eng.FontX.exp0.lstmf files and
# eng.training_files.txt.
make_training_data() {
  echo "###### MAKING TRAINING DATA ######"
  run_tesstrain "$train_output_dir" "${fonts_for_training[@]}"
}

# Same as the training data, but for the eval font set.
# Afterwards $eval_output_dir should have similar files as
# $train_output_dir, but for a different font set.
make_eval_data() {
  echo "###### MAKING EVAL DATA ######"
  run_tesstrain "$eval_output_dir" "${fonts_for_eval[@]}"
}

# Extract the LSTM model from the previously trained set.
# Afterwards we should have $train_output_dir/eng.lstm.
extract_lstm() {
  echo "#### combine_testdata to make lstm from previous trained set ####"
  "$tesstrain_dir/combine_tessdata" \
    -e "$tessdata_dir/eng.traineddata" \
    "$train_output_dir/eng.lstm"
}

# Continue training from the extracted model using the new training data.
run_training() {
  echo "#### training from previous optimum #####"
  "$tesstrain_dir/lstmtraining" \
    --model_output "$train_output_dir/pluschars" \
    --continue_from "$train_output_dir/eng.lstm" \
    --traineddata "$train_output_dir/eng/eng.traineddata" \
    --old_traineddata "$tessdata_dir/eng.traineddata" \
    --max_iterations 3600 \
    --train_listfile "$train_output_dir/eng.training_files.txt"
}

# Convert the training checkpoint into the final traineddata file.
build_final_traineddata() {
  echo "#### Building final trained file $final_trained_data_file d####"
  "$tesstrain_dir/lstmtraining" \
    --stop_training \
    --continue_from "$train_output_dir/pluschars_checkpoint" \
    --traineddata "$train_output_dir/eng/eng.traineddata" \
    --U "$train_output_dir/eng/eng.unicharset" \
    --model_output "$final_trained_data_file"
}

if stage_enabled "$MakeTraining"; then
  make_training_data || warn "making training data failed"
fi

if stage_enabled "$MakeEval"; then
  make_eval_data || warn "making eval data failed"
fi

if stage_enabled "$MakeLSTM"; then
  extract_lstm || warn "extracting eng.lstm failed"
fi

if stage_enabled "$RunTraining"; then
  run_training || warn "training failed"
fi

if stage_enabled "$BuildFinalTrainedFile"; then
  build_final_traineddata || warn "building $final_trained_data_file failed"
fi

# now $final_trained_data_file is substituted for the installed
# trained file /usr/local/share/tessdata/eng.traineddata
# (but new chars are not recognized)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement