Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/bin/bash
# BASH SCRIPT FOR ADDING CHARACTERS
#
# Settings for a run that generates line data with tesstrain.sh and then
# fine-tunes the existing eng model with lstmtraining. Edit the values in
# this section before running the stages further down.
################################################
# directory with training scripts
# NOTE(review): no stage in the visible script reads this variable; the
# stages invoke each tool by its own path. Kept in case something else
# depends on it.
tesstrain_dir=/usr/local/bin/tesseract-training/
# directory with the old 'best' training set
tessdata_dir=~/Desktop/tesseract/tessdata
# fonts directory for this system
# Written as an absolute path: the previous spelling, ~/../../Library/Fonts,
# only reaches /Library/Fonts when $HOME is at most two levels below /.
fonts_dir=/Library/Fonts
# fonts to use for training - not a huge set but we hope enough to
# add the extra chars
#fonts_for_training="Magic R Regular"
# fonts for computing evals of best fit model
#fonts_for_eval="Arial"
# IMPORTANT - ADD THE NEW CHARS TO langdata/eng/eng.training_text with
# about 15 instances per char
langdata_dir=~/Desktop/langdata_lstm
# output directories for this run
train_output_dir=~/Desktop/tesstutorial/trainplusminus
eval_output_dir=~/Desktop/tesstutorial/evalplusminus
# maxpages one wants to create on training
maxpages=1000
# maximum iterations for finetuning
maxiterations=3600
# the output trained data file to drop into tesseract
final_trained_data_file="${train_output_dir}/eng.traineddata.NEW"
# fatal bug workaround for pango
export PANGOCAIRO_BACKEND=fc
################################################
################################################################
# variables to set tasks performed
# A stage below runs only when its flag is exactly "yes". Each flag keeps
# its built-in default unless a non-empty value is already present in the
# environment, so a single stage can be toggled without editing this file,
# e.g.:  MakeTraining=yes RunTraining=no bash <this script>
MakeTraining="${MakeTraining:-no}"
MakeEval="${MakeEval:-yes}"
MakeLSTM="${MakeLSTM:-yes}"
RunTraining="${RunTraining:-yes}"
BuildFinalTrainedFile="${BuildFinalTrainedFile:-yes}"
################################################################
################################################
# GENERATE TRAINING DATA
# Runs only when MakeTraining is "yes". Clears the files directly inside
# $train_output_dir, then has tesstrain.sh render line data for the eng
# language with the Arial font into that directory.
if [[ "${MakeTraining:-no}" == "yes" ]]; then
  echo "###### MAKING TRAINING DATA ######"
  # ${var:?} aborts the script if the directory variable is empty or unset,
  # so the glob can never expand to /*. -f keeps rm quiet when the
  # directory is already empty.
  rm -f -- "${train_output_dir:?}"/*
  # Every option line ends in " \" (space before the backslash) so that an
  # option can never be glued onto the one on the following line.
  /usr/local/bin/tesstrain.sh \
    --fonts_dir "$fonts_dir" \
    --lang eng \
    --linedata_only \
    --langdata_dir "$langdata_dir" \
    --tessdata_dir "$tessdata_dir" \
    --fontlist "Arial" \
    --noextract_font_properties \
    --exposures "0" \
    --maxpages "$maxpages" \
    --save_box_tiff \
    --output_dir "$train_output_dir" \
    || {
      # Later stages consume this output; stop instead of training on
      # missing or stale data.
      echo "tesstrain.sh failed while making training data" >&2
      exit 1
    }
fi
################################################
################################################
# GENERATE EVAL DATA
# Runs only when MakeEval is "yes". Clears the files directly inside
# $eval_output_dir, then has tesstrain.sh render line data for the eng
# language with the Times New Roman font into that directory.
if [[ "${MakeEval:-no}" == "yes" ]]; then
  echo "###### MAKING EVAL DATA ######"
  # ${var:?} aborts the script if the directory variable is empty or unset,
  # so the glob can never expand to /*. -f keeps rm quiet when the
  # directory is already empty.
  rm -f -- "${eval_output_dir:?}"/*
  /usr/local/bin/tesstrain.sh \
    --fonts_dir "$fonts_dir" \
    --lang eng \
    --linedata_only \
    --langdata_dir "$langdata_dir" \
    --tessdata_dir "$tessdata_dir" \
    --fontlist "Times New Roman" \
    --noextract_font_properties \
    --exposures "0" \
    --maxpages "$maxpages" \
    --save_box_tiff \
    --output_dir "$eval_output_dir" \
    || {
      # Fail fast rather than carrying on with a half-built eval set.
      echo "tesstrain.sh failed while making eval data" >&2
      exit 1
    }
fi
################################################
################################################
# EXTRACT THE CURRENT MODEL OF THE BEST TRAINING DATA SET
# Runs only when MakeLSTM is "yes". Calls combine_tessdata -e on
# $tessdata_dir/eng.traineddata with $train_output_dir/eng.lstm as the
# output file; the fine-tuning stage continues from that file.
if [[ "${MakeLSTM:-no}" == "yes" ]]; then
  echo "#### combine_testdata to make lstm from previous trained set ####"
  /usr/local/bin/combine_tessdata \
    -e "$tessdata_dir/eng.traineddata" "$train_output_dir/eng.lstm" \
    || {
      # Without eng.lstm the fine-tuning stage has nothing to continue from.
      echo "combine_tessdata failed to extract eng.lstm" >&2
      exit 1
    }
fi
################################################
################################################
# FINETUNE THE CURRENT MODEL VIA THE NEW TRAINING DATA
# Runs only when RunTraining is "yes". Continues from the extracted
# $train_output_dir/eng.lstm, reads the file list written by the
# training-data stage, and writes checkpoints under the
# $train_output_dir/plusminus prefix for at most $maxiterations iterations.
if [[ "${RunTraining:-no}" == "yes" ]]; then
  echo "#### training from previous optimum #####"
  /usr/local/bin/lstmtraining \
    --continue_from "$train_output_dir/eng.lstm" \
    --model_output "$train_output_dir/plusminus" \
    --traineddata "$train_output_dir/eng/eng.traineddata" \
    --old_traineddata "$tessdata_dir/eng.traineddata" \
    --train_listfile "$train_output_dir/eng.training_files.txt" \
    --max_iterations "$maxiterations" \
    || {
      # The final build step reads the checkpoint produced here.
      echo "lstmtraining failed during fine-tuning" >&2
      exit 1
    }
fi
################################################
################################################
# COMBINE THE NEW BEST TRAINING DATA
# Runs only when BuildFinalTrainedFile is "yes". Runs lstmtraining with
# --stop_training on the plusminus checkpoint and writes the result to
# $final_trained_data_file, the same path the banner below announces.
if [[ "${BuildFinalTrainedFile:-no}" == "yes" ]]; then
  echo "#### Building final trained file $final_trained_data_file ####"
  # Each continuation backslash is preceded by a space; without it an
  # option fuses with the one on the next line (for example
  # "--stop_training--continue_from").
  # NOTE(review): --U is passed through unchanged; confirm that the
  # installed lstmtraining accepts this option.
  # ${var:?} aborts if the output path is empty or unset rather than
  # handing lstmtraining an empty --model_output.
  /usr/local/bin/lstmtraining --stop_training \
    --continue_from "$train_output_dir/plusminus_checkpoint" \
    --traineddata "$train_output_dir/eng/eng.traineddata" \
    --old_traineddata "$tessdata_dir/eng.traineddata" \
    --U "$train_output_dir/eng/eng.unicharset" \
    --model_output "${final_trained_data_file:?}" \
    || {
      echo "lstmtraining failed to build $final_trained_data_file" >&2
      exit 1
    }
fi
################################################
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement