Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- diff -r -u0 willuslib/ocrtess.c willuslib/ocrtess.c
- --- willuslib/ocrtess.c 2018-12-31 19:59:58.000000000 +0100
- +++ willuslib/ocrtess.c 2019-07-27 18:47:06.706765733 +0200
- @@ -29,0 +30,258 @@
- +
- +
- +/*
- +** tess_capi_init: create and initialize a Tesseract handle via the C API.
- +**
- +** datapath  Passed to TessBaseAPIInit2() as the data path.  If NULL, the
- +**           message reports getenv("TESSDATA_PREFIX") instead.
- +** language  Language string for TessBaseAPIInit2(); NULL or "" means "eng".
- +** ocr_type  0: OEM_DEFAULT
- +**           1: OEM_TESSERACT_ONLY
- +**           2: OEM_LSTM_ONLY
- +**           3: OEM_TESSERACT_LSTM_COMBINED
- +**           NOTE: currently ignored--the function always uses OEM_DEFAULT.
- +** out       If not NULL, the initialization message is printed to it.
- +** initstr   If not NULL and maxlen>0, receives the initialization message,
- +**           truncated to maxlen-1 characters and always NUL-terminated.
- +** status    Must not be NULL.  Receives the TessBaseAPIInit2() result
- +**           (0 on success), or -1 if no handle could be created.
- +**
- +** Returns the handle as a void pointer, or NULL on failure.  The LC_ALL
- +** locale is switched to "C" during the call and restored before returning.
- +*/
- +void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
- +                     char *initstr,int maxlen,int *status)
- +
- +    {
- +    char original_locale[256];
- +    char istr[1024];
- +    const char *saved;
- +    const char *lang;
- +    const char *folder;
- +    const char *engine;
- +    char **languages;
- +    char **l;
- +    size_t n;
- +    int oem;
- +    TessOcrEngineMode mode;
- +    TessBaseAPI *api;
- +
- +    api=TessBaseAPICreate();
- +    if (api==NULL)
- +        {
- +        (*status)=-1;
- +        return(NULL);
- +        }
- +
- +    /* willus mod, 11-24-16 */
- +    /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */
- +    /* setlocale() may return NULL, so fall back to "C" when saving the name. */
- +    saved=setlocale(LC_ALL,NULL);
- +    snprintf(original_locale,sizeof(original_locale),"%s",saved!=NULL?saved:"C");
- +    setlocale(LC_ALL,"C");
- +
- +    lang="eng";
- +    if (language!=NULL && language[0]!='\0')
- +        lang=language;
- +
- +    /* v4.00 loads either TESSERACT engine, LSTM engine, or both.  No CUBE. */
- +    ocr_type=0; /* Ignore specified and use default */
- +    mode = ocr_type==0 ? OEM_DEFAULT
- +         : ocr_type==1 ? OEM_TESSERACT_ONLY
- +         : ocr_type==2 ? OEM_LSTM_ONLY
- +         : OEM_TESSERACT_LSTM_COMBINED;
- +    TessBaseAPISetOutputName(api,NULL);
- +    (*status)=TessBaseAPIInit2(api,datapath,lang,mode);
- +    if ((*status)!=0)
- +        {
- +        /* willus mod, 11-24-16 */
- +        setlocale(LC_ALL,original_locale);
- +        TessBaseAPIEnd(api);
- +        TessBaseAPIDelete(api);
- +        return(NULL);
- +        }
- +
- +    /*
- +    ** If the page segmentation mode is still PSM_SINGLE_BLOCK after Init (i.e.
- +    ** no config file changed it), set it explicitly to PSM_SINGLE_BLOCK.
- +    ** It would be simpler to set the value before Init, but that doesn't work.
- +    */
- +    if (TessBaseAPIGetPageSegMode(api)==PSM_SINGLE_BLOCK)
- +        TessBaseAPISetPageSegMode(api,PSM_SINGLE_BLOCK);
- +
- +    /*
- +    ** Initialization message.  Every append is bounded by the space left in
- +    ** istr[], so a long language list is truncated rather than overflowing.
- +    */
- +    snprintf(istr,sizeof(istr),"%s",TessVersion());
- +    folder=(datapath!=NULL) ? datapath : getenv("TESSDATA_PREFIX");
- +    if (folder==NULL)
- +        folder=""; /* Passing NULL for %s is undefined behavior */
- +    n=strlen(istr);
- +    snprintf(&istr[n],sizeof(istr)-n,"\n Tesseract data folder = '%s'",folder);
- +    n=strlen(istr);
- +    snprintf(&istr[n],sizeof(istr)-n,"%s","\n Tesseract languages: ");
- +
- +    /* The engine mode belongs to the handle, so look it up once. */
- +    oem=(int)TessBaseAPIOem(api);
- +    engine=(oem==2) ? "LSTM+Tess" : (oem==1 ? "LSTM" : "Tess");
- +
- +    /* languages[] is an array of strings terminated by a NULL entry. */
- +    languages=TessBaseAPIGetAvailableLanguagesAsVector(api);
- +    if (languages!=NULL)
- +        {
- +        for (l=languages;(*l)!=NULL;l++)
- +            {
- +            n=strlen(istr);
- +            snprintf(&istr[n],sizeof(istr)-n,"%s%s [%s]",
- +                     l==languages?"":", ",*l,engine);
- +            }
- +        TessDeleteTextArray(languages);
- +        }
- +
- +    if (out!=NULL)
- +        fprintf(out,"%s\n",istr);
- +    if (initstr!=NULL && maxlen>0)
- +        snprintf(initstr,(size_t)maxlen,"%s",istr);
- +
- +    /* Turn off LSTM debugging output */
- +    TessBaseAPISetVariable(api,"lstm_debug_level","0");
- +#if (WILLUSDEBUG & 1)
- +    TessBaseAPISetVariable(api,"lstm_debug_level","9");
- +    TessBaseAPISetVariable(api,"paragraph_debug_level","9");
- +    TessBaseAPISetVariable(api,"tessdata_manager_debug_level","9");
- +    TessBaseAPISetVariable(api,"tosp_debug_level","9");
- +    TessBaseAPISetVariable(api,"wordrec_debug_level","9");
- +    TessBaseAPISetVariable(api,"segsearch_debug_level","9");
- +#endif
- +    /* willus mod, 11-24-16 */
- +    setlocale(LC_ALL,original_locale);
- +    return((void *)api);
- +    }
- +
- +
- +/*
- +** OCR pix using page segmentation mode segmode and copy the UTF-8 text to
- +** outstr (at most maxlen-1 characters, always NUL-terminated when maxlen>0).
- +**
- +** Returns 0 on success.  Returns -1, after printing a message to out (if not
- +** NULL), when the page cannot be processed or no text is returned.
- +*/
- +int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out)
- +
- +    {
- +    TessBaseAPI *api=(TessBaseAPI *)vapi;
- +    char *text=NULL;
- +
- +    /*
- +    ** Set the mode on every call.  Remembering the last mode in a static
- +    ** variable skips the update when another handle, or another function
- +    ** using this handle, has changed the mode in the meantime.
- +    */
- +    TessBaseAPISetPageSegMode(api,(TessPageSegMode)segmode);
- +    if (TessBaseAPIProcessPage(api,pix,0,NULL,NULL,0,NULL))
- +        text=TessBaseAPIGetUTF8Text(api);
- +    if (text==NULL)
- +        {
- +        if (out!=NULL)
- +            fprintf(out,"tesscapi: Error during bitmap processing.\n");
- +        TessBaseAPIClear(api);
- +        return(-1);
- +        }
- +    /* snprintf() always terminates the copy; skip it if there is no room. */
- +    if (outstr!=NULL && maxlen>0)
- +        snprintf(outstr,(size_t)maxlen,"%s",text);
- +    TessDeleteText(text);
- +    TessBaseAPIClear(api);
- +    return(0);
- +    }
- +
- +
- +/*
- +** OCR pix in page segmentation mode segmode and return one entry per word.
- +**
- +** On success (return 0), (*nw) is the number of words stored and
- +**   (*left),(*top),(*right),(*bottom) are the word bounding boxes,
- +**   (*ybase) is the mean y of the two end points of each word's baseline
- +**            (the bottom of the box if no baseline is available),
- +**   (*text) holds the UTF-8 words back to back, each followed by '\0'.
- +** (*left) and (*text) are malloc()ed (never NULL on success); the other four
- +** arrays point into the (*left) block.
- +**
- +** On failure (return -1), (*nw) is 0, the output pointers are left untouched,
- +** and a message is printed to out if it is not NULL.
- +*/
- +int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
- +                                int **left,int **top,int **right,int **bottom,
- +                                int **ybase,char **text,int *nw,
- +                                FILE *out)
- +
- +    {
- +    TessBaseAPI *api=(TessBaseAPI *)vapi;
- +    TessResultIterator *res_it=NULL;
- +    TessPageIterator *page_it;
- +    int *x0,*y0,*x1,*y1,*ybaseline;
- +    char *tutf8,*word;
- +    size_t totlen=0,it8=0,len;
- +    int iword=0,nwords=0;
- +
- +    (*nw)=0;
- +    /*
- +    ** Set the mode on every call.  Remembering the last mode in a static
- +    ** variable skips the update when another handle, or another function
- +    ** using this handle, has changed the mode in the meantime.
- +    */
- +    TessBaseAPISetPageSegMode(api,(TessPageSegMode)segmode);
- +    if (TessBaseAPIProcessPage(api,pix,0,NULL,NULL,0,NULL))
- +        res_it=TessBaseAPIGetIterator(api);
- +    if (res_it==NULL)
- +        {
- +        if (out!=NULL)
- +            fprintf(out,"tesscapi: Error during bitmap processing.\n");
- +        TessBaseAPIClear(api);
- +        return(-1);
- +        }
- +    /* Same object viewed as a page iterator, so only res_it gets deleted. */
- +    page_it=TessResultIteratorGetPageIterator(res_it);
- +
- +    /*
- +    ** Pass 1:  step through every word (not just one per block) to count the
- +    ** words and the bytes needed for their text.  A NULL text marks an empty
- +    ** position.  Each returned string is a copy that must be freed.
- +    */
- +    do
- +        {
- +        word=TessResultIteratorGetUTF8Text(res_it,RIL_WORD);
- +        if (word==NULL)
- +            continue;
- +        nwords++;
- +        totlen+=strlen(word)+1;
- +        TessDeleteText(word);
- +        } while (TessResultIteratorNext(res_it,RIL_WORD));
- +
- +    /* Allocate at least one element so both blocks are valid with no words. */
- +    x0=(int *)malloc(sizeof(int)*5*(size_t)(nwords>0 ? nwords : 1));
- +    tutf8=(char *)malloc(totlen>0 ? totlen : 1);
- +    if (x0==NULL || tutf8==NULL)
- +        {
- +        if (out!=NULL)
- +            fprintf(out,"tesscapi: Out of memory.\n");
- +        free(x0);
- +        free(tutf8);
- +        TessResultIteratorDelete(res_it);
- +        TessBaseAPIClear(api);
- +        return(-1);
- +        }
- +    y0=&x0[nwords];
- +    x1=&y0[nwords];
- +    y1=&x1[nwords];
- +    ybaseline=&y1[nwords];
- +
- +    /* Pass 2:  rewind and store each word, never writing past the pass-1 totals. */
- +    TessPageIteratorBegin(page_it);
- +    do
- +        {
- +        int bbleft=0,bbtop=0,bbright=0,bbbottom=0;
- +        int u1=0,v1=0,u2=0,v2=0;
- +
- +        word=TessResultIteratorGetUTF8Text(res_it,RIL_WORD);
- +        if (word==NULL)
- +            continue;
- +        len=strlen(word)+1;
- +        if (iword<nwords && it8+len<=totlen)
- +            {
- +            memcpy(&tutf8[it8],word,len);
- +            it8+=len;
- +            TessPageIteratorBoundingBox(page_it,RIL_WORD,&bbleft,&bbtop,&bbright,&bbbottom);
- +            x0[iword]=bbleft;
- +            x1[iword]=bbright;
- +            y0[iword]=bbtop;
- +            y1[iword]=bbbottom;
- +            /* Without a baseline, fall back to the bottom of the box. */
- +            if (TessPageIteratorBaseline(page_it,RIL_WORD,&u1,&v1,&u2,&v2))
- +                ybaseline[iword]=(v1+v2)/2;
- +            else
- +                ybaseline[iword]=bbbottom;
- +            iword++;
- +            }
- +        TessDeleteText(word);
- +        } while (TessResultIteratorNext(res_it,RIL_WORD));
- +    TessResultIteratorDelete(res_it);
- +
- +    /* Publish the results only now, so a failure never leaves stale pointers. */
- +    (*left)=x0;
- +    (*top)=y0;
- +    (*right)=x1;
- +    (*bottom)=y1;
- +    (*ybase)=ybaseline;
- +    (*text)=tutf8;
- +    (*nw)=iword;
- +
- +    TessBaseAPIClear(api);
- +    return(0);
- +    }
- +
- +
- +/*
- +** Shut down and delete the Tesseract handle passed as a void pointer.
- +** A NULL handle is ignored.
- +*/
- +void tess_capi_end(void *vapi)
- +
- +    {
- +    TessBaseAPI *handle=(TessBaseAPI *)vapi;
- +
- +    if (handle!=NULL)
- +        {
- +        TessBaseAPIEnd(handle);
- +        TessBaseAPIDelete(handle);
- +        }
- +    }
Add Comment
Please, Sign In to add comment