Guest User

Untitled

a guest
Aug 21st, 2019
109
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Diff 8.61 KB | None | 0 0
  1. diff -r -u0 willuslib/ocrtess.c willuslib/ocrtess.c
  2. --- willuslib/ocrtess.c 2018-12-31 19:59:58.000000000 +0100
  3. +++ willuslib/ocrtess.c 2019-07-27 18:47:06.706765733 +0200
  4. @@ -29,0 +30,258 @@
  5. +
  6. +
  7. +/*
  8. +** ocr_type=0:  OEM_DEFAULT
  9. +** ocr_type=1:  OEM_TESSERACT_ONLY
  10. +** ocr_type=2:  OEM_LSTM_ONLY
  11. +** ocr_type=3:  OEM_TESSERACT_LSTM_COMBINED
  12. +*/
  13. +void *tess_capi_init(char *datapath,char *language,int ocr_type,FILE *out,
  14. +                     char *initstr,int maxlen,int *status)
  15. +
  16. +    {
  17. +    char original_locale[256];
  18. +    TessBaseAPI *api = TessBaseAPICreate();
  19. +
  20. +    /* willus mod, 11-24-16 */
  21. +    /* Tesseract needs "C" locale to correctly parse all data .traineddata files. */
  22. +
  23. +    strncpy(original_locale,setlocale(LC_ALL,NULL),255);
  24. +    original_locale[255]='\0';
  25. +    setlocale(LC_ALL,"C");
  26. +
  27. +    // Make the order of args a bit more forgiving than it used to be.
  28. +    const char* lang = "eng";
  29. +    TessPageSegMode pagesegmode = PSM_SINGLE_BLOCK;
  30. +    if (language!=NULL && language[0]!='\0')
  31. +        lang = language;
  32. +
  33. +/*
  34. +v4.00 loads either TESSERACT enginer, LSTM engine, or both.  No CUBE.
  35. +*/
  36. +    ocr_type=0; /* Ignore specified and use default */
  37. +    TessBaseAPISetOutputName(api, NULL);
  38. +    (*status)=TessBaseAPIInit2(api, datapath,lang,
  39. +              ocr_type==0 ? OEM_DEFAULT :
  40. +                (ocr_type==1 ? OEM_TESSERACT_ONLY :
  41. +                   (ocr_type==2 ? OEM_LSTM_ONLY :
  42. +                                  (OEM_TESSERACT_LSTM_COMBINED))));
  43. +    if ((*status)!=0)
  44. +        {
  45. +        /* willus mod, 11-24-16 */
  46. +        setlocale(LC_ALL,original_locale);
  47. +        TessBaseAPIEnd(api);
  48. +        TessBaseAPIDelete(api);
  49. +        return(NULL);
  50. +        }
  51. +    /*
  52. +    api.Init("tesscapi",lang,tesseract::OEM_DEFAULT,
  53. +           &(argv[arg]), argc - arg, NULL, NULL, false);
  54. +    */
  55. +    // We have 2 possible sources of pagesegmode: a config file and
  56. +    // the command line. For backwards compatability reasons, the
  57. +    // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
  58. +    // default for this program is tesseract::PSM_AUTO. We will let
  59. +    // the config file take priority, so the command-line default
  60. +    // can take priority over the tesseract default, so we use the
  61. +    // value from the command line only if the retrieved mode
  62. +    // is still tesseract::PSM_SINGLE_BLOCK, indicating no change
  63. +    // in any config file. Therefore the only way to force
  64. +    // tesseract::PSM_SINGLE_BLOCK is from the command line.
  65. +    // It would be simpler if we could set the value before Init,
  66. +    // but that doesn't work.
  67. +    if (TessBaseAPIGetPageSegMode(api) == PSM_SINGLE_BLOCK)
  68. +        TessBaseAPISetPageSegMode(api, pagesegmode);
  69. +
  70. +    /*
  71. +    ** Initialization message
  72. +    */
  73. +    {
  74. +    char istr[1024];
  75. +
  76. +// printf("tessedit_ocr_engine_mode = %d\n",tessedit_ocr_engine_mode);
  77. +    sprintf(istr,"%s",TessVersion());
  78. +    sprintf(&istr[strlen(istr)],"\n    Tesseract data folder = '%s'",datapath==NULL?getenv("TESSDATA_PREFIX"):datapath);
  79. +    strcat(istr,"\n    Tesseract languages: ");
  80. +    char** languages = TessBaseAPIGetAvailableLanguagesAsVector(api);
  81. +/*
  82. +printf("OEM=%d\n",api->oem());
  83. +printf("Langs='%s'\n",api->GetInitLanguagesAsString());
  84. +printf("AnyTessLang()=%d\n",(int)api->tesseract()->AnyTessLang());
  85. +printf("AnyLSTMLang()=%d\n",(int)api->tesseract()->AnyLSTMLang());
  86. +printf("num_sub_langs()=%d\n",api->tesseract()->num_sub_langs());
  87. +printf("languages.size()=%d\n",(int)languages.size());
  88. +*/
  89. +    char* l = languages;
  90. +    int eng = 0;
  91. +    TessBaseAPI *lang1;
  92. +    while (l != NULL)
  93. +        {
  94. +            eng=(int)TessBaseAPIOem(api);
  95. +            sprintf(&istr[strlen(istr)],"%s%s [%s]",l==languages?"":", ",l,
  96. +                    eng==2?"LSTM+Tess":(eng==1?"LSTM":"Tess"));
  97. +            l++;
  98. +        }
  99. +    
  100. +    TessDeleteTextArray(languages);
  101. +
  102. +    /*
  103. +    if (ocr_type==0 || ocr_type==3)
  104. +        sprintf(&istr[strlen(istr)],"[LSTM+] (lang=");
  105. +    else if (ocr_type==2)
  106. +        sprintf(&istr[strlen(istr)],"[LSTM] (lang=");
  107. +    strncpy(&istr[strlen(istr)],language,253-strlen(istr));
  108. +    istr[253]='\0';
  109. +    strcat(istr,")");
  110. +    */
  111. +    if (out!=NULL)
  112. +        fprintf(out,"%s\n",istr);
  113. +    if (initstr!=NULL)
  114. +        {
  115. +        strncpy(initstr,istr,maxlen-1);
  116. +        initstr[maxlen-1]='\0';
  117. +        }
  118. +    }
  119. +
  120. +
  121. +    /* Turn off LSTM debugging output */
  122. +    TessBaseAPISetVariable(api,"lstm_debug_level","0");
  123. +#if (WILLUSDEBUG & 1)
  124. +    TessBaseAPISetVariable(api,"lstm_debug_level","9");
  125. +    TessBaseAPISetVariable(api,"paragraph_debug_level","9");
  126. +    TessBaseAPISetVariable(api,"tessdata_manager_debug_level","9");
  127. +    TessBaseAPISetVariable(api,"tosp_debug_level","9");
  128. +    TessBaseAPISetVariable(api,"wordrec_debug_level","9");
  129. +    TessBaseAPISetVariable(api,"segsearch_debug_level","9");
  130. +#endif
  131. +    /* willus mod, 11-24-16 */
  132. +    setlocale(LC_ALL,original_locale);
  133. +    return((void *)api);
  134. +    }
  135. +
  136. +
  137. +int tess_capi_get_ocr(void *vapi,PIX *pix,char *outstr,int maxlen,int segmode,FILE *out)
  138. +
  139. +    {
  140. +    TessBaseAPI *api;
  141. +    static int old_segmode=-1;
  142. +
  143. +    api=(TessBaseAPI *)vapi;
  144. +    if (old_segmode != segmode)
  145. +        {
  146. +        old_segmode=segmode;
  147. +        TessBaseAPISetPageSegMode(api, (TessPageSegMode)segmode);
  148. +        }
  149. +    if (!TessBaseAPIProcessPage(api,pix,0,NULL,NULL,0,NULL))
  150. +        {
  151. +        /* pixDestroy(&pix); */
  152. +        if (out!=NULL)
  153. +            fprintf(out,"tesscapi:  Error during bitmap processing.\n");
  154. +        TessBaseAPIClear(api);
  155. +        return(-1);
  156. +        }
  157. +    char* text = TessBaseAPIGetUTF8Text(api);
  158. +    strncpy(outstr,text,maxlen-1);
  159. +    outstr[maxlen-1]='\0';
  160. +    TessDeleteText(text);
  161. +    TessBaseAPIClear(api);
  162. +    return(0);
  163. +    }
  164. +
  165. +
  166. +int tess_capi_get_ocr_multiword(void *vapi,PIX *pix,int segmode,
  167. +                                int **left,int **top,int **right,int **bottom,
  168. +                                int **ybase,char **text,int *nw,
  169. +                                FILE *out)
  170. +
  171. +    {
  172. +    TessBaseAPI *api;
  173. +    static int old_segmode=-1;
  174. +
  175. +    api=(TessBaseAPI *)vapi;
  176. +    if (old_segmode != segmode)
  177. +        {
  178. +        old_segmode=segmode;
  179. +        TessBaseAPISetPageSegMode(api, (TessPageSegMode)segmode);
  180. +        }
  181. +    if (!TessBaseAPIProcessPage(api,pix,0,NULL,NULL,0,NULL))
  182. +        {
  183. +        if (out!=NULL)
  184. +            fprintf(out,"tesscapi:  Error during bitmap processing.\n");
  185. +        TessBaseAPIClear(api);
  186. +        (*nw)=0;
  187. +        return(-1);
  188. +        }
  189. +    
  190. +    //(*nw)=api->GetOCRWords(left,top,right,bottom,ybase,text);
  191. +    
  192. +    int iword,nwords,totlen,it8;
  193. +    int *x0,*y0,*x1,*y1,*ybaseline;
  194. +    char *tutf8;
  195. +
  196. +    TessResultIterator *res_it = TessBaseAPIGetIterator(api);
  197. +    /* Count words */
  198. +    iword=0;
  199. +    totlen=0;
  200. +    while(TessResultIteratorNext(res_it, RIL_BLOCK))
  201. +        {
  202. +        if(!TessResultIteratorNext(res_it, RIL_WORD))
  203. +            {
  204. +            continue;
  205. +            }
  206. +        iword++;
  207. +        char* textstr = TessResultIteratorGetUTF8Text(res_it, RIL_WORD);
  208. +        totlen+=strlen(textstr)+1;
  209. +        }
  210. +    nwords = iword;
  211. +    
  212. +    x0=(*left)=(int *)malloc(sizeof(int)*5*nwords);
  213. +    y0=(*top)=&x0[nwords];
  214. +    x1=(*right)=&y0[nwords];
  215. +    y1=(*bottom)=&x1[nwords];
  216. +    ybaseline=(*ybase)=&y1[nwords];
  217. +    tutf8=(*text)=(char *)malloc(totlen);
  218. +    iword=0;
  219. +    it8=0;
  220. +    TessPageIteratorBegin( (TessPageIterator *) res_it);
  221. +    while (TessResultIteratorNext(res_it, RIL_BLOCK))
  222. +        {
  223. +        if (!TessResultIteratorNext(res_it, RIL_WORD))
  224. +            {
  225. +            continue;
  226. +            }
  227. +        char* textstr = TessResultIteratorGetUTF8Text(res_it, RIL_WORD);
  228. +        strcpy(&tutf8[it8],textstr);
  229. +        it8 += strlen(&tutf8[it8])+1;
  230. +        
  231. +        int bbleft, bbtop, bbright, bbbottom;
  232. +        int u1,v1,u2,v2;
  233. +        TessPageIteratorBoundingBox( (TessPageIterator *) res_it, RIL_WORD, &bbleft, &bbtop, &bbright, &bbbottom);
  234. +        TessPageIteratorBaseline( (TessPageIterator *) res_it, RIL_WORD, &u1, &v1, &u2, &v2);
  235. +        x0[iword]=bbleft;
  236. +        x1[iword]=bbright;
  237. +        y0[iword]=bbtop;
  238. +        y1[iword]=bbbottom;
  239. +        ybaseline[iword]=(v1+v2)/2;
  240. +        iword++;
  241. +        }
  242. +        
  243. +    TessResultIteratorDelete(res_it);
  244. +    
  245. +    (*nw) = iword;
  246. +    
  247. +    TessBaseAPIClear(api);
  248. +    return(0);
  249. +    }
  250. +
  251. +
  252. +void tess_capi_end(void *vapi)
  253. +
  254. +    {
  255. +    TessBaseAPI *api;
  256. +
  257. +    if (vapi==NULL)
  258. +        return;
  259. +    api=(TessBaseAPI *)vapi;
  260. +    TessBaseAPIEnd(api);
  261. +    TessBaseAPIDelete(api);
  262. +    }
Add Comment
Please, Sign In to add comment