michciu

pcre

Dec 13th, 2019
247
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 12.14 KB | None | 0 0
/*************************************************
*           PCRE DEMONSTRATION PROGRAM           *
*************************************************/

/* This is a demonstration program to illustrate the most straightforward ways
of calling the PCRE regular expression library from a C program. See the
pcresample documentation for a short discussion ("man pcresample" if you have
the PCRE man pages installed).

In Unix-like environments, if PCRE is installed in your standard system
libraries, you should be able to compile this program using this command:

gcc -Wall pcredemo.c -lpcre -o pcredemo

If PCRE is not installed in a standard place, it is likely to be installed with
support for the pkg-config mechanism. If you have pkg-config, you can compile
this program using this command:

gcc -Wall pcredemo.c `pkg-config --cflags --libs libpcre` -o pcredemo

If you do not have pkg-config, you may have to use this:

gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
  -R/usr/local/lib -lpcre -o pcredemo

Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
library files for PCRE are installed on your system. Only some operating
systems (e.g. Solaris) use the -R option.

Building under Windows:

If you want to statically link this program against a non-dll .a file, you must
define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and
pcre_free() exported functions will be declared __declspec(dllimport), with
unwanted results. So in this environment, keep the following line enabled. */
  36.  
#define PCRE_STATIC

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pcre.h>
  42.  
  43. #define OVECCOUNT 30    /* should be a multiple of 3 */
  44.  
  45. FILE *inputFile;
  46. FILE *outputFile;
  47. char PLIK_TXT[] = "/home/rocket/CLionProjects/clear/sample2.tex";
  48. char patternPreserveTextInsideBrackets[] = "(.*)\\\\(?>title|author|date|enumsentence|section|subsection|emph).?\\{(.*)\\}(.*)";
  49. char patternDeleteAllOccurences[] = "";
  50. char paternDeletWholeLine[] = "^(?>\\s*)\\\\(?>documentclass|usepackage|begin|maketitle|begin|end|includegraphics|nodeconnect).*$";
  51.  
  52. char *returnLine;
  53.  
  54. char *checkLineWithRegex(char *pattern, char *subject);
  55.  
  56. void odczyt_pliku_tekstowego(void);
  57.  
  58. int main() {
  59.     returnLine = malloc(sizeof(char) * 255);
  60.  
  61.     odczyt_pliku_tekstowego();
  62.  
  63.     free(returnLine);
  64.     return 0;
  65. }
  66.  
  67. char *checkLineWithRegex(char *pattern, char *subject) {
  68.     pcre *re;
  69.     const char *error;
  70.     int erroffset;
  71.     int ovector[OVECCOUNT];
  72.     int subject_length;
  73.     int rc, i;
  74.  
  75.     subject_length = (int) strlen(subject);
  76.  
  77.  
  78. /*************************************************************************
  79. * Now we are going to compile the regular expression pattern, and handle *
  80. * and errors that are detected.                                          *
  81. *************************************************************************/
  82.  
  83.     re = pcre_compile(
  84.             pattern,              /* the pattern */
  85.             0,                    /* default options */
  86.             &error,               /* for error message */
  87.             &erroffset,           /* for error offset */
  88.             NULL);                /* use default character tables */
  89.  
  90. /* Compilation failed: print the error message and exit */
  91.  
  92.     if (re == NULL) {
  93.         printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
  94.         return "";
  95.     }
  96.  
  97.  
  98. /*************************************************************************
  99. * If the compilation succeeded, we call PCRE again, in order to do a     *
  100. * pattern match against the subject string. This does just ONE match. If *
  101. * further matching is needed, it will be done below.                     *
  102. *************************************************************************/
  103.  
  104.     rc = pcre_exec(
  105.             re,                   /* the compiled pattern */
  106.             NULL,                 /* no extra data - we didn't study the pattern */
  107.             subject,              /* the subject string */
  108.             subject_length,       /* the length of the subject */
  109.             0,                    /* start at offset 0 in the subject */
  110.             0,                    /* default options */
  111.             ovector,              /* output vector for substring information */
  112.             OVECCOUNT);           /* number of elements in the output vector */
  113.  
  114. /* Matching failed: handle error cases */
  115.  
  116.     if (rc < 0) {
  117.         switch (rc) {
  118.             case PCRE_ERROR_NOMATCH:
  119.                 printf("No match\n");
  120.  
  121.                 return strcpy(returnLine, subject);
  122. //                break;
  123.                 /*
  124.                 Handle other special cases if you like
  125.                 */
  126.             default:
  127.                 printf("Matching error %d\n", rc);
  128.                 break;
  129.         }
  130.         pcre_free(re);     /* Release memory used for the compiled pattern */
  131.         return "";
  132.     }
  133.  
  134. /* Match succeded */
  135.  
  136. //    printf("\nMatch succeeded at offset %d\n", ovector[0]);
  137.  
  138.  
  139. /*************************************************************************
  140. * We have found the first match within the subject string. If the output *
  141. * vector wasn't big enough, say so. Then output any substrings that were *
  142. * captured.                                                              *
  143. *************************************************************************/
  144.  
  145. /* The output vector wasn't big enough */
  146.  
  147.     if (rc == 0) {
  148.         rc = OVECCOUNT / 3;
  149.         printf("ovector only has room for %d captured substrings\n", rc - 1);
  150.     }
  151.  
  152. /* Show substrings stored in the output vector by number. Obviously, in a real
  153. application you might want to do things other than print them. */
  154. //    if (rc >= 2) {
  155.         for (i = 0; i < rc; i++) {
  156.             char *substring_start = subject + ovector[2 * i];
  157.             int substring_length = ovector[2 * i + 1] - ovector[2 * i];
  158.             printf("%2d: %.*s\n", i, substring_length, substring_start);
  159.         }
  160. //    }
  161.     if (rc >= 2) {
  162.             char *substring_start = subject + ovector[2 * 2];
  163.             return strcpy(returnLine, substring_start);
  164.  
  165.     }
  166.  
  167.     // wyciety kod
  168.     unsigned int option_bits;
  169.     int crlf_is_newline;
  170.     int utf8;
  171.     /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
  172. sequence. First, find the options with which the regex was compiled; extract
  173. the UTF-8 state, and mask off all but the newline options. */
  174.  
  175.     (void)
  176.             pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits
  177.             );
  178.     utf8 = option_bits & PCRE_UTF8;
  179.     option_bits &= PCRE_NEWLINE_CR | PCRE_NEWLINE_LF | PCRE_NEWLINE_CRLF |
  180.                    PCRE_NEWLINE_ANY |
  181.                    PCRE_NEWLINE_ANYCRLF;
  182.  
  183.     crlf_is_newline =
  184.             option_bits == PCRE_NEWLINE_ANY ||
  185.             option_bits == PCRE_NEWLINE_CRLF ||
  186.             option_bits == PCRE_NEWLINE_ANYCRLF;
  187.     //kilka loopow
  188.     for (;;) {
  189.         int options = 0;                 /* Normally no options */
  190.         int start_offset = ovector[1];   /* Start at end of previous match */
  191.  
  192. /* If the previous match was for an empty string, we are finished if we are
  193. at the end of the subject. Otherwise, arrange to run another match at the
  194. same point to see if a non-empty match can be found. */
  195.  
  196.         if (ovector[0] == ovector[1]) {
  197.             if (ovector[0] == subject_length) break;
  198.             options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
  199.         }
  200.  
  201. /* Run the next matching operation */
  202.  
  203.         rc = pcre_exec(
  204.                 re,                   /* the compiled pattern */
  205.                 NULL,                 /* no extra data - we didn't study the pattern */
  206.                 subject,              /* the subject string */
  207.                 subject_length,       /* the length of the subject */
  208.                 start_offset,         /* starting offset in the subject */
  209.                 options,              /* options */
  210.                 ovector,              /* output vector for substring information */
  211.                 OVECCOUNT);           /* number of elements in the output vector */
  212.  
  213. /* This time, a result of NOMATCH isn't an error. If the value in "options"
  214. is zero, it just means we have found all possible matches, so the loop ends.
  215. Otherwise, it means we have failed to find a non-empty-string match at a
  216. point where there was a previous empty-string match. In this case, we do what
  217. Perl does: advance the matching position by one character, and continue. We
  218. do this by setting the "end of previous match" offset, because that is picked
  219. up at the top of the loop as the point at which to start again.
  220.  
  221. There are two complications: (a) When CRLF is a valid newline sequence, and
  222. the current position is just before it, advance by an extra byte. (b)
  223. Otherwise we must ensure that we skip an entire UTF-8 character if we are in
  224. UTF-8 mode. */
  225.  
  226.         if (rc == PCRE_ERROR_NOMATCH) {
  227.             if (options == 0) break;                    /* All matches found */
  228.             ovector[1] = start_offset + 1;              /* Advance one byte */
  229.             if (
  230.                     crlf_is_newline &&                      /* If CRLF is newline & */
  231.                     start_offset < subject_length
  232.                                    - 1 &&    /* we are at CRLF, */
  233.                     subject[start_offset] == '\r' &&
  234.                     subject[start_offset + 1] == '\n')
  235.                 ovector[1] += 1;                          /* Advance by one more. */
  236.             else if (utf8)                              /* Otherwise, ensure we */
  237.             {                                         /* advance a whole UTF-8 */
  238.                 while (ovector[1] < subject_length)       /* character. */
  239.                 {
  240.                     if ((subject[ovector[1]] & 0xc0) != 0x80) break;
  241.                     ovector[1] += 1;
  242.                 }
  243.             }
  244.             continue;    /* Go round the loop again */
  245.         }
  246.  
  247. /* Other matching errors are not recoverable. */
  248.  
  249.         if (rc < 0) {
  250.             printf("Matching error %d\n", rc);
  251.             pcre_free(re);    /* Release memory used for the compiled pattern */
  252.             return "";
  253.         }
  254.  
  255. /* Match succeded */
  256.  
  257.         printf("\nMatch succeeded again at offset %d\n", ovector[0]);
  258.  
  259. /* The match succeeded, but the output vector wasn't big enough. */
  260.  
  261.         if (rc == 0) {
  262.             rc = OVECCOUNT / 3;
  263.             printf("ovector only has room for %d captured substrings\n", rc - 1);
  264.         }
  265.  
  266. /* As before, show substrings stored in the output vector by number, and then
  267. also any named substrings. */
  268.  
  269.         for (i = 0; i < rc; i++) {
  270.             char *substring_start = subject + ovector[2 * i];
  271.             int substring_length = ovector[2 * i + 1] - ovector[2 * i];
  272.             printf("%2d: %.*s\n", i, substring_length, substring_start);
  273.         }
  274.  
  275.         printf("\n");
  276.         pcre_free(re);       /* Release memory used for the compiled pattern */
  277.         return 0;
  278.     }
  279. }
  280.  
  281.  
  282. void odczyt_pliku_tekstowego(void) {
  283.  
  284.     char buffer[255];
  285.  
  286.     char *temp = malloc(sizeof(char) * 255);
  287.     char outName[60];
  288.  
  289.  
  290.     if ((inputFile = fopen(PLIK_TXT, "r")) !=
  291.         NULL)    // Probuje otworzyc plik w trybie tekstowym do odczytu - ta operacja moze sie nie udac. Kiedy?
  292.     {
  293.         fprintf(stdout, "Otwarto plik %s w trybie odczytu tekstowego.\n",
  294.                 PLIK_TXT);  // W taki sposob rowniez mozna wyswietlac informacje w konsoli
  295.  
  296.         strcpy(outName, PLIK_TXT);
  297.         outputFile = fopen(strcat(outName, "_cleaned.txt"), "w");
  298.         printf("Utworzono plik tekstowy %s\n", outName);
  299.  
  300.         while (fgets(buffer, 255, (FILE *) inputFile)) {
  301.             strcpy(temp, buffer);
  302. //            checkLineWithRegex(paternDeletWholeLine, temp);
  303.             strcpy(returnLine, checkLineWithRegex(patternPreserveTextInsideBrackets, temp));
  304. //            strcpy(returnLine, checkLineWithRegex(patternDeleteAllOccurences, returnLine));
  305.  
  306.             fputs(returnLine, outputFile);
  307.             printf("%s\n", returnLine);
  308.         }
  309.  
  310.  
  311.         fclose(outputFile);
  312.         free(temp);
  313.         fclose(inputFile);
  314.     } else {
  315.         printf("Nie moge odczytac pliku %s!\n", PLIK_TXT);
  316.     }
  317. }
Add Comment
Please, Sign In to add comment