Advertisement
Guest User

Untitled

a guest
Jul 20th, 2018
75
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 15.16 KB | None | 0 0
  1. #pragma once
  2.  
  3. // This header contains functionality for writing recursive descent parsers.
  4. // If you're not familiar with the concept, recursive descent uses mutually
  5. // recursive functions to mimic the structure of a grammar and parse it out.
  6. //
  7. // A good, simple example of this is in json.h  This is a full featured json
  8. // parser (thanks largely to JSONs very simple grammar).  Recursive descent
  9. // parsers parse "top down" (starting with the most general grammatical element).
  10. // So the parsing begins with parse_json_value which calls sub-parsing functions
  11. // for each grammatical element (number, string, array, etc).
  12. //
  13. // This particular implementation is tokenless, meaning that there is no
  14. // separate tokenization phase that produces tokens for the parser to consume.
  15. // The parser matches against the input text directly.
  16. //
  17. // The primary type provided is `parser`.  It's constructed with a string
  18. // buffer (either std::string or const char*) and provides convenience functions
  19. // for parsing data.  It wraps the provided text buffer in a `stream` object
  20. // that provides a wrapper around it to advance the stream and manage seeking
  21. // to handle eg: backtracking.  Stream is designed so that the stream can be
  22. // moved through by only advancing an offset structure.  No string copying is
  23. // necessary, so it's quite fast.
  24. //
  25. // Basic usage:
  26. //   1) inherit from parser (you can inherit constructors if you like):
  27. //         struct json_parser : parser {
  28. //             using parser::parser;
  29. //
  30. //
  31. //   2) implement parsing functions
  32. //
  33. //            // run the parser
  34. //             maybe<json_value> parse() {
  35. //                 return parse_json_value();
  36. //             }
  37. //
  38. //         private:
  39. //             maybe<json_value> parse_json_value() {
  40. //                 // consume any whitespace
  41. //                 whitespace();
  42. //
  43. //                 // peak ahead to see if we have an object to parse
  44. //                 // luckily JSON is unambiguous here.
  45. //                 switch (*stream) {
  46. //                     case '[': return parse_json_array (); break;
  47. //                     case '{': return parse_json_object(); break;
  48. //                     case '"': return parse_json_string(); break;
  49. //                     case 't': if (literal("true"))  return json_bool(true);  break;
  50. //                     case 'f': if (literal("false")) return json_bool(false); break;
  51. //                     case 'n': if (literal("null"))  return json_value();     break;
  52. //                     default:  break;
  53. //                 }
  54. //
  55. //                 // if not any of the above, try to parse a number
  56. //                 return parse_json_number();
  57. //         }
  58. //
  59. //
  60. // The end result of parsing should be some sort of object built from the parsed code.
  61. // In general, you may access the underlying data stream via the stream member, and it
  62. // is acted upon implicitly when using the helpers.
  63. //
  64. // Recursive descent parsers are implemented as recursive functions as can be seen above.
  65. // The parser generally ends up matching the underlying grammar quite closely.  In a
  66. // language like C++ you don't want to recurse too deeply, so iteration is often a better
  67. // alternative when eg: parsing a list of values.
  68. //
  69. // It's recommended to write parsing functions so that if they fail to parse, there is no
  70. // change to the underlying stream.  This can be done using the cursor() and seek() functions
  71. // of stream, which will return and set the current stream position, respectively.  Recursive
  72. // descent parsers can backtrack arbitrarily, so it's easier to reason about functions that
  73. // are "all or nothing", eg they don't partially consume input when they fail.
  74. //
  75. // Helpers
  76. //   Several "micro-parser" helpers are provided for common operations.
  77. //
  78. // whitespace - matches and consumes any whitespace
  79. // literal    - matches a literal character or string
  80. // token      - matches a literal character or string, ignoring whitespace
  81. // to_eol     - matches data to the next newline character (or end of input)
  82. //
  83. // Errors
  84. //   In general, recursive descent parsers can backtrack arbitrarily, so it's possible to have
  85. // many errors.  Rather than trying to guess at how to handle them, the parser class simple
  86. // provides a push_error() and pop_error() function that push a new error message on a stack
  87. // with stream position information.
  88. //
  89. //   Parsers can then choose how to pop errors and when to print error information.  The
  90. // last_error() and print_last_error() functions can help with this.
  91. //
  92.  
  93. // local
  94. #include <maybe.h>
  95. #include <verify.h>
  96. #include <strutil.h>
  97. #include <terminal.h>
  98.  
  99. // c++
  100. #include <memory>
  101. #include <string>
  102. #include <vector>
  103. #include <cassert>
  104.  
  105. // c
  106. #include <string.h>
  107.  
  108. using std::string;
  109. using std::vector;
  110. using std::shared_ptr;
  111.  
  112.  
  113. // main class to inherit from to build a parser
  114. struct parser {
  115.     // an index into the stream, holds linear offset and row/column information
  116.     struct index {
  117.         ssize_t off=0;
  118.         ssize_t row=0;
  119.         ssize_t col=0;
  120.     };
  121.  
  122.     // an error is a message and a position in the stream
  123.     struct error {
  124.         index  idx;
  125.         string msg;
  126.     };
  127.  
  128.     // a match in the underlying stream
  129.     struct match {
  130.         match(const char* str, ssize_t len, index pos)
  131.             : str(str), len(len), pos(pos) {}
  132.        
  133.         const char* str = nullptr;
  134.         ssize_t     len = 0;
  135.         index       pos;
  136.  
  137.         operator string() const {
  138.             return string(str, len);
  139.         }
  140.     };
  141.    
  142.     // stream wraps up a char array and lets us access it nicely
  143.     // it handles the bookkeeping related to where we are in the stream
  144.     // and consuming characters/generating match objects.
  145.     //
  146.     struct stream {
  147.         // create stream from c-string
  148.         stream() =default;
  149.         stream(const char* str, ssize_t len=-1)
  150.             : str_(str), len_(len >= 0 ? len : strlen(str)), end_(str_ + len_) {}
  151.  
  152.         // index stream, get character at given offset from current position
  153.         const char& operator[](ssize_t off) const {
  154.             return str_[cursor_.off + off];
  155.         }
  156.        
  157.         // de-reference stream to get c-string containing rest of buffer
  158.         const char& operator*() const {
  159.             return (*this)[0];
  160.         }
  161.  
  162.         // iterators
  163.         const char* begin() const  { return &str_[cursor_.off];      } // iterator to start of stream
  164.         const char* end()   const  { return end_;                    } // iterator to end of stream
  165.         const char* raw()   const  { return str_;                    } // raw pointer to stream
  166.  
  167.         // accessors
  168.         ssize_t remaining() const  { return end()-begin();           } // how much of stream is left
  169.         index   cursor()    const  { return cursor_;                 } // return current position in stream
  170.         stream& seek(index cursor) { cursor_ = cursor; return *this; } // set cursor position
  171.        
  172.         // return string containing num characters from string.
  173.         //
  174.         // @param nchar  number of characters to slice (if < 0, rest of stream)
  175.         //
  176.         string slice(ssize_t nchar=0) const {
  177.             return string(begin(), (nchar > 0) ? begin() + nchar : end());
  178.         }
  179.        
  180.         // return rest of stream and advance to end
  181.         match rest() {
  182.             return matched(end() - begin());
  183.         }
  184.  
  185.         // return a match from the current position and advance stream
  186.         match matched(ssize_t len) {
  187.             match ret(begin(), len, cursor());
  188.             advance(len);
  189.             return ret;
  190.         }
  191.  
  192.         // advance current stream a given number of characters
  193.         stream& advance(ssize_t num=1) {
  194.             // don't move backwards
  195.             if (num < 0) {
  196.                 return *this;
  197.             }
  198.            
  199.             // copy current stream state
  200.             return munch(num);
  201.         }
  202.  
  203.     private:
  204.         // advance internal pointers by a given number of characters, handle
  205.         // counting rows and columns as we go.
  206.         stream& munch(ssize_t num) {
  207.             for (ssize_t ii=0; ii < num; ii++) {
  208.                 if ((str_ + cursor_.off) < end_) {
  209.                     cursor_.col++;
  210.                     if (str_[cursor_.off++] == '\n') {
  211.                         cursor_.row++;
  212.                         cursor_.col = 0;
  213.                     }
  214.                 }
  215.             }
  216.  
  217.             return *this;
  218.         }
  219.        
  220.         index cursor_;    // cursor specifying current position in buffer
  221.         const char* str_; // underlying buffer
  222.         ssize_t     len_; // length of buffer
  223.         const char* end_; // pointer to end of buffer + 1
  224.     };
  225.  
  226.     // create parser from c-string
  227.     parser(const char* text)
  228.         : stream(text) {}
  229.  
  230.     // create parser from std::string
  231.     parser(const string &text)
  232.         : stream(text.c_str(), text.size()) {}
  233.    
  234.     // return last error if we have one
  235.     maybe<error> last_error() {
  236.         return pop_error();
  237.     }
  238.  
  239.     // pop last error off stack and print it
  240.     void print_last_error() {
  241.         maybe<error> err = last_error();
  242.         if (err) {
  243.             print_error(err.value().idx, err.value().msg);
  244.         }
  245.     }
  246.  
  247. protected:
  248.     parser::stream stream;
  249.  
  250.     // push a new error message on the error stack at the current position
  251.     void push_error(string msg) {
  252.         errors_.emplace_back(
  253.             error {stream.cursor(), msg}
  254.         );
  255.     }
  256.  
  257.    
  258.     // pop last error off stack
  259.     maybe<error> pop_error() {
  260.         if (errors_.size() == 0) {
  261.             return false;
  262.         } else {
  263.             error err = std::move(errors_.back());
  264.             errors_.pop_back();
  265.             return err;
  266.         }
  267.     }
  268.    
  269.     // print the source code around the given index and print an error message
  270.     // between lines.  Be careful to preserve whitespace in the input text.
  271.     //
  272.     void print_error(index idx, string msg) const {
  273.         vector<string> lines = split(stream.raw(), "\n");
  274.  
  275.         // print source up to error row
  276.         for (ssize_t ii=0; ii <= idx.row; ii++) {
  277.             fprintf(stderr, "%s\n", lines[ii].c_str());
  278.         }
  279.  
  280.         // print spacing to match line error is on
  281.         for (ssize_t ii=0; ii < std::min(idx.col, (ssize_t)lines[idx.row].size()); ii++) {
  282.             char cc = lines[idx.row][ii];
  283.             if (isspace(cc)) {
  284.                 fputc(cc,  stderr);
  285.             } else {
  286.                 fputc(' ', stderr);
  287.             }
  288.         }
  289.  
  290.         // print error message
  291.         if (isatty(STDERR_FILENO)) {
  292.             fprintf(stderr, "%s\n", (ansi_seq(TERM_FG_RED) + "^" + ansi_seq(TERM_RESET) + " - " + msg).c_str());
  293.         } else {
  294.             fprintf(stderr, "%s\n", ("^ - " + msg).c_str());
  295.         }
  296.  
  297.         // print a few lines after for context
  298.         for (ssize_t ii=idx.row+1; ii < std::min(idx.row+4, (ssize_t)lines.size()); ii++) {
  299.             fprintf(stderr, "%s\n", lines[ii].c_str());
  300.         }
  301.     }
  302.    
  303.  
  304.     // match given string exactly
  305.     maybe<match> literal(string lit) {
  306.         if (strncmp(lit.c_str(), &stream[0], lit.size()) == 0) {
  307.             return stream.matched(lit.size());
  308.         }
  309.         return false;
  310.     }
  311.  
  312.     // match single char exactly
  313.     maybe<match> literal(char lit) {
  314.         if (*stream == lit) {
  315.             return stream.matched(1);
  316.         }
  317.         return false;
  318.     }
  319.  
  320.     // match whitespace
  321.     maybe<match> whitespace() {
  322.         // match whitespace characters
  323.         const char* ptr = stream.begin();
  324.         while (*ptr && isspace(*ptr)) {
  325.             ptr++;
  326.         }
  327.  
  328.         int nmatch = ptr - stream.begin();
  329.         if (nmatch > 0) {
  330.             return stream.matched(nmatch);
  331.         }
  332.         return false;
  333.     }
  334.  
  335.     // match a literal string, ignoring leading whitespace
  336.     maybe<match> token(string token) {
  337.         index cursor = stream.cursor();
  338.         {
  339.             whitespace();
  340.             maybe<match> mm = literal(token);
  341.             if (!mm) {
  342.                 stream.seek(cursor);
  343.                 return false;
  344.             }
  345.             return mm;
  346.         }
  347.     }
  348.  
  349.     // match a single char, ignoring leading whitespace
  350.     maybe<match> token(char token) {
  351.         index cursor = stream.cursor();
  352.         {
  353.             whitespace();
  354.             maybe<match> mm = literal(token);
  355.             if (!mm) {
  356.                 stream.seek(cursor);
  357.                 return false;
  358.             }
  359.             return mm;
  360.         }
  361.     }
  362.    
  363.     // match to end of line, or end of input, whichever comes first,
  364.     // consumes the newline at the end of the line.  If no input was
  365.     // between current position and newline, return false.
  366.     //
  367.     maybe<match> to_eol() {
  368.         const char* beg = stream.begin();
  369.         const char* ptr = beg;
  370.  
  371.         // seek to newline
  372.         while (*ptr && *ptr != '\n') {
  373.             ptr++;
  374.         }
  375.  
  376.         // consume newline if we have one
  377.         ssize_t sub=0;
  378.         if (*ptr) {
  379.             ptr++;
  380.             sub++;
  381.         }
  382.        
  383.         int nmatch = (ptr-beg)-sub;
  384.         if (nmatch > 0) {
  385.             return stream.matched(nmatch);
  386.         }
  387.         return false;
  388.     }
  389.  
  390. private:
  391.     vector<error> errors_;
  392. };
  393.  
  394.  
  395. // // regexp matches the start of the stream against a regular expression
  396. // // this is a functor so we can create/optimize the regex once instead
  397. // // of each call.
  398. // //
  399. // struct regexp {
  400. //     using pcre2_string = const unsigned char*;
  401.        
  402. //     // constructor
  403. //     regexp(string exp)
  404. //         : exp_(exp) {
  405. //         compile();
  406. //     }
  407.  
  408. //     // execute parser
  409. //     parse_match operator()(parse_stream& stream) const {
  410. //         if (pcre2_jit_match(regex_.get(), (pcre2_string)&stream, stream.left(), 0, 0, match_.get(), NULL) > 0) {
  411. //             size_t *ovector = pcre2_get_ovector_pointer(match_.get());
  412. //             return stream.match(ovector[1]-ovector[0]);
  413. //         }
  414. //         return parse_match::no_match();
  415. //     }
  416.  
  417. // private:
  418. //     void compile() {
  419. //         // compile regexp
  420. //         int    error_num;
  421. //         size_t error_off;
  422. //         pcre2_code *regex = pcre2_compile(
  423. //             (pcre2_string)exp_.c_str(),
  424. //             PCRE2_ZERO_TERMINATED | PCRE2_DOTALL,
  425. //             0, &error_num, &error_off, NULL
  426. //         );
  427. //         assert(regex != NULL && "regex compilation failed");
  428. //         assert(pcre2_jit_compile(regex, PCRE2_JIT_COMPLETE) == 0 && "regex JIT compilation failed");
  429.            
  430. //         // use the pattern itself to generate match storage
  431. //         pcre2_match_data *match = pcre2_match_data_create_from_pattern(regex, NULL);
  432.  
  433. //         // save into shared pointers with deleters
  434. //         regex_ = shared_ptr<pcre2_code>      (regex, [](pcre2_code       *regex) { pcre2_code_free(regex); });
  435. //         match_ = shared_ptr<pcre2_match_data>(match, [](pcre2_match_data *match) { pcre2_match_data_free(match); });
  436. //     }
  437.        
  438. //     string exp_;                         // original regex text
  439. //     shared_ptr<pcre2_code>       regex_; // compiled regex
  440. //     shared_ptr<pcre2_match_data> match_; // match data
  441. // };
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement