Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #pragma once
- // This header contains functionality for writing recursive descent parsers.
- // If you're not familiar with the concept, recursive descent uses mutually
- // recursive functions to mimic the structure of a grammar and parse it out.
- //
- // A good, simple example of this is in json.h This is a full featured json
- // parser (thanks largely to JSONs very simple grammar). Recursive descent
- // parsers parse "top down" (starting with the most general grammatical element).
- // So the parsing begins with parse_json_value which calls sub-parsing functions
- // for each grammatical element (number, string, array, etc).
- //
- // This particular implementation is tokenless, meaning that there is no
- // separate tokenization phase that produces tokens for the parser to consume.
- // The parser matches against the input text directly.
- //
- // The primary type provided is `parser`. It's constructed with a string
- // buffer (either std::string or const char*) and provides convenience functions
- // for parsing data. It wraps the provided text buffer in a `stream` object
- // that provides a wrapper around it to advance the stream and manage seeking
- // to handle eg: backtracking. Stream is designed so that the stream can be
- // moved through by only advancing an offset structure. No string copying is
- // necessary, so it's quite fast.
- //
- // Basic usage:
- // 1) inherit from parser (you can inherit constructors if you like):
- // struct json_parser : parser {
- // using parser::parser;
- //
- //
- // 2) implement parsing functions
- //
- // // run the parser
- // maybe<json_value> parse() {
- // return parse_json_value();
- // }
- //
- // private:
- // maybe<json_value> parse_json_value() {
- // // consume any whitespace
- // whitespace();
- //
- // // peak ahead to see if we have an object to parse
- // // luckily JSON is unambiguous here.
- // switch (*stream) {
- // case '[': return parse_json_array (); break;
- // case '{': return parse_json_object(); break;
- // case '"': return parse_json_string(); break;
- // case 't': if (literal("true")) return json_bool(true); break;
- // case 'f': if (literal("false")) return json_bool(false); break;
- // case 'n': if (literal("null")) return json_value(); break;
- // default: break;
- // }
- //
- // // if not any of the above, try to parse a number
- // return parse_json_number();
- // }
- //
- //
- // The end result of parsing should be some sort of object built from the parsed code.
- // In general, you may access the underlying data stream via the stream member, and it
- // is acted upon implicitly when using the helpers.
- //
- // Recursive descent parsers are implemented as recursive functions as can be seen above.
- // The parser generally ends up matching the underlying grammar quite closely. In a
- // language like C++ you don't want to recurse too deeply, so iteration is often a better
- // alternative when eg: parsing a list of values.
- //
- // It's recommended to write parsing functions so that if they fail to parse, there is no
- // change to the underlying stream. This can be done using the cursor() and seek() functions
- // of stream, which will return and set the current stream position, respectively. Recursive
- // descent parsers can backtrack arbitrarily, so it's easier to reason about functions that
- // are "all or nothing", eg they don't partially consume input when they fail.
- //
- // Helpers
- // Several "micro-parser" helpers are provided for common operations.
- //
- // whitespace - matches and consumes any whitespace
- // literal - matches a literal character or string
- // token - matches a literal character or string, ignoring whitespace
- // to_eol - matches data to the next newline character (or end of input)
- //
- // Errors
- // In general, recursive descent parsers can backtrack arbitrarily, so it's possible to have
- // many errors. Rather than trying to guess at how to handle them, the parser class simple
- // provides a push_error() and pop_error() function that push a new error message on a stack
- // with stream position information.
- //
- // Parsers can then choose how to pop errors and when to print error information. The
- // last_error() and print_last_error() functions can help with this.
- //
- // local
- #include <maybe.h>
- #include <verify.h>
- #include <strutil.h>
- #include <terminal.h>
- // c++
- #include <memory>
- #include <string>
- #include <vector>
- #include <cassert>
- // c
- #include <string.h>
- using std::string;
- using std::vector;
- using std::shared_ptr;
- // main class to inherit from to build a parser
- struct parser {
- // an index into the stream, holds linear offset and row/column information
- struct index {
- ssize_t off=0;
- ssize_t row=0;
- ssize_t col=0;
- };
- // an error is a message and a position in the stream
- struct error {
- index idx;
- string msg;
- };
- // a match in the underlying stream
- struct match {
- match(const char* str, ssize_t len, index pos)
- : str(str), len(len), pos(pos) {}
- const char* str = nullptr;
- ssize_t len = 0;
- index pos;
- operator string() const {
- return string(str, len);
- }
- };
- // stream wraps up a char array and lets us access it nicely
- // it handles the bookkeeping related to where we are in the stream
- // and consuming characters/generating match objects.
- //
- struct stream {
- // create stream from c-string
- stream() =default;
- stream(const char* str, ssize_t len=-1)
- : str_(str), len_(len >= 0 ? len : strlen(str)), end_(str_ + len_) {}
- // index stream, get character at given offset from current position
- const char& operator[](ssize_t off) const {
- return str_[cursor_.off + off];
- }
- // de-reference stream to get c-string containing rest of buffer
- const char& operator*() const {
- return (*this)[0];
- }
- // iterators
- const char* begin() const { return &str_[cursor_.off]; } // iterator to start of stream
- const char* end() const { return end_; } // iterator to end of stream
- const char* raw() const { return str_; } // raw pointer to stream
- // accessors
- ssize_t remaining() const { return end()-begin(); } // how much of stream is left
- index cursor() const { return cursor_; } // return current position in stream
- stream& seek(index cursor) { cursor_ = cursor; return *this; } // set cursor position
- // return string containing num characters from string.
- //
- // @param nchar number of characters to slice (if < 0, rest of stream)
- //
- string slice(ssize_t nchar=0) const {
- return string(begin(), (nchar > 0) ? begin() + nchar : end());
- }
- // return rest of stream and advance to end
- match rest() {
- return matched(end() - begin());
- }
- // return a match from the current position and advance stream
- match matched(ssize_t len) {
- match ret(begin(), len, cursor());
- advance(len);
- return ret;
- }
- // advance current stream a given number of characters
- stream& advance(ssize_t num=1) {
- // don't move backwards
- if (num < 0) {
- return *this;
- }
- // copy current stream state
- return munch(num);
- }
- private:
- // advance internal pointers by a given number of characters, handle
- // counting rows and columns as we go.
- stream& munch(ssize_t num) {
- for (ssize_t ii=0; ii < num; ii++) {
- if ((str_ + cursor_.off) < end_) {
- cursor_.col++;
- if (str_[cursor_.off++] == '\n') {
- cursor_.row++;
- cursor_.col = 0;
- }
- }
- }
- return *this;
- }
- index cursor_; // cursor specifying current position in buffer
- const char* str_; // underlying buffer
- ssize_t len_; // length of buffer
- const char* end_; // pointer to end of buffer + 1
- };
- // create parser from c-string
- parser(const char* text)
- : stream(text) {}
- // create parser from std::string
- parser(const string &text)
- : stream(text.c_str(), text.size()) {}
- // return last error if we have one
- maybe<error> last_error() {
- return pop_error();
- }
- // pop last error off stack and print it
- void print_last_error() {
- maybe<error> err = last_error();
- if (err) {
- print_error(err.value().idx, err.value().msg);
- }
- }
- protected:
- parser::stream stream;
- // push a new error message on the error stack at the current position
- void push_error(string msg) {
- errors_.emplace_back(
- error {stream.cursor(), msg}
- );
- }
- // pop last error off stack
- maybe<error> pop_error() {
- if (errors_.size() == 0) {
- return false;
- } else {
- error err = std::move(errors_.back());
- errors_.pop_back();
- return err;
- }
- }
- // print the source code around the given index and print an error message
- // between lines. Be careful to preserve whitespace in the input text.
- //
- void print_error(index idx, string msg) const {
- vector<string> lines = split(stream.raw(), "\n");
- // print source up to error row
- for (ssize_t ii=0; ii <= idx.row; ii++) {
- fprintf(stderr, "%s\n", lines[ii].c_str());
- }
- // print spacing to match line error is on
- for (ssize_t ii=0; ii < std::min(idx.col, (ssize_t)lines[idx.row].size()); ii++) {
- char cc = lines[idx.row][ii];
- if (isspace(cc)) {
- fputc(cc, stderr);
- } else {
- fputc(' ', stderr);
- }
- }
- // print error message
- if (isatty(STDERR_FILENO)) {
- fprintf(stderr, "%s\n", (ansi_seq(TERM_FG_RED) + "^" + ansi_seq(TERM_RESET) + " - " + msg).c_str());
- } else {
- fprintf(stderr, "%s\n", ("^ - " + msg).c_str());
- }
- // print a few lines after for context
- for (ssize_t ii=idx.row+1; ii < std::min(idx.row+4, (ssize_t)lines.size()); ii++) {
- fprintf(stderr, "%s\n", lines[ii].c_str());
- }
- }
- // match given string exactly
- maybe<match> literal(string lit) {
- if (strncmp(lit.c_str(), &stream[0], lit.size()) == 0) {
- return stream.matched(lit.size());
- }
- return false;
- }
- // match single char exactly
- maybe<match> literal(char lit) {
- if (*stream == lit) {
- return stream.matched(1);
- }
- return false;
- }
- // match whitespace
- maybe<match> whitespace() {
- // match whitespace characters
- const char* ptr = stream.begin();
- while (*ptr && isspace(*ptr)) {
- ptr++;
- }
- int nmatch = ptr - stream.begin();
- if (nmatch > 0) {
- return stream.matched(nmatch);
- }
- return false;
- }
- // match a literal string, ignoring leading whitespace
- maybe<match> token(string token) {
- index cursor = stream.cursor();
- {
- whitespace();
- maybe<match> mm = literal(token);
- if (!mm) {
- stream.seek(cursor);
- return false;
- }
- return mm;
- }
- }
- // match a single char, ignoring leading whitespace
- maybe<match> token(char token) {
- index cursor = stream.cursor();
- {
- whitespace();
- maybe<match> mm = literal(token);
- if (!mm) {
- stream.seek(cursor);
- return false;
- }
- return mm;
- }
- }
- // match to end of line, or end of input, whichever comes first,
- // consumes the newline at the end of the line. If no input was
- // between current position and newline, return false.
- //
- maybe<match> to_eol() {
- const char* beg = stream.begin();
- const char* ptr = beg;
- // seek to newline
- while (*ptr && *ptr != '\n') {
- ptr++;
- }
- // consume newline if we have one
- ssize_t sub=0;
- if (*ptr) {
- ptr++;
- sub++;
- }
- int nmatch = (ptr-beg)-sub;
- if (nmatch > 0) {
- return stream.matched(nmatch);
- }
- return false;
- }
- private:
- vector<error> errors_;
- };
- // // regexp matches the start of the stream against a regular expression
- // // this is a functor so we can create/optimize the regex once instead
- // // of each call.
- // //
- // struct regexp {
- // using pcre2_string = const unsigned char*;
- // // constructor
- // regexp(string exp)
- // : exp_(exp) {
- // compile();
- // }
- // // execute parser
- // parse_match operator()(parse_stream& stream) const {
- // if (pcre2_jit_match(regex_.get(), (pcre2_string)&stream, stream.left(), 0, 0, match_.get(), NULL) > 0) {
- // size_t *ovector = pcre2_get_ovector_pointer(match_.get());
- // return stream.match(ovector[1]-ovector[0]);
- // }
- // return parse_match::no_match();
- // }
- // private:
- // void compile() {
- // // compile regexp
- // int error_num;
- // size_t error_off;
- // pcre2_code *regex = pcre2_compile(
- // (pcre2_string)exp_.c_str(),
- // PCRE2_ZERO_TERMINATED | PCRE2_DOTALL,
- // 0, &error_num, &error_off, NULL
- // );
- // assert(regex != NULL && "regex compilation failed");
- // assert(pcre2_jit_compile(regex, PCRE2_JIT_COMPLETE) == 0 && "regex JIT compilation failed");
- // // use the pattern itself to generate match storage
- // pcre2_match_data *match = pcre2_match_data_create_from_pattern(regex, NULL);
- // // save into shared pointers with deleters
- // regex_ = shared_ptr<pcre2_code> (regex, [](pcre2_code *regex) { pcre2_code_free(regex); });
- // match_ = shared_ptr<pcre2_match_data>(match, [](pcre2_match_data *match) { pcre2_match_data_free(match); });
- // }
- // string exp_; // original regex text
- // shared_ptr<pcre2_code> regex_; // compiled regex
- // shared_ptr<pcre2_match_data> match_; // match data
- // };
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement