Untitled

#pragma once

// This header contains functionality for writing recursive descent parsers.
// If you're not familiar with the concept, recursive descent uses mutually
// recursive functions to mimic the structure of a grammar and parse it out.
//
// A good, simple example of this is in json.h  This is a full featured json
// parser (thanks largely to JSONs very simple grammar).  Recursive descent
// parsers parse "top down" (starting with the most general grammatical element).
// So the parsing begins with parse_json_value which calls sub-parsing functions
// for each grammatical element (number, string, array, etc).
//
// This particular implementation is tokenless, meaning that there is no
// separate tokenization phase that produces tokens for the parser to consume.
// The parser matches against the input text directly.
//
// The primary type provided is `parser`.  It's constructed with a string
// buffer (either std::string or const char*) and provides convenience functions
// for parsing data.  It wraps the provided text buffer in a `stream` object
// that provides a wrapper around it to advance the stream and manage seeking
// to handle eg: backtracking.  Stream is designed so that the stream can be
// moved through by only advancing an offset structure.  No string copying is
// necessary, so it's quite fast.
//
// Basic usage:
//   1) inherit from parser (you can inherit constructors if you like):
//         struct json_parser : parser {
//             using parser::parser;
//
//
//   2) implement parsing functions
//
//            // run the parser
//             maybe<json_value> parse() {
//                 return parse_json_value();
//             }
//
//         private:
//             maybe<json_value> parse_json_value() {
//                 // consume any whitespace
//                 whitespace();
//
//                 // peak ahead to see if we have an object to parse
//                 // luckily JSON is unambiguous here.
//                 switch (*stream) {
//                     case '[': return parse_json_array (); break;
//                     case '{': return parse_json_object(); break;
//                     case '"': return parse_json_string(); break;
//                     case 't': if (literal("true"))  return json_bool(true);  break;
//                     case 'f': if (literal("false")) return json_bool(false); break;
//                     case 'n': if (literal("null"))  return json_value();     break;
//                     default:  break;
//                 }
//
//                 // if not any of the above, try to parse a number
//                 return parse_json_number();
//         }
//
//
// The end result of parsing should be some sort of object built from the parsed code.
// In general, you may access the underlying data stream via the stream member, and it
// is acted upon implicitly when using the helpers.
//
// Recursive descent parsers are implemented as recursive functions as can be seen above.
// The parser generally ends up matching the underlying grammar quite closely.  In a
// language like C++ you don't want to recurse too deeply, so iteration is often a better
// alternative when eg: parsing a list of values.
//
// It's recommended to write parsing functions so that if they fail to parse, there is no
// change to the underlying stream.  This can be done using the cursor() and seek() functions
// of stream, which will return and set the current stream position, respectively.  Recursive
// descent parsers can backtrack arbitrarily, so it's easier to reason about functions that
// are "all or nothing", eg they don't partially consume input when they fail.
//
// Helpers
//   Several "micro-parser" helpers are provided for common operations.
//
// whitespace - matches and consumes any whitespace
// literal    - matches a literal character or string
// token      - matches a literal character or string, ignoring whitespace
// to_eol     - matches data to the next newline character (or end of input)
//
// Errors
//   In general, recursive descent parsers can backtrack arbitrarily, so it's possible to have
// many errors.  Rather than trying to guess at how to handle them, the parser class simple
// provides a push_error() and pop_error() function that push a new error message on a stack
// with stream position information.
//
//   Parsers can then choose how to pop errors and when to print error information.  The
// last_error() and print_last_error() functions can help with this.
//

// local
#include <maybe.h>
#include <verify.h>
#include <strutil.h>
#include <terminal.h>

// c++
#include <memory>
#include <string>
#include <vector>
#include <cassert>

// c
#include <string.h>

using std::string;
using std::vector;
using std::shared_ptr;


// main class to inherit from to build a parser
struct parser {
    // an index into the stream, holds linear offset and row/column information
    struct index {
        ssize_t off=0;
        ssize_t row=0;
        ssize_t col=0;
    };

    // an error is a message and a position in the stream
    struct error {
        index  idx;
        string msg;
    };

    // a match in the underlying stream
    struct match {
        match(const char* str, ssize_t len, index pos)
            : str(str), len(len), pos(pos) {}

        const char* str = nullptr;
        ssize_t     len = 0;
        index       pos;

        operator string() const {
            return string(str, len);
        }
    };

    // stream wraps up a char array and lets us access it nicely
    // it handles the bookkeeping related to where we are in the stream
    // and consuming characters/generating match objects.
    //
    struct stream {
        // create stream from c-string
        stream() =default;
        stream(const char* str, ssize_t len=-1)
            : str_(str), len_(len >= 0 ? len : strlen(str)), end_(str_ + len_) {}

        // index stream, get character at given offset from current position
        const char& operator[](ssize_t off) const {
            return str_[cursor_.off + off];
        }

        // de-reference stream to get c-string containing rest of buffer
        const char& operator*() const {
            return (*this)[0];
        }

        // iterators
        const char* begin() const  { return &str_[cursor_.off];      } // iterator to start of stream
        const char* end()   const  { return end_;                    } // iterator to end of stream
        const char* raw()   const  { return str_;                    } // raw pointer to stream

        // accessors
        ssize_t remaining() const  { return end()-begin();           } // how much of stream is left
        index   cursor()    const  { return cursor_;                 } // return current position in stream
        stream& seek(index cursor) { cursor_ = cursor; return *this; } // set cursor position

        // return string containing num characters from string.
        //
        // @param nchar  number of characters to slice (if < 0, rest of stream)
        //
        string slice(ssize_t nchar=0) const {
            return string(begin(), (nchar > 0) ? begin() + nchar : end());
        }

        // return rest of stream and advance to end
        match rest() {
            return matched(end() - begin());
        }

        // return a match from the current position and advance stream
        match matched(ssize_t len) {
            match ret(begin(), len, cursor());
            advance(len);
            return ret;
        }

        // advance current stream a given number of characters
        stream& advance(ssize_t num=1) {
            // don't move backwards
            if (num < 0) {
                return *this;
            }

            // copy current stream state
            return munch(num);
        }

    private:
        // advance internal pointers by a given number of characters, handle
        // counting rows and columns as we go.
        stream& munch(ssize_t num) {
            for (ssize_t ii=0; ii < num; ii++) {
                if ((str_ + cursor_.off) < end_) {
                    cursor_.col++;
                    if (str_[cursor_.off++] == '\n') {
                        cursor_.row++;
                        cursor_.col = 0;
                    }
                }
            }

            return *this;
        }

        index cursor_;    // cursor specifying current position in buffer
        const char* str_; // underlying buffer
        ssize_t     len_; // length of buffer
        const char* end_; // pointer to end of buffer + 1
    };

    // create parser from c-string
    parser(const char* text)
        : stream(text) {}

    // create parser from std::string
    parser(const string &text)
        : stream(text.c_str(), text.size()) {}

    // return last error if we have one
    maybe<error> last_error() {
        return pop_error();
    }

    // pop last error off stack and print it
    void print_last_error() {
        maybe<error> err = last_error();
        if (err) {
            print_error(err.value().idx, err.value().msg);
        }
    }

protected:
    parser::stream stream;

    // push a new error message on the error stack at the current position
    void push_error(string msg) {
        errors_.emplace_back(
            error {stream.cursor(), msg}
        );
    }


    // pop last error off stack
    maybe<error> pop_error() {
        if (errors_.size() == 0) {
            return false;
        } else {
            error err = std::move(errors_.back());
            errors_.pop_back();
            return err;
        }
    }

    // print the source code around the given index and print an error message
    // between lines.  Be careful to preserve whitespace in the input text.
    //
    void print_error(index idx, string msg) const {
        vector<string> lines = split(stream.raw(), "\n");

        // print source up to error row
        for (ssize_t ii=0; ii <= idx.row; ii++) {
            fprintf(stderr, "%s\n", lines[ii].c_str());
        }

        // print spacing to match line error is on
        for (ssize_t ii=0; ii < std::min(idx.col, (ssize_t)lines[idx.row].size()); ii++) {
            char cc = lines[idx.row][ii];
            if (isspace(cc)) {
                fputc(cc,  stderr);
            } else {
                fputc(' ', stderr);
            }
        }

        // print error message
        if (isatty(STDERR_FILENO)) {
            fprintf(stderr, "%s\n", (ansi_seq(TERM_FG_RED) + "^" + ansi_seq(TERM_RESET) + " - " + msg).c_str());
        } else {
            fprintf(stderr, "%s\n", ("^ - " + msg).c_str());
        }

        // print a few lines after for context
        for (ssize_t ii=idx.row+1; ii < std::min(idx.row+4, (ssize_t)lines.size()); ii++) {
            fprintf(stderr, "%s\n", lines[ii].c_str());
        }
    }


    // match given string exactly
    maybe<match> literal(string lit) {
        if (strncmp(lit.c_str(), &stream[0], lit.size()) == 0) {
            return stream.matched(lit.size());
        }
        return false;
    }

    // match single char exactly
    maybe<match> literal(char lit) {
        if (*stream == lit) {
            return stream.matched(1);
        }
        return false;
    }

    // match whitespace
    maybe<match> whitespace() {
        // match whitespace characters
        const char* ptr = stream.begin();
        while (*ptr && isspace(*ptr)) {
            ptr++;
        }

        int nmatch = ptr - stream.begin();
        if (nmatch > 0) {
            return stream.matched(nmatch);
        }
        return false;
    }

    // match a literal string, ignoring leading whitespace
    maybe<match> token(string token) {
        index cursor = stream.cursor();
        {
            whitespace();
            maybe<match> mm = literal(token);
            if (!mm) {
                stream.seek(cursor);
                return false;
            }
            return mm;
        }
    }

    // match a single char, ignoring leading whitespace
    maybe<match> token(char token) {
        index cursor = stream.cursor();
        {
            whitespace();
            maybe<match> mm = literal(token);
            if (!mm) {
                stream.seek(cursor);
                return false;
            }
            return mm;
        }
    }

    // match to end of line, or end of input, whichever comes first,
    // consumes the newline at the end of the line.  If no input was
    // between current position and newline, return false.
    //
    maybe<match> to_eol() {
        const char* beg = stream.begin();
        const char* ptr = beg;

        // seek to newline
        while (*ptr && *ptr != '\n') {
            ptr++;
        }

        // consume newline if we have one
        ssize_t sub=0;
        if (*ptr) {
            ptr++;
            sub++;
        }

        int nmatch = (ptr-beg)-sub;
        if (nmatch > 0) {
            return stream.matched(nmatch);
        }
        return false;
    }

private:
    vector<error> errors_;
};


// // regexp matches the start of the stream against a regular expression
// // this is a functor so we can create/optimize the regex once instead
// // of each call.
// //
// struct regexp {
//     using pcre2_string = const unsigned char*;

//     // constructor
//     regexp(string exp)
//         : exp_(exp) {
//         compile();
//     }

//     // execute parser
//     parse_match operator()(parse_stream& stream) const {
//         if (pcre2_jit_match(regex_.get(), (pcre2_string)&stream, stream.left(), 0, 0, match_.get(), NULL) > 0) {
//             size_t *ovector = pcre2_get_ovector_pointer(match_.get());
//             return stream.match(ovector[1]-ovector[0]);
//         }
//         return parse_match::no_match();
//     }

// private:
//     void compile() {
//         // compile regexp
//         int    error_num;
//         size_t error_off;
//         pcre2_code *regex = pcre2_compile(
//             (pcre2_string)exp_.c_str(),
//             PCRE2_ZERO_TERMINATED | PCRE2_DOTALL,
//             0, &error_num, &error_off, NULL
//         );
//         assert(regex != NULL && "regex compilation failed");
//         assert(pcre2_jit_compile(regex, PCRE2_JIT_COMPLETE) == 0 && "regex JIT compilation failed");

//         // use the pattern itself to generate match storage
//         pcre2_match_data *match = pcre2_match_data_create_from_pattern(regex, NULL);

//         // save into shared pointers with deleters
//         regex_ = shared_ptr<pcre2_code>      (regex, [](pcre2_code       *regex) { pcre2_code_free(regex); });
//         match_ = shared_ptr<pcre2_match_data>(match, [](pcre2_match_data *match) { pcre2_match_data_free(match); });
//     }

//     string exp_;                         // original regex text
//     shared_ptr<pcre2_code>       regex_; // compiled regex
//     shared_ptr<pcre2_match_data> match_; // match data
// };