Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- main.cpp
- S. Edward Dolan
- Saturday, November 9 2019
- A text concordance. Load a plain ASCII file, then search for words in the
- file. A word is defined as [a-zA-Z]+. The words are stored in lower case and
- searches ingore case.
- Some problems:
- * Only plain ASCII text files are permitted. The load() function will throw
- on characters > 0x7e but this doesn't guarantee stable operation. The
- problem is the current column is tracked and stored in a Ref, but a
- multi-byte character will throw that off.
- * Tabs are not dealt with at all. This will also throw the column off. load()
- will print a warning if a tab is encountered.
- */
- #include <iostream>
- #include <fstream>
- #include <vector>
- #include <forward_list>
- #include <algorithm>
- #include <sstream>
- #include <chrono>
- #include <iomanip>
- #include <cassert>
- typedef std::string str_t;
- struct Ref;
- typedef std::forward_list<Ref*> listref_t;
- typedef std::pair<uint32_t, uint8_t> location_t; // line, column
- typedef std::vector<location_t> veclocation_t;
- typedef std::vector<str_t> vecstr_t;
- /*
- A word reference from a text file.
- A Ref holds a lookup ID to the source file name where the word is found and
- a vector of <line, column> pairs where each occurrance of the word can be
- found. The line is one-based, the column is zero-based.
- */
- struct Ref {
- Ref(const str_t& fname) : fileNameID(0), locs() {
- for (size_t i=0; i<fileNames.size(); ++i) // meh...
- if (fileNames[i] == fname) {
- fileNameID = i;
- return;
- }
- fileNameID = fileNames.size();
- fileNames.push_back(fname);
- }
- Ref(const str_t& fname, uint32_t line, uint8_t column)
- : Ref(fname) {
- locs.push_back(std::make_pair(line, column));
- }
- ~Ref() {
- fileNames.clear();
- locs.clear();
- }
- str_t fileName() { return fileNames[fileNameID]; }
- const veclocation_t& getLocations() { return locs; }
- bool hasLocation(uint32_t line, uint8_t column) {
- for (auto loc : locs)
- if (loc.first == line && loc.second == column)
- return true;
- return false;
- }
- bool addLocation(uint32_t line, uint8_t column) {
- if (!hasLocation(line, column)) {
- locs.push_back(std::make_pair(line, column));
- return true;
- }
- return false;
- }
- void write(std::ostream& s) {
- (void)s;
- }
- protected:
- static vecstr_t fileNames;
- size_t fileNameID;
- veclocation_t locs;
- };
- vecstr_t Ref::fileNames;
- /*
- A node in a trie.
- A Node holds a singly linked list of Ref*, a bool flag to mark the end of a
- word, and an array of Node*: [a, b, ..., z].
- */
- struct Node {
- // Will initialize C-style array (C++11) --------------v
- Node(bool term = false) : refs(), term(term), children{0} {
- }
- ~Node() {
- for (auto r : refs)
- delete r;
- for (size_t i=0; i<26; ++i)
- delete children[i];
- }
- Node* getChild(char c) {
- return children[std::tolower(c) - 97];
- }
- Node* addChild(char c, bool term = false) {
- return children[std::tolower(c) - 97] = new Node(term);
- }
- void addRef(const str_t& fname, uint32_t line, uint8_t column,
- bool term = false) {
- if (term)
- this->term = true;
- for (auto r : refs)
- if (r->fileName() == fname) {
- r->addLocation(line, column); // update ref
- return;
- }
- refs.push_front(new Ref(fname, line, column)); // new ref
- }
- const listref_t& getRefs() { return refs; }
- bool isTerm() { return term; }
- void write(std::ostream& s) {
- (void)s;
- }
- protected:
- listref_t refs;
- bool term; // true if the path that led here terminates a word
- Node* children[26];
- };
- /*
- A fast but memory-hungry word look-up tree structure.
- The tree can hold the words, and the location (line/column) of those words,
- from multiple plain-text files. The tree is case-insensitive and will only
- contain lower-case alphabetic words [a-z].
- The insert() and find() methods ignore the case of the supplied word.
- {} -- a Ref or Node struct
- [] -- a std::vector
- () -- a std::forward_list
- <> -- a std::pair
- If "cat" was the first word read from foo.txt at line 3, column 16:
- .---------------------------- empty list of Ref*
- /
- {(), false, [0, c, 0]} <--- root Node
- .------------------'
- `->{(), false, [a, 0]}
- .---------------'
- `->{(), false, [0, t, 0]}
- .------------------'
- `->{
- ({footxtID, [<3, 16>]}), <--- one Ref
- true, <--- terminates the word
- [0, ..., 0] <--- no children
- }
- If another "cat" was read @ line 8, column 3, the tree would be identical
- except the terminating node would be:
- ({footxtID, true, [<3, 16>, <8, 3>]}) <--- new location appended
- If "cat" was read from bar.txt @ line 56, column 0, the tree structure
- would still be unchanged. A new Ref would be added to the terminating node:
- `->{
- ({footxtID, [<3, 16>, <8, 3>]} <--- now two Refs
- {bartxtID, [<56, 0>]}),
- true,
- [0, ..., 0]
- }
- */
- struct Trie {
- Trie() : root(new Node), maxLine(1), maxColumn(0) {}
- ~Trie() { delete root; }
- int getMaxLine() { return maxLine; }
- int getMaxColumn() { return maxColumn; }
- void insert(const str_t& word, const str_t& fname, uint32_t line,
- uint8_t column) {
- if (word.empty())
- throw std::runtime_error("Trie can't insert empty strings");
- maxLine = std::max<int>(maxLine, line);
- maxColumn = std::max<int>(maxColumn, column);
- doInsert(root, word, fname, line, column);
- }
- Node* find(const str_t& word) {
- assert(!word.empty());
- return doFind(root, word);
- }
- void write(std::ostream& s) {
- root->write(s);
- }
- protected:
- void doInsert(Node* node, const str_t& word, const str_t& fname,
- uint32_t line, uint8_t column) {
- assert(node);
- if (word.empty())
- node->addRef(fname, line, column, true);
- else {
- Node* n = node->getChild(word[0]);
- if (n)
- doInsert(n, word.substr(1), fname, line, column);
- else {
- Node* nn = node->addChild(word[0]);
- doInsert(nn, word.substr(1), fname, line, column);
- }
- }
- }
- Node* doFind(Node* node, const str_t& word) {
- if (word.empty())
- return node && node->isTerm() ? node : NULL;
- if (!node)
- return NULL;
- return doFind(node->getChild(word[0]), word.substr(1));
- }
- Node* root;
- int maxLine, maxColumn; // for pretty printing
- };
- /*
- Load the trie with the words in the file.
- */
- void doLoad(Trie& trie, const str_t& fname) {
- std::ifstream f(fname.c_str());
- if (!f) {
- std::stringstream ss;
- ss << "failed to open: " << fname;
- throw std::runtime_error(ss.str());
- }
- int c;
- str_t word;
- uint32_t line = 1;
- uint8_t column = 0;
- while (true) {
- c = f.get();
- if (c > 0x7e) {
- // TODO: If this occurs, the trie will be in a crap state with
- // only some of the file processed, depending on where the
- // error occurred. So... possibly create a new trie for each
- // load and merge it with the one supplied to this function.
- std::stringstream ss;
- ss << "line " << line << " of " << fname
- << "\nInput must be plain ASCII, got char > 0x7e.";
- throw std::runtime_error(ss.str());
- }
- if (c == '\t')
- std::cerr << "WARNING: an unsupported TAB character was read."
- << " This will result in inaccurate column tracking."
- << std::endl;
- if (!f) {
- if (!word.empty())
- trie.insert(word, fname, line, column);
- break;
- }
- else if (std::isalpha(c))
- word += (char)c; // column is not advanced
- else {
- if (!word.empty()) {
- trie.insert(word, fname, line, column);
- column += word.size();
- word = "";
- }
- // handle the char that may have terminted the last word
- if (c == '\r')
- continue; // useless sack of donkey shit
- if (c == '\n') {
- column = 0;
- ++line;
- }
- else // char not [a-zA-Z] CR NL
- ++column;
- }
- }
- f.close();
- }
- /*
- Print the result of a search.
- */
- void display(Node* node, const str_t& word, int maxLine, int maxColumn,
- bool showLines, bool align) {
- int maxPrintColumn = 79;
- std::stringstream ss;
- ss << maxLine;
- int lineField = ss.str().size();
- ss.str("");
- ss << maxColumn;
- int columnField = ss.str().size();
- int textField = maxPrintColumn - (lineField + columnField + 2);
- int wordPos = textField / 2 - word.size() / 2;
- for (auto ref : node->getRefs()) {
- std::ifstream f(ref->fileName());
- if (!f) {
- std::stringstream ss;
- ss << "failed to open: " << ref->fileName();
- throw std::runtime_error(ss.str());
- }
- std::cout << ref->getLocations().size() << " occurrences of `"
- << word << "' in " << ref->fileName() << std::endl;
- if (showLines) {
- std::cout << str_t(79, '=') << std::endl;
- str_t buf;
- uint32_t thisLine = 1, lastLine = 0;
- for (auto loc : ref->getLocations()) {
- if (lastLine != loc.first) {
- do {
- std::getline(f, buf);
- lastLine = thisLine;
- } while (thisLine++ != loc.first);
- }
- std::cout << std::right << std::setw(lineField)
- << loc.first << ":" << std::left
- << std::setw(columnField) << (int)loc.second;
- str_t out = buf;
- if (align) {
- if (loc.second < wordPos) // right shift
- out = str_t(wordPos - loc.second, ' ') + buf;
- else if (loc.second > wordPos) // left shift
- out = buf.substr(loc.second - wordPos);
- }
- std::cout << ' '
- << out.substr(0, std::min<int>(out.size(),
- textField))
- << std::endl;
- }
- }
- f.close();
- }
- }
- void load(Trie& trie, const std::string& fname) {
- std::cout << "loading...";
- std::cout.flush();
- try {
- // auto start = std::chrono::high_resolution_clock::now();
- doLoad(trie, fname);
- // auto end = std::chrono::high_resolution_clock::now();
- // std::chrono::duration<double> t = end - start;
- // std::cout << " time: " << t.count() << " seconds" << std::endl;
- }
- catch (std::exception& e) {
- std::cout << "\nError: " << e.what() << std::endl;
- }
- }
- void help() {
- std::cout << ",align - toggle vertical word alignment in matched lines\n"
- << ",help - print this help\n"
- << ",load - load a text file into the trie\n"
- << ",quit - exit the program\n"
- << ",show - toggle printing matched lines"
- << std::endl;
- }
- int main(int argc, char** argv) {
- Trie trie;
- if (argc > 1) {
- for (int i=1; i<argc; ++i)
- load(trie, argv[i]);
- }
- str_t word;
- bool show = false;
- bool align = false;
- str_t legalChars("abcdefghijklmnopqrstuvwxyz"
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
- std::cout << "All input files must be plain ASCII text. Chaos will reign"
- << " supreme otherwise." << std::endl;
- help();
- while (std::cout << "> " && std::cin >> word) {
- if (word == ",align") {
- align = !align;
- continue;
- }
- else if (word == ",help")
- help();
- else if (word == ",load") {
- std::cout << "File name to load: ";
- std::string fname;
- std::cin >> fname;
- load(trie, fname);
- continue;
- }
- else if (word == ",quit")
- break;
- else if (word == ",show") {
- show = !show;
- continue;
- }
- if (word.find_first_not_of(legalChars) != std::string::npos)
- continue;
- // auto start = std::chrono::high_resolution_clock::now();
- Node* node = trie.find(word);
- // auto end = std::chrono::high_resolution_clock::now();
- if (node) {
- display(node, word, trie.getMaxLine(), trie.getMaxColumn(), show,
- align);
- // std::chrono::duration<double> t = end - start;
- // std::cout << "*** Seach time: " << t.count() * 1e3
- // << " milliseconds" << std::endl;
- }
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement