Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <algorithm>
- #include <iostream>
- #include <fstream>
- #include <sstream>
- #include <string>
- #include <vector>
- #include <chrono>
- const int SMALL_FILE_SIZE = 10000;
- const int LARGE_FILE_SIZE = 100000;
- enum ALPH_TYPE {
- BIN=2,
- DNA=4
- };
- std::vector<int> br(const std::string &s) {
- std::vector<int> p(s.size(), 0);
- for (size_t i = 1; i < s.size(); ++i) {
- int k = p[i - 1];
- while (k > 0 && (s[i] != s[k] || s[i] != '?' || s[k] != '?')) {
- k = p[k - 1];
- }
- if (s[i] == s[k] || s[i] == '?' || s[k] == '?') {
- ++k;
- }
- p[i] = k;
- }
- return p;
- }
- int64_t kmp(const std::string &src, const std::string &pattern) {
- auto start = std::chrono::high_resolution_clock::now();
- std::vector<int> p = br(pattern);
- size_t n = src.size();
- size_t m = p.size();
- size_t j = 0;
- for (int i = 0; i < n; ++i) {
- if (src[i] == pattern[j] || src[i] == '?' || pattern[j] == '?') {
- ++j;
- if (j == m) {
- // return i - m + 2;
- auto elapsed = std::chrono::high_resolution_clock::now() - start;
- return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
- }
- } else {
- if (j != 0) {
- j = p[j - 1];
- }
- }
- }
- // return 0;
- auto elapsed = std::chrono::high_resolution_clock::now() - start;
- return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
- }
- std::vector<int> brs(std::string &s) {
- s += '`';
- size_t n = s.size();
- std::vector<int> p_brs(n, 0);
- std::vector<int> p_br = br(s);
- for (size_t i = 1; i < n; ++i) {
- if (s[p_br[i - 1] + 1] != s[i + 1]) {
- p_brs[i] = p_br[i];
- } else {
- p_brs[i] = p_brs[p_br[i]];
- }
- }
- return p_brs;
- }
- int64_t kmp2(const std::string &src, std::string pattern) {
- auto start = std::chrono::high_resolution_clock::now();
- size_t n = src.size();
- size_t m = pattern.size();
- auto p = brs(pattern);
- int j = 0;
- for (size_t i = 0; i < n; ++i) {
- while (j > 0 && src[i] != pattern[j] && pattern[j] != '?') {
- j = p[j - 1];
- }
- if (src[i] == pattern[j] || pattern[j] == '?') {
- ++j;
- if (j == m) {
- // return i - m + 2;
- auto elapsed = std::chrono::high_resolution_clock::now() - start;
- return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
- }
- }
- }
- // return 0;
- auto elapsed = std::chrono::high_resolution_clock::now() - start;
- return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
- }
- bool str_equals(const std::string &s, const std::string &pat, size_t from) {
- size_t size = pat.size();
- for (size_t i = from; i < from + size; ++i) {
- if (!(s[i] == pat[i - from] || s[i] == '?' || pat[i - from] == '?')) {
- return false;
- }
- }
- return true;
- }
- int64_t naive_search(const std::string &src, const std::string &pattern) {
- auto start = std::chrono::high_resolution_clock::now();
- size_t size1 = src.size();
- size_t size2 = pattern.size();
- for (size_t i = 0; i < size1 - size2 + 1; ++i) {
- if (str_equals(src, pattern, i)) {
- // return i + 1;
- auto elapsed = std::chrono::high_resolution_clock::now() - start;
- return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
- }
- }
- // return 0;
- auto elapsed = std::chrono::high_resolution_clock::now() - start;
- return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
- }
- void write_data(std::ofstream &out, const std::string &algname, int subs_syms,
- int srcfilesize, ALPH_TYPE type, size_t ptrnsize, int64_t secs) {
- std::ostringstream ss;
- ss << algname << ',' << subs_syms << ',' << srcfilesize << ','
- << type << ',' << ptrnsize << ',' << secs << std::endl;
- out << ss.str();
- }
- int main(int argc, char *argv[]) {
- srand(time(nullptr));
- const std::string naive_name = "Naive search";
- const std::string kmp_name = "KMP standard";
- const std::string kmp2_name = "KMP exquisite";
- std::ifstream smallinput("smallfile.txt");
- std::ifstream largeinput("largefile.txt");
- std::ifstream patterns("patterns.txt");
- std::ofstream data("data.csv");
- std::string smlfile;
- std::string lrgfile;
- std::getline(smallinput, smlfile);
- std::getline(largeinput, lrgfile);
- std::string smlpat, lrgpat;
- while (std::getline(patterns, smlpat, ',') && std::getline(patterns, lrgpat)) {
- // string from small file
- int64_t secs = naive_search(smlfile, smlpat);
- write_data(data, naive_name, 0, SMALL_FILE_SIZE, BIN, smlpat.size(), secs);
- secs = kmp(smlfile, smlpat);
- write_data(data, kmp_name, 0, SMALL_FILE_SIZE, BIN, smlpat.size(), secs);
- secs = kmp2(smlfile, smlpat);
- write_data(data, kmp2_name, 0, SMALL_FILE_SIZE, BIN, smlpat.size(), secs);
- // string from large file:
- secs = naive_search(lrgfile, lrgpat);
- write_data(data, naive_name, 0, LARGE_FILE_SIZE, DNA, lrgpat.size(), secs);
- secs = kmp(lrgfile, lrgpat);
- write_data(data, kmp_name, 0, LARGE_FILE_SIZE, DNA, lrgpat.size(), secs);
- secs = kmp2(lrgfile, lrgpat);
- write_data(data, kmp2_name, 0, LARGE_FILE_SIZE, DNA, lrgpat.size(), secs);
- std::string qmsmlpat(smlpat), qmlrgpat(lrgpat);
- size_t lb, ub = 0;
- for (int i = 1; i <= 4; ++i) {
- lb = ub + 1;
- ub = (qmsmlpat.size() / 4) * i - 1;
- size_t range = ub - lb + 1;
- size_t pos = std::rand() % range + lb;
- qmsmlpat.replace(pos, 1, "?");
- pos = std::rand() % range + lb;
- qmlrgpat.replace(pos, 1, "?");
- // string from small file
- secs = naive_search(qmsmlpat, smlpat);
- write_data(data, naive_name, i, SMALL_FILE_SIZE, BIN, qmsmlpat.size(), secs);
- secs = kmp(qmsmlpat, smlpat);
- write_data(data, kmp_name, i, SMALL_FILE_SIZE, BIN, qmsmlpat.size(), secs);
- secs = kmp2(qmsmlpat, smlpat);
- write_data(data, kmp2_name, i, SMALL_FILE_SIZE, BIN, qmsmlpat.size(), secs);
- // string from large file:
- secs = naive_search(qmlrgpat, lrgpat);
- write_data(data, naive_name, i, LARGE_FILE_SIZE, DNA, qmlrgpat.size(), secs);
- secs = kmp(lrgfile, lrgpat);
- write_data(data, kmp_name, i, LARGE_FILE_SIZE, DNA, qmlrgpat.size(), secs);
- secs = kmp2(lrgfile, lrgpat);
- write_data(data, kmp2_name, i, LARGE_FILE_SIZE, DNA, qmlrgpat.size(), secs);
- }
- // check if newline is on top
- if (patterns.peek() == '\n') {
- patterns.ignore();
- }
- }
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement