Advertisement
Guest User

Untitled

a guest
May 5th, 2015
238
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 10.58 KB | None | 0 0
  1. /*
  2. * Copyright 2015 Bagrat Ter-Akopyan, Giuliano Castiglia
  3. */
  4.  
  5. #include <boost/tokenizer.hpp>
  6. #include <boost/lexical_cast.hpp>
  7. #include <boost/filesystem.hpp>
  8. #include <stdint.h>
  9. #include <iostream>
  10. #include <chrono>
  11. #include <fstream>
  12. #include <cstdlib>
  13. #include <vector>
  14. #include <iterator>
  15. #include <utility>
  16. #include <string>
  17. #include <map>
  18. #include <algorithm>
  19. #include "contract.h"
  20. #include "wikidb.h"
  21. #include "SimPair.h"
  22. #include "docopt.h"
  23.  
  24. #include <thread>
  25. #include <future>
  26.  
  27. static const char USAGE[] =
  28. R"(createdb.
  29.  
  30. Usage:
  31. createdb pages <DatabaseName> <revid2titleFilename>
  32. createdb parents <DatabaseName> <revid2parentsFilename>
  33. createdb comparisons <DatabaseName> <simMatrixDir>
  34.  
  35. Options:
  36. -h --help Show this screen.
  37. --version Show version.
  38. )";
  39.  
  40.  
  41. static const unsigned int k_similarity_multiplier = 1000;
  42.  
  43. void pages2db(std::string const& db_filename,
  44. std::string const& tsv_filename) {
  45. std::ifstream infile(tsv_filename.c_str());
  46. if (!infile.is_open()) {
  47. std::cerr << "Couldn't open " << tsv_filename << std::endl;
  48. std::exit(1);
  49. }
  50.  
  51. WikiDB wikidb(db_filename);
  52.  
  53. std::vector< std::pair<uint32_t, std::string> > articles(0);
  54. std::vector< std::pair<uint32_t, std::string> > categories(0);
  55.  
  56. boost::char_separator<char> sep("\t");
  57. for (std::string line; getline(infile, line); ) {
  58. boost::tokenizer<boost::char_separator<char> > tokens(line, sep);
  59.  
  60. auto it = std::begin(tokens);
  61. std::size_t ns = boost::lexical_cast<std::size_t>(*it);
  62.  
  63. ++it;
  64. uint32_t revid = boost::lexical_cast<uint32_t>(*it);
  65.  
  66. ++it;
  67. std::string title = *it;
  68. if (ns == 0) {
  69. std::pair<uint32_t, std::string> article = std::make_pair(revid, title);
  70. articles.push_back(article);
  71. }
  72. if (ns == 14) {
  73. std::pair<uint32_t, std::string> category = std::make_pair(revid, title);
  74. categories.push_back(category);
  75. }
  76. }
  77.  
  78. wikidb.bulkInsertArticle(articles);
  79. wikidb.bulkInsertCategory(categories);
  80. infile.close();
  81. }
  82.  
  83. void parents2db(std::string const& db_filename,
  84. std::string const& parents_tsv_filename) {
  85. std::ifstream infile(parents_tsv_filename.c_str());
  86. if (!infile.is_open()) {
  87. std::cerr << "Couldn't open " << parents_tsv_filename << std::endl;
  88. std::exit(1);
  89. }
  90.  
  91. WikiDB wikidb(db_filename);
  92.  
  93. std::vector<std::string> parents(0);
  94.  
  95. // measure time for inserting parents for every record
  96. std::chrono::high_resolution_clock::time_point t1, t2, t3;
  97. t1 = std::chrono::high_resolution_clock::now();
  98.  
  99. boost::char_separator<char> sep("\t");
  100. for (std::string line; getline(infile, line); ) {
  101. boost::tokenizer<boost::char_separator<char> > tokens(line, sep);
  102.  
  103. auto it = std::begin(tokens);
  104. std::size_t ns = boost::lexical_cast<std::size_t>(*it);
  105. ++it;
  106. uint32_t revid = boost::lexical_cast<uint32_t>(*it);
  107.  
  108. if (ns == 0) {
  109. ++it;
  110. while (it != std::end(tokens)) {
  111. parents.push_back(*it);
  112. ++it;
  113. }
  114. wikidb.bulkUpdateArticleParents(revid, parents);
  115.  
  116. }
  117. if (ns == 14) {
  118. ++it;
  119. while (it != std::end(tokens)) {
  120. parents.push_back(*it);
  121. ++it;
  122. }
  123. wikidb.bulkUpdateCategoryParents(revid, parents);
  124. }
  125.  
  126. parents.clear();
  127. }
  128.  
  129. t2 = std::chrono::high_resolution_clock::now();
  130.  
  131. wikidb.commit();
  132. t3 = std::chrono::high_resolution_clock::now();
  133.  
  134. auto insertDuration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);
  135. auto commitDuration = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2);
  136.  
  137. std::cout << "Parents insertion time for all records: " << insertDuration.count()
  138. << " milliseconds" << std::endl;
  139. std::cout << "Commit duration: " << commitDuration.count()
  140. << " milliseconds" << std::endl;
  141.  
  142. infile.close();
  143. }
  144.  
  145. void comparisons2db(WikiDB &wikidb,
  146. std::string const& simpairs_tsv_filename, double threshold = 0.5) {
  147. // structure of matrix: revid revid sim revid sim ...
  148.  
  149. std::ifstream infile(simpairs_tsv_filename.c_str());
  150. if (!infile.is_open()) {
  151. std::cerr << "Couldn't open file " << simpairs_tsv_filename << std::endl;
  152. std::exit(1);
  153. }
  154.  
  155. //std::cout << "Processing file: " << simpairs_tsv_filename << std::endl;
  156.  
  157. auto t1 = std::chrono::high_resolution_clock::now();
  158.  
  159. std::vector<uint32_t> comparisons(0);
  160.  
  161. static std::size_t count(0);
  162.  
  163. boost::char_separator<char> sep("\t");
  164. for (std::string line; getline(infile, line); ) {
  165. boost::tokenizer<boost::char_separator<char> > tokens(line, sep);
  166.  
  167. auto it = std::begin(tokens);
  168. uint32_t refRevid = boost::lexical_cast<uint32_t>(*it);
  169.  
  170. ++it;
  171. if (wikidb.articleExists(refRevid)) { // does the reference revision id exist in the db?
  172. while (it != std::end(tokens)) {
  173. uint32_t revid = boost::lexical_cast<uint32_t>(*it);
  174. ++it;
  175. uint32_t sim = boost::numeric_cast<uint32_t>
  176. (boost::lexical_cast<float>(*it) * k_similarity_multiplier);
  177. if (sim >= threshold) {
  178. dbQuery q1;
  179. q1 = "revid=", revid;
  180. dbCursor<Article> comparisonCursor;
  181. if (comparisonCursor.select(q1) > 0) {
  182. uint32_t comparisonIdx = comparisonCursor->index;
  183.  
  184. SimPair sp(comparisonIdx, sim);
  185.  
  186. uint32_t comparison = sp.getData();
  187. comparisons.push_back(comparison);
  188. }
  189. } else {
  190. break;
  191. }
  192. ++it;
  193. }
  194.  
  195. wikidb.bulkUpdateComparisons(refRevid, comparisons);
  196. comparisons.clear();
  197. } else{ /* nothing */ }
  198. }
  199.  
  200. infile.close();
  201. auto t2 = std::chrono::high_resolution_clock::now();
  202. auto duration = std::chrono::duration_cast<std::chrono::seconds>(t2 - t1);
  203. std::cout << "File " << simpairs_tsv_filename << " finished in "
  204. << duration.count() << " seconds." << std::endl;
  205. }
  206.  
  207. std::vector< std::pair<uint32_t, std::vector<uint32_t> > >
  208. comparisons2vec(WikiDB &wikidb,
  209. std::string const& simpairs_tsv_filename, uint32_t threshold = 500) {
  210.  
  211. std::ifstream infile(simpairs_tsv_filename.c_str());
  212. if (!infile.is_open()) {
  213. std::cerr << "Couldn't open file " << simpairs_tsv_filename << std::endl;
  214. std::exit(1);
  215. }
  216.  
  217. std::vector< std::pair<uint32_t, std::vector<uint32_t> > > ret;
  218.  
  219. std::vector<uint32_t> comparisons(0);
  220.  
  221. static std::size_t count(0);
  222.  
  223. wikidb._db.attach();
  224.  
  225. boost::char_separator<char> sep("\t");
  226. for (std::string line; getline(infile, line); ) {
  227. boost::tokenizer<boost::char_separator<char> > tokens(line, sep);
  228.  
  229. auto it = std::begin(tokens);
  230. uint32_t refRevid = boost::lexical_cast<uint32_t>(*it);
  231.  
  232. ++it;
  233. if (wikidb.articleExists(refRevid)) { // does the reference revision id exist in the db?
  234. while (it != std::end(tokens)) {
  235. uint32_t revid = boost::lexical_cast<uint32_t>(*it);
  236. ++it;
  237. uint32_t sim = boost::numeric_cast<uint32_t>
  238. (boost::lexical_cast<float>(*it) * k_similarity_multiplier);
  239. if (sim >= threshold) {
  240. dbQuery q1;
  241. q1 = "revid=", revid;
  242. dbCursor<Article> comparisonCursor;
  243. if (comparisonCursor.select(q1) > 0) {
  244. uint32_t comparisonIdx = comparisonCursor->index;
  245.  
  246. SimPair sp(comparisonIdx, sim);
  247.  
  248. uint32_t comparison = sp.getData();
  249. comparisons.push_back(comparison);
  250. }
  251. } else { break; }
  252. ++it;
  253. }
  254. ret.push_back(std::make_pair(refRevid, comparisons));
  255. //wikidb.bulkUpdateComparisons(refRevid, comparisons);
  256. comparisons.clear();
  257. } else { /* nothing 8 */ }
  258. }
  259.  
  260. wikidb._db.detach();
  261. infile.close();
  262. return ret;
  263. }
  264.  
  265.  
  266. std::vector<std::string>
  267. create_files_vec(std::string const& matrixDir) {
  268. std::vector<std::string> vec;
  269. boost::filesystem::path path(matrixDir);
  270. boost::filesystem::directory_iterator end_itr;
  271. for (boost::filesystem::directory_iterator itr(path); itr != end_itr; ++itr) {
  272. if (is_regular_file(itr->path())) {
  273. std::string current_file = itr->path().string();
  274. vec.push_back(current_file);
  275. }
  276. }
  277.  
  278. std::sort(vec.begin(), vec.end());
  279.  
  280. return vec;
  281. }
  282.  
  283.  
  284. int main(int argc, char* argv[]) {
  285. std::map<std::string, docopt::value> args = docopt::docopt(USAGE,
  286. { argv + 1, argv + argc },
  287. true,
  288. "wikidbcreate 0.1");
  289.  
  290. for (auto const& arg : args) {
  291. std::cout << arg.first << ":" << arg.second << std::endl;
  292. }
  293.  
  294. if (args["pages"].asBool()) {
  295. pages2db(args["<DatabaseName>"].asString(), args["<revid2titleFilename>"].asString());
  296. }
  297. if (args["parents"].asBool()) {
  298. parents2db(args["<DatabaseName>"].asString(), args["<revid2parentsFilename>"].asString());
  299. }
  300. if (args["comparisons"].asBool()) {
  301. std::vector<std::string> filename_vec = create_files_vec(args["<simMatrixDir>"].asString());
  302.  
  303. WikiDB wikidb(args["<DatabaseName>"].asString());
  304.  
  305. std::size_t count = 0;
  306. std::size_t filesNum = filename_vec.size(); // 5639 files
  307. std::size_t chunks = 8;
  308.  
  309. for(auto it = filename_vec.begin(); it != filename_vec.end(); ++it) {
  310. auto t1 = std::chrono::high_resolution_clock::now();
  311.  
  312. std::vector< std::vector< std::pair< uint32_t, std::vector<uint32_t> > > > workVecs;
  313. typedef decltype(std::async(comparisons2vec, std::ref(wikidb), *it, 500)) future_t;
  314. std::vector<future_t> futuresVec;
  315. if (filesNum - (chunks * count) > chunks) { // count = 0..351
  316. for (std::size_t i = 0; i < chunks; ++i) {
  317. futuresVec.push_back(std::async(std::launch::async, comparisons2vec, std::ref(wikidb), *it, 500));
  318. ++it;
  319. }
  320. } else { // count = 352
  321. for (std::size_t i = 0; i < (filesNum - (chunks * count)); ++i) {
  322. futuresVec.push_back(std::async(std::launch::async, comparisons2vec, std::ref(wikidb), *it, 500));
  323. ++it;
  324. }
  325. }
  326.  
  327. for(auto &future : futuresVec) {
  328. workVecs.push_back(future.get());
  329. }
  330.  
  331. // update already processed files count
  332. count++;
  333.  
  334. for( auto const& workVec : workVecs ) {
  335. for( auto const& element : workVec ) {
  336. wikidb.bulkUpdateComparisons(element.first, element.second);
  337. }
  338. }
  339. wikidb.commit();
  340.  
  341. auto t2 = std::chrono::high_resolution_clock::now();
  342. auto duration = std::chrono::duration_cast<std::chrono::seconds>(t2 - t1);
  343. std::cout << "processed 8 files in " << duration.count() << " sec" << std::endl;
  344. std::cout << "already processed " << count*chunks << " files" << std::endl;
  345. }
  346. wikidb.commit();
  347. }
  348.  
  349. return 0;
  350. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement