Guest User

Untitled

a guest
Jul 23rd, 2018
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 2.91 KB | None | 0 0
  1. /**
  2. * Scrape the file
  3. * @param string $url The Url of the String we wish to scrape
  4. * @param string $filename The filename we wish to write to
  5. * @return string $content The content of the scrape
  6. */
  7. vector<string> Scraper::scrapeFile(string url, string filename, string regexp)
  8. {
  9. // some sane defaults
  10. //Url to scrape
  11. //this->scrape_url = "NCRListing.aspx.html";
  12. //Filename to write scraped content
  13.      this->scrape_filename  = "scrape.txt";
  14. //The String or Regexp we wish to filter and search for
  15.      this->scrape_regexp  = "<a[^>]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)</a>";
  16. //Directory to write curl scrape results
  17.      this->scrape_dir = "scrapes";
  18. // Create a result vector (holding strings)
  19.      vector<string> vResult;
  20.  
  21. //Assign the scrape url
  22.      if ( url == "" ) {
  23.           url = this->scrape_url;
  24.      }
  25.  
  26. //Assign the filename
  27.      if ( filename == "" ) {
  28.           filename = this->scrape_filename;
  29.      }
  30.  
  31. //Assign the regexp
  32.      if ( regexp == "" ) {
  33.           regexp = this->scrape_regexp;
  34.      }
  35.  
  36. //Get the contents of url
  37.      string input;
  38.      stringstream ss;
  39.      try {
  40. // prepare session
  41.           URI uri(url);
  42.           HTTPClientSession session(uri.getHost(), uri.getPort());
  43.  
  44. // prepare path
  45.           string path(uri.getPathAndQuery());
  46.           if (path.empty()) path = "/";
  47.  
  48. // send request
  49.           HTTPRequest req(HTTPRequest::HTTP_GET, path, HTTPMessage::HTTP_1_1);
  50.           session.sendRequest(req);
  51.  
  52. // get response
  53.           HTTPResponse res;
  54.           cout << res.getStatus() << " " << res.getReason() << endl;
  55.  
  56. // copy response stream to buffer
  57.           istream &is = session.receiveResponse(res);
  58.           StreamCopier::copyStream(is, ss);
  59. // some error happened
  60.      } catch (Exception &ex) {
  61.           cerr << ex.displayText() << endl; // print the error to the standard error stream
  62.           vResult.push_back(ex.displayText()); // push the error on to our vector stack
  63.           vResult.push_back("-1"); // terminate our vector stack with an error status
  64.           return vResult; // exit the program
  65.      }
  66. // Create a regular expression object with Poco
  67.      RegularExpression test(regexp, 0, 1);
  68. // Create a regular expression vector with Poco for storing our match location and length
  69.      RegularExpression::MatchVec matchvect;
  70. // Loop through the results of the uri query
  71.      while (getline(ss, input)) {
  72.           if(test.match(input, 0, matchvect)) { // We have a match
  73.                cout << "Scraper:Hit: " << input.substr(matchvect[0].offset, matchvect[0].length) << endl; // print the match to the screen
  74.                vResult.push_back(input.substr(matchvect[0].offset, matchvect[0].length)); // push the match into our return vector container (string)
  75.           }
  76.           if(DEBUG == 1) // for debugging purposes
  77.                cout << "Scraper:Debug:regexp: " << regexp << endl;
  78.      }
  79.  
  80. //Return our content
  81.      return vResult;
  82. }
Add Comment
Please, Sign In to add comment