Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /**
- * Scrape the file
- * @param string $url The Url of the String we wish to scrape
- * @param string $filename The filename we wish to write to
- * @return string $content The content of the scrape
- */
- vector<string> Scraper::scrapeFile(string url, string filename, string regexp)
- {
- // some sane defaults
- //Url to scrape
- //this->scrape_url = "NCRListing.aspx.html";
- //Filename to write scraped content
- this->scrape_filename = "scrape.txt";
- //The String or Regexp we wish to filter and search for
- this->scrape_regexp = "<a[^>]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)</a>";
- //Directory to write curl scrape results
- this->scrape_dir = "scrapes";
- // Create a result vector (holding strings)
- vector<string> vResult;
- //Assign the scrape url
- if ( url == "" ) {
- url = this->scrape_url;
- }
- //Assign the filename
- if ( filename == "" ) {
- filename = this->scrape_filename;
- }
- //Assign the regexp
- if ( regexp == "" ) {
- regexp = this->scrape_regexp;
- }
- //Get the contents of url
- string input;
- stringstream ss;
- try {
- // prepare session
- URI uri(url);
- HTTPClientSession session(uri.getHost(), uri.getPort());
- // prepare path
- string path(uri.getPathAndQuery());
- if (path.empty()) path = "/";
- // send request
- HTTPRequest req(HTTPRequest::HTTP_GET, path, HTTPMessage::HTTP_1_1);
- session.sendRequest(req);
- // get response
- HTTPResponse res;
- cout << res.getStatus() << " " << res.getReason() << endl;
- // copy response stream to buffer
- istream &is = session.receiveResponse(res);
- StreamCopier::copyStream(is, ss);
- // some error happened
- } catch (Exception &ex) {
- cerr << ex.displayText() << endl; // print the error to the standard error stream
- vResult.push_back(ex.displayText()); // push the error on to our vector stack
- vResult.push_back("-1"); // terminate our vector stack with an error status
- return vResult; // exit the program
- }
- // Create a regular expression object with Poco
- RegularExpression test(regexp, 0, 1);
- // Create a regular expression vector with Poco for storing our match location and length
- RegularExpression::MatchVec matchvect;
- // Loop through the results of the uri query
- while (getline(ss, input)) {
- if(test.match(input, 0, matchvect)) { // We have a match
- cout << "Scraper:Hit: " << input.substr(matchvect[0].offset, matchvect[0].length) << endl; // print the match to the screen
- vResult.push_back(input.substr(matchvect[0].offset, matchvect[0].length)); // push the match into our return vector container (string)
- }
- if(DEBUG == 1) // for debugging purposes
- cout << "Scraper:Debug:regexp: " << regexp << endl;
- }
- //Return our content
- return vResult;
- }
Add Comment
Please, Sign In to add comment