Advertisement
Guest User

Untitled

a guest
Feb 20th, 2017
58
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.20 KB | None | 0 0
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
  12. public class Controller {
  13.  
  14. public static void main(String[] args) throws Exception {
  15. // TODO Auto-generated method stub
  16. String crawlStorageFolder = "/data/crawl";
  17. int numberOfCrawlers = 20;
  18. FileWriter fileWriter = null;
  19. CrawlConfig config = new CrawlConfig();
  20. config.setCrawlStorageFolder(crawlStorageFolder);
  21. /*
  22. * Instantiate the controller for this crawl.
  23. */
  24. PageFetcher pageFetcher = new PageFetcher(config);
  25. RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
  26. RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
  27. CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
  28. controller.addSeed("https://www.nytimes.com/");
  29. config.setMaxDepthOfCrawling(16);
  30. config.setMaxPagesToFetch(20000);
  31. config.setPolitenessDelay(100);
  32. config.setUserAgentString("USC viterbi");
  33. config.setUserAgentString("USC Viterbi");
  34. config.setIncludeBinaryContentInCrawling(true);
  35. /*
  36. * For each crawl, you need to add some seed urls. These are the first
  37. * URLs that are fetched and then the crawler starts following links
  38. * which are found in these pages
  39. */
  40. // controller.addSeed("http://www.latimes.com/");
  41. /*
  42. * Start the crawl. This is a blocking operation, meaning that your code
  43. * will reach the line after this only when crawling is finished.
  44. */
  45. fileWriter = new FileWriter("fetch_LA.csv",false);
  46. fileWriter.append("URL,STATUS CODE\n");
  47. fileWriter.close();
  48. fileWriter = new FileWriter("visit_LA.csv",false);
  49. fileWriter.append("URL,SIZE,No of Outlinks,Content-Type\n");
  50. fileWriter.close();
  51. fileWriter = new FileWriter("urls_LA.csv",false);
  52. //fileWriter.append("URL,STATUS CODE\n");
  53. fileWriter.close();
  54.  
  55. controller.start(MyCrawler.class, numberOfCrawlers);
  56. }
  57.  
  58. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement