crawler4j — storing crawled pages in PostgreSQL

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class SampleLauncher {

    public static void main(String[] args) throws Exception {
        String crawlStorageFolder = "/data/crawl/root";
        int numberOfCrawlers = 7;

        CrawlConfig config = new CrawlConfig();

        // Wait 100 ms between requests sent to the same host.
        config.setPolitenessDelay(100);

        // Folder where intermediate crawl data is stored.
        config.setCrawlStorageFolder(crawlStorageFolder);

        /*
         * Instantiate the controller for this crawl.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        /*
         * For each crawl, you need to add some seed URLs. These are the first
         * URLs to be fetched; the crawler then follows the links it finds in
         * those pages.
         */
        controller.addSeed("https://de.wikipedia.org/wiki/Java_Database_Connectivity");
        controller.addSeed("https://de.wikipedia.org/wiki/Relationale_Datenbank");
        controller.addSeed("https://pt.wikipedia.org/wiki/JDBC");
        controller.addSeed("https://pt.wikipedia.org/wiki/Protocolo");
        controller.addSeed("https://de.wikipedia.org/wiki/Datenbank");

        /*
         * Start the crawl. This is a blocking operation: the code after this
         * call is only reached once crawling has finished.
         */
        controller.start(new PostgresCrawlerFactory(
                "jdbc:postgresql://localhost/crawler4j", "postgres", "postgres"), numberOfCrawlers);
    }
}
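The CrawlConfig above only sets the politeness delay and the storage folder. crawler4j exposes a few more knobs that are often worth setting; the values below are illustrative assumptions, not taken from the original launcher:

// Optional CrawlConfig tuning (illustrative values, not part of the original launcher).
config.setMaxDepthOfCrawling(3);                 // stop following links three hops away from a seed
config.setMaxPagesToFetch(1000);                 // hard cap on the number of fetched pages
config.setIncludeBinaryContentInCrawling(false); // skip binary content such as images
config.setResumableCrawling(false);              // no crash recovery; faster, but the crawl cannot be resumed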

----------------

import java.beans.PropertyVetoException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Timestamp;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mchange.v2.c3p0.ComboPooledDataSource;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;

public class PostgresDBServiceImpl implements PostgresDBService {

    private static final Logger logger = LoggerFactory.getLogger(PostgresDBServiceImpl.class);

    private ComboPooledDataSource comboPooledDataSource;

    private PreparedStatement insertKeyStatement;

    public PostgresDBServiceImpl(String dbUrl, String dbUser, String dbPw, String driver)
            throws PropertyVetoException, SQLException {
        comboPooledDataSource = new ComboPooledDataSource();
        comboPooledDataSource.setDriverClass(driver);
        comboPooledDataSource.setJdbcUrl(dbUrl);
        comboPooledDataSource.setUser(dbUser);
        comboPooledDataSource.setPassword(dbPw);

        createDatabase();
    }

    private void createDatabase() throws SQLException {

        // Create the sequence and table if they do not exist yet. The connection used
        // for the DDL is returned to the pool via try-with-resources.
        try (Connection con = comboPooledDataSource.getConnection();
             Statement statement = con.createStatement()) {
            statement.executeUpdate(
                    "CREATE SEQUENCE IF NOT EXISTS id_master_seq" +
                    " INCREMENT 1" +
                    " MINVALUE 1" +
                    " MAXVALUE 9223372036854775807" +
                    " START 6" +
                    " CACHE 1;");
            statement.executeUpdate(
                    "CREATE TABLE IF NOT EXISTS webpage" +
                    " (" +
                    " id bigint NOT NULL," +
                    " html TEXT," +
                    " text TEXT," +
                    " url varchar(4096)," +
                    " seen timestamp without time zone NOT NULL," +
                    " primary key (id)" +
                    ")");
        }

        // The prepared statement keeps its connection checked out for the lifetime of
        // this service; it is released when close() shuts the pool down.
        insertKeyStatement = comboPooledDataSource.getConnection().prepareStatement(
                "insert into webpage values (nextval('id_master_seq'),?,?,?,?)");
    }

    @Override
    public void store(Page page) {

        if (page.getParseData() instanceof HtmlParseData) {
            try {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();

                insertKeyStatement.setString(1, htmlParseData.getHtml());
                insertKeyStatement.setString(2, htmlParseData.getText());
                insertKeyStatement.setString(3, page.getWebURL().getURL());
                insertKeyStatement.setTimestamp(4, new Timestamp(System.currentTimeMillis()));
                insertKeyStatement.executeUpdate();
            } catch (SQLException e) {
                logger.error("SQL exception while storing webpage for url '{}'", page.getWebURL().getURL(), e);
                throw new RuntimeException(e);
            }
        }
    }

    @Override
    public void close() {
        if (comboPooledDataSource != null) {
            comboPooledDataSource.close();
        }
    }
}
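The PostgresDBService interface itself is not included in this paste. A minimal sketch that matches the calls made by PostgresDBServiceImpl and PostgresWebCrawler (assumed here, since the original is missing) would be:

import edu.uci.ics.crawler4j.crawler.Page;

public interface PostgresDBService {

    // Persist a fetched page (only invoked for pages with HTML parse data).
    void store(Page page);

    // Release the underlying connection pool.
    void close();
}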

----------------------

import java.util.Set;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class PostgresWebCrawler extends WebCrawler {

    private static final Logger logger = LoggerFactory.getLogger(PostgresWebCrawler.class);

    // File extensions that should not be crawled (stylesheets, scripts, media, archives, ...).
    // The URL is lowercased before matching, so lowercase alternatives are sufficient.
    private static final Pattern FILE_ENDING_EXCLUSION_PATTERN = Pattern.compile(".*(\\.(" +
            "css|js" +
            "|bmp|gif|jpe?g|png|tiff?|ico|nef|raw" +
            "|mid|mp2|mp3|mp4|wav|wma|flv|mpe?g" +
            "|avi|mov|mpeg|ram|m4v|wmv|rm|smil" +
            "|pdf|doc|docx|pub|xls|xlsx|vsd|ppt|pptx" +
            "|swf" +
            "|zip|rar|gz|bz2|7z|bin" +
            "|xml|txt|java|c|cpp|exe" +
            "))$");

    private final PostgresDBService postgresDBService;

    public PostgresWebCrawler(PostgresDBService postgresDBService) {
        this.postgresDBService = postgresDBService;
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILE_ENDING_EXCLUSION_PATTERN.matcher(href).matches();
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        logger.info("URL: {}", url);

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();

            logger.info("Text length: {}", text.length());
            logger.info("Html length: {}", html.length());
            logger.info("Number of outgoing links: {}", links.size());

            try {
                postgresDBService.store(page);
            } catch (RuntimeException e) {
                logger.error("Storing failed", e);
            }
        }
    }

    @Override
    public void onBeforeExit() {
        if (postgresDBService != null) {
            postgresDBService.close();
        }
    }
}
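shouldVisit above filters only by file extension, so the crawler will also follow links that leave the seed sites. A common refinement, sketched here as an assumption and not part of the original, is to additionally restrict the crawl to the seed hosts used in SampleLauncher:

// Hypothetical variant of shouldVisit that also restricts the crawl to the Wikipedia hosts used as seeds.
@Override
public boolean shouldVisit(Page referringPage, WebURL url) {
    String href = url.getURL().toLowerCase();
    boolean allowedHost = href.startsWith("https://de.wikipedia.org/")
            || href.startsWith("https://pt.wikipedia.org/");
    return allowedHost && !FILE_ENDING_EXCLUSION_PATTERN.matcher(href).matches();
}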

-------------------------

import edu.uci.ics.crawler4j.crawler.CrawlController;

public class PostgresCrawlerFactory implements CrawlController.WebCrawlerFactory<PostgresWebCrawler> {

    private final String dbUrl;
    private final String dbUser;
    private final String dbPw;

    public PostgresCrawlerFactory(String dbUrl, String dbUser, String dbPw) {
        this.dbUrl = dbUrl;
        this.dbUser = dbUser;
        this.dbPw = dbPw;
    }

    // Called once per crawler thread, so every crawler gets its own database service
    // (and therefore its own c3p0 connection pool).
    @Override
    public PostgresWebCrawler newInstance() throws Exception {
        return new PostgresWebCrawler(new PostgresDBServiceImpl(dbUrl, dbUser, dbPw, "org.postgresql.Driver"));
    }
}
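Once the crawl has finished, the stored rows can be checked with plain JDBC against the webpage table created above. The sketch below reuses the connection settings from SampleLauncher and is only a quick sanity check, not part of the original paste:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class CrawlResultCheck {

    public static void main(String[] args) throws Exception {
        // Count the stored pages and show the timestamp of the most recent insert.
        try (Connection con = DriverManager.getConnection(
                "jdbc:postgresql://localhost/crawler4j", "postgres", "postgres");
             Statement st = con.createStatement();
             ResultSet rs = st.executeQuery("SELECT count(*), max(seen) FROM webpage")) {
            if (rs.next()) {
                System.out.println("Pages stored: " + rs.getLong(1) + ", last crawled at: " + rs.getTimestamp(2));
            }
        }
    }
}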