Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- public class SampleLauncher {
- public static void main(String[] args) throws Exception {
- String crawlStorageFolder = "/data/crawl/root";
- int numberOfCrawlers = 7;
- CrawlConfig config = new CrawlConfig();
- config.setPolitenessDelay(100);
- config.setCrawlStorageFolder(crawlStorageFolder);
- /*
- * Instantiate the controller for this crawl.
- */
- PageFetcher pageFetcher = new PageFetcher(config);
- RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
- RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
- CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
- /*
- * For each crawl, you need to add some seed urls. These are the first
- * URLs that are fetched and then the crawler starts following links
- * which are found in these pages
- */
- controller.addSeed("https://de.wikipedia.org/wiki/Java_Database_Connectivity");
- controller.addSeed("https://de.wikipedia.org/wiki/Relationale_Datenbank");
- controller.addSeed("https://pt.wikipedia.org/wiki/JDBC");
- controller.addSeed("https://pt.wikipedia.org/wiki/Protocolo");
- controller.addSeed("https://de.wikipedia.org/wiki/Datenbank");
- /*
- * Start the crawl. This is a blocking operation, meaning that your code
- * will reach the line after this only when crawling is finished.
- */
- controller.start(new PostgresCrawlerFactory("jdbc:postgresql://localhost/crawler4j","postgres","postgres"), numberOfCrawlers);
- }
- }
- ----------------
- public class PostgresDBServiceImpl implements PostgresDBService {
- private static final Logger logger = org.slf4j.LoggerFactory.getLogger(PostgresDBServiceImpl.class);
- private ComboPooledDataSource comboPooledDataSource;
- private PreparedStatement insertKeyStatement;
- public PostgresDBServiceImpl(String dbUrl, String dbUser, String dbPw, String driver) throws
- PropertyVetoException, SQLException {
- comboPooledDataSource = new ComboPooledDataSource();
- comboPooledDataSource.setDriverClass(driver);
- comboPooledDataSource.setJdbcUrl(dbUrl);
- comboPooledDataSource.setUser(dbUser);
- comboPooledDataSource.setPassword(dbPw);
- createDatabase();
- }
- private void createDatabase() throws SQLException {
- comboPooledDataSource.getConnection().createStatement().executeUpdate(
- "CREATE SEQUENCE IF NOT EXISTS id_master_seq" +
- " INCREMENT 1" +
- " MINVALUE 1 " +
- " MAXVALUE 9223372036854775807" +
- " START 6 " +
- " CACHE 1;")
- ;
- comboPooledDataSource.getConnection().createStatement().executeUpdate(
- "CREATE TABLE IF NOT EXISTS webpage" +
- " ( " +
- " id bigint NOT NULL," +
- " html TEXT," +
- " text TEXT," +
- " url varchar(4096)," +
- " seen timestamp without time zone NOT NULL," +
- " primary key (id)" +
- ")");
- insertKeyStatement = comboPooledDataSource.getConnection().prepareStatement("insert into webpage values " +
- "(nextval('id_master_seq'),?,?,?,?)");
- }
- @Override
- public void store(Page page) {
- if (page.getParseData() instanceof HtmlParseData) {
- try {
- HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
- insertKeyStatement.setString(1, htmlParseData.getHtml());
- insertKeyStatement.setString(2, htmlParseData.getText());
- insertKeyStatement.setString(3, page.getWebURL().getURL());
- insertKeyStatement.setTimestamp(4, new Timestamp(new java.util.Date().getTime()));
- insertKeyStatement.executeUpdate();
- } catch (SQLException e) {
- logger.error("SQL Exception while storing webpage for url'{}'", page.getWebURL().getURL(), e);
- throw new RuntimeException(e);
- }
- }
- }
- @Override
- public void close() {
- if (comboPooledDataSource != null) {
- comboPooledDataSource.close();
- }
- }
- }
- ----------------------
- public class PostgresWebCrawler extends WebCrawler {
- private static final Logger logger = org.slf4j.LoggerFactory.getLogger(PostgresWebCrawler.class);
- private static Pattern FILE_ENDING_EXCLUSION_PATTERN = Pattern.compile(".*(\\.(" +
- "css|js" +
- "|bmp|gif|jpe?g|JPE?G|png|tiff?|ico|nef|raw" +
- "|mid|mp2|mp3|mp4|wav|wma|flv|mpe?g" +
- "|avi|mov|mpeg|ram|m4v|wmv|rm|smil" +
- "|pdf|doc|docx|pub|xls|xlsx|vsd|ppt|pptx" +
- "|swf" +
- "|zip|rar|gz|bz2|7z|bin" +
- "|xml|txt|java|c|cpp|exe" +
- "))$");
- private final PostgresDBService postgresDBService;
- public PostgresWebCrawler(PostgresDBService postgresDBService) {
- this.postgresDBService = postgresDBService;
- }
- @Override
- public boolean shouldVisit(Page referringPage, WebURL url) {
- String href = url.getURL().toLowerCase();
- return !FILE_ENDING_EXCLUSION_PATTERN.matcher(href).matches();
- }
- @Override
- public void visit(Page page) {
- String url = page.getWebURL().getURL();
- logger.info("URL: " + url);
- if (page.getParseData() instanceof HtmlParseData) {
- HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
- String text = htmlParseData.getText();
- String html = htmlParseData.getHtml();
- Set<WebURL> links = htmlParseData.getOutgoingUrls();
- logger.info("Text length: " + text.length());
- logger.info("Html length: " + html.length());
- logger.info("Number of outgoing links: " + links.size());
- try {
- postgresDBService.store(page);
- } catch (RuntimeException e) {
- logger.error("Storing failed", e);
- }
- }
- }
- public void onBeforeExit() {
- if (postgresDBService != null) {
- postgresDBService.close();
- }
- }
- }
- -------------------------
- public class PostgresCrawlerFactory implements CrawlController.WebCrawlerFactory<PostgresWebCrawler> {
- private final String dbUrl;
- private final String dbUser;
- private final String dbPw;
- public PostgresCrawlerFactory(String dbUrl, String dbUser, String dbPw) {
- this.dbUrl = dbUrl;
- this.dbUser = dbUser;
- this.dbPw = dbPw;
- }
- public PostgresWebCrawler newInstance() throws Exception {
- return new PostgresWebCrawler(new PostgresDBServiceImpl(dbUrl,dbUser,dbPw,"org.postgresql.Driver"));
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement