Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- public class Main {
- @Getter private static AsyncMySQL mysql;
- @Getter private static List<String> pages = new ArrayList<>();
- public static void main(String[] args) {
- mysql = new AsyncMySQL("localhost", 3306, "mmm", "dertest", "crawler");
- mysql.query("SELECT * FROM websites", resultSet -> {
- try {
- while(resultSet.next()) {
- pages.add(resultSet.getString("url"));
- }
- } catch (SQLException e) {
- e.printStackTrace();
- }
- });
- try {
- processPage("https://en.wikipedia.org/wiki/List_of_most_popular_websites");
- processPage("http://sicor-kdl.net");
- processPage("http://sm-development.de");
- processPage("http://facebook.com");
- processPage("http://twitter.com");
- } catch (SQLException | IOException e) {
- e.printStackTrace();
- }
- }
- public static void processPage(String url) throws SQLException, IOException {
- System.out.println("Checking Page: " + url);
- if(pages.contains(url)){
- System.out.println("Already found");
- }else{
- System.out.println("New Insert");
- //store the URL to database to avoid parsing again
- mysql.update("INSERT INTO websites(url) VALUES ('" + url + "')");
- pages.add(url);
- //get useful information
- Document doc = Jsoup.connect(url).get();
- if(doc.text().contains("research")){
- System.out.println(url);
- }
- //get all links and recursively call the processPage method
- Elements questions = doc.select("a[href]");
- for(Element link: questions){
- String redirect = link.attr("abs:href");
- if(redirect != null && !redirect.equalsIgnoreCase("") && !redirect.equalsIgnoreCase("#")) {
- //System.out.println("Link: " + link.attr("abs:href"));
- if(!redirect.contains(".png") && !redirect.contains(".jpg")) {
- processPage(redirect);
- }
- }
- }
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement