Advertisement
Guest User

Untitled

a guest
May 25th, 2016
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.22 KB | None | 0 0
  1. public class Main {
  2.  
  3. @Getter private static AsyncMySQL mysql;
  4.  
  5. @Getter private static List<String> pages = new ArrayList<>();
  6.  
  7. public static void main(String[] args) {
  8. mysql = new AsyncMySQL("localhost", 3306, "mmm", "dertest", "crawler");
  9.  
  10. mysql.query("SELECT * FROM websites", resultSet -> {
  11. try {
  12. while(resultSet.next()) {
  13. pages.add(resultSet.getString("url"));
  14. }
  15. } catch (SQLException e) {
  16. e.printStackTrace();
  17. }
  18. });
  19.  
  20. try {
  21. processPage("https://en.wikipedia.org/wiki/List_of_most_popular_websites");
  22. processPage("http://sicor-kdl.net");
  23. processPage("http://sm-development.de");
  24. processPage("http://facebook.com");
  25. processPage("http://twitter.com");
  26. } catch (SQLException | IOException e) {
  27. e.printStackTrace();
  28. }
  29. }
  30.  
  31. public static void processPage(String url) throws SQLException, IOException {
  32. System.out.println("Checking Page: " + url);
  33. if(pages.contains(url)){
  34. System.out.println("Already found");
  35. }else{
  36. System.out.println("New Insert");
  37. //store the URL to database to avoid parsing again
  38. mysql.update("INSERT INTO websites(url) VALUES ('" + url + "')");
  39. pages.add(url);
  40.  
  41. //get useful information
  42. Document doc = Jsoup.connect(url).get();
  43.  
  44. if(doc.text().contains("research")){
  45. System.out.println(url);
  46. }
  47.  
  48. //get all links and recursively call the processPage method
  49. Elements questions = doc.select("a[href]");
  50. for(Element link: questions){
  51. String redirect = link.attr("abs:href");
  52. if(redirect != null && !redirect.equalsIgnoreCase("") && !redirect.equalsIgnoreCase("#")) {
  53. //System.out.println("Link: " + link.attr("abs:href"));
  54. if(!redirect.contains(".png") && !redirect.contains(".jpg")) {
  55. processPage(redirect);
  56. }
  57. }
  58. }
  59. }
  60. }
  61.  
  62. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement