Advertisement
Guest User

Untitled

a guest
Oct 23rd, 2018
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.62 KB | None | 0 0
  1. import org.jsoup.Connection;
  2. import org.jsoup.Jsoup;
  3. import org.jsoup.nodes.Document;
  4. import org.jsoup.select.Elements;
  5. import org.jsoup.nodes.Element;
  6.  
  7. import java.io.BufferedWriter;
  8. import java.io.File;
  9. import java.io.FileWriter;
  10. import java.io.IOException;
  11. import java.util.HashMap;
  12. import java.util.LinkedList;
  13. import java.util.List;
  14. import java.util.Map;
  15.  
  16. public class WebCrawl {
  17. private static final String USER_AGENT =
  18. "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0";
  19. private List<String> links = new LinkedList<String>(); // Just a list of URLs
  20. private Document htmlDocument; // This is our web page, or in other words, our document
  21. private Map<String, String> urlMap = new HashMap();
  22.  
  23. public void crawl(String url)
  24. {
  25. try
  26. {
  27. Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
  28. Document htmlDocument = (Document) connection.get();
  29. this.htmlDocument = htmlDocument;
  30. if(connection.response().statusCode() == 200) // 200 is the HTTP OK status code
  31. // indicating that everything is great.
  32. {
  33. System.out.println("\n**Visiting** Received web page at " + url);
  34. }
  35. if(!connection.response().contentType().contains("text/html"))
  36. {
  37. System.out.println("**Failure** Retrieved something other than HTML");
  38. return;
  39. }
  40. String title = htmlDocument.title();
  41. if (title.contains("/"))
  42. title = title.replace("/","-");
  43. Element dataOnPage = htmlDocument.select("div.mw-parser-output").first();
  44. StringBuilder sb = new StringBuilder();
  45. Element eleInDiv = dataOnPage.select("p").first();
  46. sb.append(eleInDiv.text());
  47. Element nextPar = eleInDiv.nextElementSibling();
  48. while(nextPar!= null && nextPar.nodeName() != "h2"){
  49. if(nextPar.nodeName() == "p")
  50. sb.append(nextPar.text()+"\n");
  51. nextPar = nextPar.nextElementSibling();
  52. }
  53. File directory = new File("Crawler/JavaWikiBook/"+title);
  54. if(! directory.exists())
  55. directory.mkdir();
  56. writeDataIntoFile(sb.toString(),"Crawler/JavaWikiBook/"+title+"/"+title+".txt");
  57. System.out.println(title);
  58. System.out.println(sb.toString());
  59. Elements allh2 = dataOnPage.select("h2");
  60. for(Element eachH2 : allh2){
  61. String title2 = eachH2.text();
  62. if (title2.contains("/"))
  63. title2 = title2.replace("/","-");
  64. if(title2.contains("[edit]"))
  65. title2 = title2.replace("[edit]","");
  66. Element nextP = eachH2.nextElementSibling();
  67. StringBuffer stringBuffer = new StringBuffer();
  68. while (nextP != null && nextP.nodeName() != "h2"){
  69. stringBuffer.append(nextP.text()+"\n");
  70. nextP = nextP.nextElementSibling();
  71. }
  72. writeDataIntoFile(stringBuffer.toString(),"Crawler/JavaWikiBook/"+title+"/"+title2+".txt");
  73. System.out.println(title2);
  74. System.out.println(stringBuffer.toString());
  75. }
  76. // return dataOnPage.text();
  77. }
  78. catch(IOException ioe)
  79. {
  80. // We were not successful in our HTTP request
  81. System.out.println("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!");
  82. return;
  83. }
  84. }
  85.  
  86. public Map<String,String> crawlerStageOne(String url)
  87. {
  88. try {
  89. Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
  90. Document htmlDocument = (Document) connection.get();
  91. if(connection.response().statusCode() == 200) // 200 is the HTTP OK status code
  92. // indicating that everything is great.
  93. {
  94. System.out.println("\n**Visiting** Received web page at " + url);
  95. }
  96. if(!connection.response().contentType().contains("text/html"))
  97. {
  98. System.out.println("**Failure** Retrieved something other than HTML");
  99. return null;
  100. }
  101.  
  102. Element contentDiv = htmlDocument.select("div.mw-content-ltr").first();
  103. Elements javaLinks = contentDiv.getElementsByTag("a");
  104. boolean foundPreface = false;
  105. for(Element link: javaLinks)
  106. {
  107. if(link.text().equalsIgnoreCase("preface"))
  108. foundPreface = true;
  109. if(!link.text().isEmpty() && foundPreface)
  110. {
  111. System.out.println("Title :"+link.text()+"\tURL :"+link.absUrl("href"));
  112. urlMap.put(link.text(),link.absUrl("href"));
  113. }
  114. }
  115. }catch (IOException exp){
  116. System.out.println(exp);
  117. }
  118. return urlMap;
  119. }
  120.  
  121. public List<String> getLinks()
  122. {
  123. return this.links;
  124. }
  125.  
  126. private boolean writeDataIntoFile(String data, String fileName){
  127. try{
  128. BufferedWriter writer = new BufferedWriter(new FileWriter(fileName,true));
  129. //System.out.println("Data: "+data);
  130. writer.write(data);
  131. writer.close();
  132. return true;
  133. }catch (IOException exp){
  134. System.out.println("ERROR!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! "+ exp);
  135. return false;
  136. }
  137. }
  138. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement