Advertisement
Guest User

Untitled

a guest
Aug 22nd, 2017
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.49 KB | None | 0 0
  1. package best;
  2.  
  3. import java.io.IOException;
  4. import java.util.ArrayList;
  5. import java.util.List;
  6.  
  7. import org.jsoup.Jsoup;
  8. import org.jsoup.nodes.Document;
  9. import org.jsoup.nodes.Element;
  10. import org.jsoup.select.Elements;
  11.  
  12. public class Crawling {
  13.  
  14. public Document getDocument(String url) throws IOException {
  15.  
  16. Document document = Jsoup.connect(url)
  17. .header("Accept",
  18. "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
  19. .header("Accept-Encoding", "gzip, deflate, sdch, br")
  20. .header("Accept-Language", "ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4")
  21. .header("User-Agent",
  22. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
  23. .header("Connection","Keep-Alive")
  24. .header("Host",/*HOSTNAME*/)
  25. .get();
  26.  
  27. return document;
  28. }
  29.  
  30. public List<String> attrSrcCrawling(Document document, String CSSSelect) throws IOException {
  31. List<String> list = new ArrayList<>();
  32.  
  33. Elements attrSrcElements = document.select(CSSSelect);
  34.  
  35. for (Element attrSrcEl : attrSrcElements) {
  36. list.add(attrSrcEl.attr("src"));
  37. System.out.println(attrSrcEl.attr("src"));
  38. }
  39. return list;
  40. }
  41.  
  42. public List<String> textCrawling(Document document, String CSSSelect) throws IOException {
  43. List<String> list = new ArrayList<>();
  44.  
  45. Elements textElements = document.select(CSSSelect);
  46.  
  47. for (Element textEl : textElements) {
  48. list.add(textEl.text());
  49. System.out.println(textEl.text());
  50. }
  51. return list;
  52. }
  53. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement