Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package best;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- public class Crawling {
- public Document getDocument(String url) throws IOException {
- Document document = Jsoup.connect(url)
- .header("Accept",
- "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
- .header("Accept-Encoding", "gzip, deflate, sdch, br")
- .header("Accept-Language", "ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4")
- .header("User-Agent",
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
- .header("Connection","Keep-Alive")
- .header("Host",/*HOSTNAME*/)
- .get();
- return document;
- }
- public List<String> attrSrcCrawling(Document document, String CSSSelect) throws IOException {
- List<String> list = new ArrayList<>();
- Elements attrSrcElements = document.select(CSSSelect);
- for (Element attrSrcEl : attrSrcElements) {
- list.add(attrSrcEl.attr("src"));
- System.out.println(attrSrcEl.attr("src"));
- }
- return list;
- }
- public List<String> textCrawling(Document document, String CSSSelect) throws IOException {
- List<String> list = new ArrayList<>();
- Elements textElements = document.select(CSSSelect);
- for (Element textEl : textElements) {
- list.add(textEl.text());
- System.out.println(textEl.text());
- }
- return list;
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement