Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package step3;
- import java.io.File;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- public class Task {
- //通过filePath文件路径获取Docment对象
- public Document getDoc(String filePath) throws IOException{
- /********** Begin **********/
- File file=new File("./backups/hotel.ctrip.com.txt");
- Document doc=Jsoup.parse(file,"UTF-8","http://hotels.ctrip.com/");
- return doc;
- /********** End **********/
- }
- //获取所有链接
- public List<String> getLinks(Document doc){
- /********** Begin **********/
- List<String> ar=new ArrayList<>();
- Elements kk=doc.select("a[href]");
- for(Element gg:kk){
- ar.add(gg.tagName()+"$"+gg.attr("abs:href")+"("+gg.text()+")");
- }
- return ar;
- /********** End **********/
- }
- //获取图片
- public List<String> getMedia(Document doc){
- /********** Begin **********/
- List<String> list=new ArrayList<>();
- Elements ll=doc.select("[src]");
- for(Element h:ll){
- if(h.tagName().equals("img")){
- list.add(h.tagName()+"$"+h.attr("abs:src"));
- }
- }
- return list;
- /********** End **********/
- }
- //获取link[href]链接
- public List<String> getImports(Document doc){
- /********** Begin **********/
- List<String> list=new ArrayList<>();
- Elements kk=doc.select("link[href]");
- for(Element g:kk){
- list.add(g.tagName()+"$"+g.attr("abs:href")+"("+g.attr("rel")+")");
- }
- return list;
- /********** End **********/
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement