Advertisement
FancyKing

使用Jsoup抓取携程旅游网全国城市信息

Apr 3rd, 2020
203
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 1.82 KB | None | 0 0
  1. package step4;
  2. import java.io.File;
  3. import java.io.IOException;
  4. import java.util.ArrayList;
  5. import java.util.List;
  6. import org.jsoup.Jsoup;
  7. import org.jsoup.nodes.Document;
  8. import org.jsoup.nodes.Element;
  9. import org.jsoup.select.Elements;
  10. public class Task {
  11.    
  12.     //通过filePath文件路径获取Docment对象
  13.     public Document getDoc(String filePath) throws IOException{
  14.         /**********   Begin   **********/
  15.        
  16.  
  17.         File file=new File("backups/hotels.ctrip.com_domestic-city-hotel.txt");
  18.         Document doc=Jsoup.parse(file,"UTF-8","http://hotels.ctrip.com/");
  19.         return doc;
  20.         /**********   End   **********/
  21.     }
  22.    
  23.     /**
  24.      * 获取所有城市返回城市信息集合
  25.      * @param doc  
  26.      * @return
  27.      */
  28.     public List<HotelCity> getAllCitys(Document doc){
  29.         /**********   Begin   **********/
  30.  
  31.        
  32.        
  33.         List<HotelCity> cities = new ArrayList<HotelCity>();
  34.        
  35.         Elements aa= doc.getElementsByClass("pinyin_filter_detail layoutfix");
  36.         Element pp = aa.first();
  37.         Elements hh= pp.getElementsByTag("dd");
  38.         Elements hts=pp.getElementsByTag("dt");
  39.        
  40.        for (int i = 0; i < hh.size(); i++) {
  41.             Element bb = hts.get(i);
  42.             Element head_hotelsLink = hh.get(i);
  43.             Elements links = head_hotelsLink.children();
  44.            
  45.             for (Element link : links) {
  46.                 String pinyin_cityId = link.attr("href").replace("/hotel/", "");
  47.                 String pinyin = pinyin_cityId.replace(StringUtil.getNumbers(link.attr("href")), "");//截取拼音
  48.                 HotelCity city = new HotelCity();
  49.                 city.setCityId(StringUtil.getNumbers(link.attr("href"))); //截取cityId
  50.                 city.setCityName(link.text());
  51.                 city.setHeadPinyin(bb.text());
  52.                 city.setPinyin(pinyin);
  53.                 cities.add(city);
  54.             }
  55.         }
  56.         return cities;
  57.         /**********   End   **********/
  58.     }
  59. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement