Advertisement
Guest User

Untitled

a guest
Feb 21st, 2019
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 3.48 KB | None | 0 0
  1. public class HtmlParser{
  2.     private String adr= "https://mi-shop.com/ru/catalog/";
  3.     private URL categoriesUrl = new URL("https://mi-shop.com/ru/");
  4.     private URL mainUrl = new URL("https://mi-shop.com");
  5.     private HttpURLConnection conn = null;
  6.     private BufferedReader br = null;
  7.     private String str;
  8.  
  9.     private String getPageCode() throws IOException {
  10.         categoriesUrl = new URL(adr);
  11.         conn = (HttpURLConnection)categoriesUrl.openConnection();
  12.         br = new BufferedReader(new InputStreamReader(
  13.                 conn.getInputStream())
  14.         ); //
  15.         String page="";
  16.         while((str = br.readLine()) != null){
  17.             page+=str;
  18.         }
  19.         br.close();
  20.         return page;
  21.     }
  22.  
  23.     private List<URL> getCategories() {
  24.         HtmlCleaner cleaner = new HtmlCleaner();
  25.         TagNode html = null;
  26.         try {
  27.             html = cleaner.clean(categoriesUrl);
  28.         } catch (IOException e) {
  29.             e.printStackTrace();
  30.         }
  31.         Object[] objects = null;
  32.         try {
  33.             objects = html.evaluateXPath("//div[@class='container']/div[@class='menu']/ul[@class='menu-list']/li[@class='menu-item']/a[@class='menu-link']/@href");
  34.         } catch (XPatherException e) {
  35.             e.printStackTrace();
  36.         }
  37.         List<String> stringList = new ArrayList<String>();
  38.         for (int i = 0; i < objects.length -2; i++) {
  39.             stringList.add(objects[i].toString().trim());
  40.         }
  41.         List<URL> catalogs = new ArrayList<URL>();
  42.         for (String catalog : stringList) {
  43.             try {
  44.                 catalogs.add(new URL(mainUrl.toString()+catalog));
  45.             } catch (MalformedURLException e) {
  46.                 e.printStackTrace();
  47.             }
  48.         }
  49.         return catalogs;
  50.     }
  51.  
  52.  
  53.  
  54.  
  55.  
  56.  
  57.  
  58.     public HtmlParser() throws MalformedURLException {
  59.     }
  60.  
  61.     public void parse() throws IOException {
  62.         String page = getPageCode();
  63.         HtmlCleaner cleaner = new HtmlCleaner();
  64.         TagNode html = null;
  65.         //System.out.println(page);
  66.         List<URL> catalogs = getCategories();
  67.         //System.out.println(catalogs);
  68.         Object[] products = null;
  69.         html = cleaner.clean(catalogs.get(0));
  70.         try {
  71.             products =  html.evaluateXPath("//div[@class='main-catalog']/div[@class='main-catalog-item']/div[@class='main-catalog-inner']");
  72.         } catch (XPatherException e) {
  73.             e.printStackTrace();
  74.         }
  75.         List<TagNode> innerElements = new ArrayList<TagNode>(100);
  76.         for (Object obj : products) {
  77.             innerElements.add((TagNode)obj);
  78.         }
  79.         for (TagNode innerElement : innerElements) {
  80.             try {
  81.                 String img = innerElement.evaluateXPath("//div[@class='main-catalog-media']/img/@src")[0].toString().trim();
  82.                 //System.out.println(img);
  83.                 String href = innerElement.evaluateXPath("//a[@class='main-catalog-link']/@href")[0].toString().trim();
  84.                 //System.out.println(href);
  85.                 String title = innerElement.evaluateXPath("//div[@class='main-catalog-title']/text()")[0].toString().trim();
  86.                 //System.out.println(title);
  87.                 String price = innerElement.evaluateXPath("//div[@class='main-catalog-price']/text()")[0].toString().trim();
  88.                 System.out.println(price);
  89.             } catch (XPatherException e) {
  90.                 e.printStackTrace();
  91.             }
  92.  
  93.         }
  94.   }
  95.  
  96.  
  97.  
  98.  
  99. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement