Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- public class HtmlParser{
- private String adr= "https://mi-shop.com/ru/catalog/";
- private URL categoriesUrl = new URL("https://mi-shop.com/ru/");
- private URL mainUrl = new URL("https://mi-shop.com");
- private HttpURLConnection conn = null;
- private BufferedReader br = null;
- private String str;
- private String getPageCode() throws IOException {
- categoriesUrl = new URL(adr);
- conn = (HttpURLConnection)categoriesUrl.openConnection();
- br = new BufferedReader(new InputStreamReader(
- conn.getInputStream())
- ); //
- String page="";
- while((str = br.readLine()) != null){
- page+=str;
- }
- br.close();
- return page;
- }
- private List<URL> getCategories() {
- HtmlCleaner cleaner = new HtmlCleaner();
- TagNode html = null;
- try {
- html = cleaner.clean(categoriesUrl);
- } catch (IOException e) {
- e.printStackTrace();
- }
- Object[] objects = null;
- try {
- objects = html.evaluateXPath("//div[@class='container']/div[@class='menu']/ul[@class='menu-list']/li[@class='menu-item']/a[@class='menu-link']/@href");
- } catch (XPatherException e) {
- e.printStackTrace();
- }
- List<String> stringList = new ArrayList<String>();
- for (int i = 0; i < objects.length -2; i++) {
- stringList.add(objects[i].toString().trim());
- }
- List<URL> catalogs = new ArrayList<URL>();
- for (String catalog : stringList) {
- try {
- catalogs.add(new URL(mainUrl.toString()+catalog));
- } catch (MalformedURLException e) {
- e.printStackTrace();
- }
- }
- return catalogs;
- }
- public HtmlParser() throws MalformedURLException {
- }
- public void parse() throws IOException {
- String page = getPageCode();
- HtmlCleaner cleaner = new HtmlCleaner();
- TagNode html = null;
- //System.out.println(page);
- List<URL> catalogs = getCategories();
- //System.out.println(catalogs);
- Object[] products = null;
- html = cleaner.clean(catalogs.get(0));
- try {
- products = html.evaluateXPath("//div[@class='main-catalog']/div[@class='main-catalog-item']/div[@class='main-catalog-inner']");
- } catch (XPatherException e) {
- e.printStackTrace();
- }
- List<TagNode> innerElements = new ArrayList<TagNode>(100);
- for (Object obj : products) {
- innerElements.add((TagNode)obj);
- }
- for (TagNode innerElement : innerElements) {
- try {
- String img = innerElement.evaluateXPath("//div[@class='main-catalog-media']/img/@src")[0].toString().trim();
- //System.out.println(img);
- String href = innerElement.evaluateXPath("//a[@class='main-catalog-link']/@href")[0].toString().trim();
- //System.out.println(href);
- String title = innerElement.evaluateXPath("//div[@class='main-catalog-title']/text()")[0].toString().trim();
- //System.out.println(title);
- String price = innerElement.evaluateXPath("//div[@class='main-catalog-price']/text()")[0].toString().trim();
- System.out.println(price);
- } catch (XPatherException e) {
- e.printStackTrace();
- }
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement