Advertisement
Typhoon

cat_subcat_extractor.py

Jul 12th, 2016
144
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.81 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3.  
  4. from bs4 import BeautifulSoup
  5. import urllib.request as urllib
  6.  
  7. def category_parser ():
  8.     checkurl = 'http://www.bazos.sk/'
  9.     checkpage = urllib.urlopen(checkurl)
  10.     checksoup = BeautifulSoup(checkpage.read(), "html.parser")
  11.  
  12.     categories = {}
  13.  
  14.     for category in checksoup.select(".nadpisnahlavni a"):
  15.         #print(category.getText())
  16.         #print(category['href'])
  17.         subcat_url = category['href']
  18.         checksub = urllib.urlopen(subcat_url)
  19.         checksubsoup = BeautifulSoup(checksub.read(), "html.parser")
  20.         subcategories = checksubsoup.select(".barvaleva a")
  21.         for subcategory in subcategories:
  22.             subcategory = subcategory.getText()
  23.             #print(subcategory)
  24.             categories[subcategory]=category.getText()
  25.  
  26.     return categories
  27.  
  28. #print("\n\n#######")
  29. #print(category_parser())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement