Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Download one page of product reviews from ceneo.pl and parse each review.

The script fetches ``https://www.ceneo.pl/<product_id>/opinie-<page_num>``,
prints the review count advertised on the page, then walks every
``li.review-box`` element, extracting its fields and printing the text of the
``div.cons-cell`` list for each review.
"""
import urllib.request
from datetime import datetime

from bs4 import BeautifulSoup

# Pieces of the review-page URL: <prefix><product_id><postfix><page_num>
prefix = "https://www.ceneo.pl/"
postfix = "/opinie-"
product_id = "45498942"
page_num = 1
url = prefix + product_id + postfix + str(page_num)

# Format of the `datetime` attribute carried by the <time> tags of a review.
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"

# Download the page content; the context manager closes the connection.
with urllib.request.urlopen(url) as site:
    page = site.read()

page_tree = BeautifulSoup(page, "html.parser")
opinions_num = int(
    page_tree.find("span", attrs={"itemprop": "reviewCount"}).string
)
print(opinions_num)

# Parse the page markup: one <li class="review-box"> per review.
opinions = page_tree.select("li.review-box")
for opinion in opinions:
    opinion_id = int(opinion["data-entry-id"])
    author = opinion.select("div.reviewer-name-line").pop().string

    # Not every review carries a recommendation; "BRAK" marks its absence.
    try:
        recommendation = (
            opinion.select("div.product-review-summary > em").pop().string
        )
    except IndexError:
        recommendation = "BRAK"

    stars = opinion.select("span.review-score-count").pop().string
    content = opinion.select("p.product-review-body").pop().get_text()
    useful = opinion.select("[id^=votes-yes]").pop().string
    unuseful = opinion.select("[id^=votes-no]").pop().string

    # Consume the <time> tags in document order with pop(0): the first one is
    # taken as the date the review was added, the optional second one as the
    # purchase date. A bare pop() would take them from the end and swap the
    # two whenever both are present.
    timestamps = opinion.select("div > span.review-time > time")
    add_date = datetime.strptime(
        timestamps.pop(0)["datetime"], DATETIME_FORMAT
    )
    if timestamps:
        purchase_date = datetime.strptime(
            timestamps.pop(0)["datetime"], DATETIME_FORMAT
        )
    else:
        purchase_date = None

    # A review without this list yields an empty selection; fall back to None
    # instead of letting IndexError abort the whole loop.
    # NOTE(review): the selector targets "cons-cell" although this variable
    # was previously named after advantages - confirm which list is wanted.
    try:
        cons = opinion.select("div.cons-cell > ul").pop().get_text()
    except IndexError:
        cons = None
    print(cons)
    # Debug helper:
    # print(author, opinion_id, recommendation, stars, content, useful, unuseful, end="\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement