Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import random
import time

import numpy as np
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
## helper function to consolidate two dictionaries
def merge_two_dicts(x, y):
    """Return a new dictionary holding the entries of ``x`` updated with ``y``.

    Neither argument is modified. When a key is present in both, the value
    from ``y`` wins.
    """
    merged = x.copy()  ## start with x's keys and values
    merged.update(y)  ## overlay y's keys and values (update works in place, returns None)
    return merged
## xpath of the NEXT button on the first review page (PREVIOUS is missing there)
_NEXT_XPATH_FIRST_PAGE = '//*[@id="pr-review-display"]/footer/div/aside/button'
## xpath of the NEXT button on later pages; it is the second button since PREVIOUS is also present
_NEXT_XPATH_LATER_PAGES = '//*[@id="pr-review-display"]/footer/div/aside/button[2]'


def _pause(base, jitter):
    """Sleep for ``base`` seconds plus a random whole number of seconds in [0, ``jitter``]."""
    time.sleep(base + random.randint(0, jitter))


def _extract_item_info(soup):
    """Return the item-level fields of a product page, or None if the item is unavailable.

    The category breadcrumb is checked first: when it is absent the item is
    treated as not available and nothing else on the page is dereferenced.
    """
    breadcrumb = soup.find("a", {"class": "breadcrumb-element"})
    if breadcrumb is None:  ## if category not present, item is not available
        return None
    category = breadcrumb.text.lower()
    ## extract item id
    item_id = soup.find("div", {"class": "product-number"}).find("span").text
    ## extract available product sizes
    size_swatches = soup.find("ul", {"class": "swatches size"})
    product_sizes = [
        size.text.strip().lower()
        for size in size_swatches.find_all("li", {"class": "selectable variation-group-value"})
    ]
    return {"category": category, "item_id": item_id, "product_sizes": product_sizes}


def _extract_review_metadata(soup):
    """Return one entry per review article on the page.

    Each entry is a list of ``(field, response)`` tuples, or None when the
    review has no fit feedback / customer measurements section.
    """
    review_metadata = []
    for review in soup.find_all("article", {"class": "pr-review"}):
        accordion = review.find("div", {"class": "pr-accordion-content"})
        if accordion is None:
            review_metadata.append(None)
            continue
        ## <dt> elements contain metadata field name like "fit", "length" etc
        ## <dd> elements contain reviewer's response for those metadata fields like "small", "just right" etc
        review_metadata.append([
            (definition.find("dt").text.lower(), definition.find("dd").text.lower())
            for definition in accordion.find_all("dl", {"class": "pr-rd-def-list"})
        ])
    return review_metadata


def _extract_review_records(soup, item_info):
    """Return one consolidated dict per review on the page that has review metadata.

    Reviews without metadata are ignored, as fit feedback is an essential
    signal for us.
    """
    ## extract reviewer details
    reviewer_details = soup.find_all(
        "div", {"class": "pr-rd-reviewer-details pr-rd-inner-side-content-block"})
    ## In ModCloth, reviewer name appears as "By REVIEWER_NAME"
    ## Splitting at the end is to remove "By" to get only the actual reviewer name
    reviewers_name = [
        reviewer.find("p", {"class": "pr-rd-details pr-rd-author-nickname"}).text.split('\n')[-1].strip()
        for reviewer in reviewer_details
    ]
    ## extract the fit feedback and customer measurements data (review_metadata)
    review_metadata = _extract_review_metadata(soup)
    ## extract review text
    review_text = [txt.text for txt in soup.find_all("p", {"class": "pr-rd-description-text"})]
    review_summary = [txt.text for txt in soup.find_all("h2", {"class": "pr-rd-review-headline"})]

    ## NOTE(review): the lists above are collected independently and matched up by
    ## position, which assumes every review on the page has each element -- confirm.
    records = []
    for j, user_name in enumerate(reviewers_name):
        if review_metadata[j] is None:
            continue
        reviewer_fields = merge_two_dicts({"user_name": user_name}, dict(review_metadata[j]))
        records.append(merge_two_dicts(
            {"review_text": review_text[j], "review_summary": review_summary[j]},
            merge_two_dicts(reviewer_fields, item_info)))
    return records


def _go_to_next_review_page(browser, first_page):
    """Click the NEXT button of the review pager; return False when there is none."""
    xpath = _NEXT_XPATH_FIRST_PAGE if first_page else _NEXT_XPATH_LATER_PAGES
    try:
        next_button = browser.find_element(By.XPATH, xpath)
    except NoSuchElementException:  ## No NEXT button present, no more pages left
        return False
    ## execute click on NEXT through JavaScript
    browser.execute_script("arguments[0].click();", next_button)
    return True


scraped_data = []
for iterr, url in enumerate(tops_link):
    ## open the URL in browser
    try:
        browser.get(url)
    except WebDriverException:  ## when extracted URL is invalid
        print('invalid url', iterr)
        continue
    time.sleep(4)

    ## the initial page contains only the NEXT button (PREVIOUS is missing)
    first_page = True
    ## repeat until we run out of review pages
    while True:
        ## get the webpage content
        soup = BeautifulSoup(browser.page_source, "lxml")

        item_info = _extract_item_info(soup)
        if item_info is None:
            _pause(15, 10)
            break

        ## consolidate all the extracted data
        scraped_data.extend(_extract_review_records(soup, item_info))

        if not _go_to_next_review_page(browser, first_page):
            _pause(15, 10)
            break
        first_page = False
        _pause(10, 5)

## save the extracted data locally
np.save('./scraped_data_tops.npy', scraped_data)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement