from selenium.common.exceptions import NoSuchElementException, WebDriverException
from bs4 import BeautifulSoup
import numpy as np
import random
import time

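## NOTE: `browser` (a Selenium WebDriver) and `tops_link` (the list of product
## page URLs to scrape) are assumed to be defined earlier in the full script;
## a minimal, hypothetical setup might look like:
##   from selenium import webdriver
##   browser = webdriver.Chrome()
##   tops_link = [...]  # product URLs gathered in a previous crawling step
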
## helper function to consolidate two dictionaries
def merge_two_dicts(x, y):
    z = x.copy()   # start with x's keys and values
    z.update(y)    # modifies z with y's keys and values & returns None
    return z

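## quick sanity check of the helper (equivalent to {**x, **y} on Python 3.5+;
## values from y overwrite duplicates in x):
##   merge_two_dicts({"user_name": "jane"}, {"fit": "small"})
##   -> {"user_name": "jane", "fit": "small"}
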
scraped_data = []
for iterr in range(len(tops_link)):
    init = 0
    url = tops_link[iterr]

    ## open the URL in the browser
    try:
        browser.get(url)
        time.sleep(4)
    except WebDriverException:  ## when the extracted URL is invalid
        print('invalid url', iterr)
        continue

    ## repeat until we run out of review pages
    while True:
        ## get the webpage content (re-parsed on every iteration, once per review page)
        content = browser.page_source
        soup = BeautifulSoup(content, "lxml")

        ## extract reviewer details
        reviewer_details = soup.find_all("div", {"class": "pr-rd-reviewer-details pr-rd-inner-side-content-block"})

        ## extract reviewers' names
        reviewers_name = []
        for reviewer in reviewer_details:
            ## on ModCloth the reviewer name appears as "By REVIEWER_NAME";
            ## splitting drops the "By" line to keep only the actual reviewer name
            reviewer_name = reviewer.find("p", {"class": "pr-rd-details pr-rd-author-nickname"}).text.split('\n')[-1].strip()
            reviewers_name.append(reviewer_name)

        ## extract "isVerified" information (collected here but not merged into the records below)
        isVerified = soup.find_all("span", {"class": "pr-rd-badging-text"})

        ## extract the fit feedback and customer measurements data (review metadata)
        review_data = soup.find_all("article", {"class": "pr-review"})
        review_metadata_raw = []

        for i in range(len(review_data)):
            review_metadata_raw.append(review_data[i].find("div", {"class": "pr-accordion-content"}))

        ## extract the HTML elements which contain the review metadata
        review_metadata_elements = [review_metadata_raw[i].find_all("dl", {"class": "pr-rd-def-list"})
                                    if review_metadata_raw[i] is not None else None
                                    for i in range(len(review_metadata_raw))]

        ## extract the actual data from the HTML elements
        review_metadata = []
        for element in review_metadata_elements:
            if element is None:
                review_metadata.append(None)
                continue
            ## <dt> elements contain metadata field names like "fit", "length", etc.;
            ## <dd> elements contain the reviewer's response for those fields, like "small" or "just right"
            review_metadata.append([(element[i].find("dt").text.lower(), element[i].find("dd").text.lower())
                                    for i in range(len(element))])

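        ## e.g. one entry of review_metadata could look like
        ## [("fit", "just right"), ("length", "slightly long")]  (illustrative values)
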
        ## extract review text and summary
        review_text = [txt.text for txt in soup.find_all("p", {"class": "pr-rd-description-text"})]
        review_summary = [txt.text for txt in soup.find_all("h2", {"class": "pr-rd-review-headline"})]

        ## extract the item id
        item_id = soup.find("div", {"class": "product-number"}).find("span").text

        ## extract the item category
        try:
            category = soup.find("a", {"class": "breadcrumb-element"}).text.lower()
        except AttributeError:  ## if the category is not present, the item is not available
            time.sleep(15 + random.randint(0, 10))
            break

        ## extract the available product sizes
        product_sizes = [i.text.strip().lower()
                         for i in soup.find("ul", {"class": "swatches size"})
                                      .find_all("li", {"class": "selectable variation-group-value"})]
        item_info = {"category": category, "item_id": item_id, "product_sizes": product_sizes}

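        ## e.g. item_info -> {"category": "tops", "item_id": "10046412",
        ##                    "product_sizes": ["small", "medium", "large"]}  (illustrative values)
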
        ## consolidate all the extracted data; records without any review metadata
        ## are skipped, since fit feedback is an essential signal for us
        scraped_data.extend([merge_two_dicts({"review_text": review_text[j], "review_summary": review_summary[j]},
                                             merge_two_dicts(merge_two_dicts({"user_name": reviewers_name[j]},
                                                                             {data[0]: data[1] for data in review_metadata[j]}),
                                                             item_info))
                             for j in range(len(reviewer_details)) if review_metadata_raw[j] is not None])

        ## the initial page contains only a NEXT button (PREVIOUS is missing),
        ## so its xpath differs from the one used on subsequent pages
        if init == 0:
            try:
                init = 1
                ## click NEXT via its xpath
                browser.execute_script("arguments[0].click();",
                                       browser.find_element_by_xpath('//*[@id="pr-review-display"]/footer/div/aside/button'))
                time.sleep(10 + random.randint(0, 5))
            except NoSuchElementException:  ## no NEXT button present, i.e. fewer than 10 reviews
                time.sleep(15 + random.randint(0, 10))
                break
        else:
            try:
                ## click NEXT via its xpath; the xpath is different here since a
                ## PREVIOUS button is also present now
                browser.execute_script("arguments[0].click();",
                                       browser.find_element_by_xpath('//*[@id="pr-review-display"]/footer/div/aside/button[2]'))
                time.sleep(10 + random.randint(0, 5))
            except NoSuchElementException:  ## no NEXT button, no more pages left
                time.sleep(15 + random.randint(0, 10))
                break

## save the extracted data locally (a list of dicts, stored as a NumPy object array)
np.save('./scraped_data_tops.npy', scraped_data)
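
## to load the saved records back later, an object array needs allow_pickle=True
## on NumPy >= 1.16.3; a quick check could be:
##   data = np.load('./scraped_data_tops.npy', allow_pickle=True)
##   print(len(data), data[0]["review_summary"])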