Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## for https://stackoverflow.com/q/74650912/6146136
- from bs4 import BeautifulSoup
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- from selenium.webdriver.common.keys import Keys
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- import time
- import pandas
- ## PASTE getDECstars FROM https://pastebin.com/Q0GLwRv9 ## without selenium
- ## PASTE [some version of] linkToSoup FROM https://pastebin.com/rBTr06vy ## without selenium
def login_to_gd(driver, tmout=10, lEmail='YOUR_EMAIL', lPwd='YOUR_PASSWORD'):
    # Log into Glassdoor through the inline email/password form.
    # Not needed if you want to login manually; on any failure (missing
    # form, captcha, timeout) it falls back to prompting for manual login.
    try:
        wait = WebDriverWait(driver, tmout)
        # email step: wait for the field, focus it via JS, type + ENTER
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'input#inlineUserEmail')))
        email_box = driver.find_element(By.CSS_SELECTOR, 'input#inlineUserEmail')
        driver.execute_script("arguments[0].click();", email_box)
        email_box.send_keys(lEmail, Keys.ENTER)
        # password step: same pattern on the password field
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'input#inlineUserPassword')))
        pass_box = driver.find_element(By.CSS_SELECTOR, 'input#inlineUserPassword')
        driver.execute_script("arguments[0].click();", pass_box)
        pass_box.send_keys(lPwd, Keys.ENTER)
        # login is confirmed once the profile container renders
        wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div[data-test="profile-container"]')))
    except Exception as e:
        print(e)
        input('Please login manually and then press ENTER here')
def formCssKey(scEl):
    """Build a stable lookup key from an element's ``css-*`` class names.

    ``scEl`` is either a list of class strings (as returned by bs4's
    ``tag.get('class', [])``) or a selenium WebElement, in which case its
    ``class`` attribute string is split into words. Only classes starting
    with ``'css-'`` are kept; they are sorted and space-joined so the same
    styling yields the same key regardless of original class order.
    Returns '' when no ``css-`` classes are present.
    """
    if isinstance(scEl, list):  # isinstance, not type()==, to accept list subclasses
        classes = [str(s) for s in scEl]
    else:
        # selenium WebElement: read its space-separated class attribute
        classes = str(scEl.get_attribute('class')).split()
    return ' '.join(sorted(w for w in classes if w.startswith('css-')))
def cssToStars(cStr, outOf=5):
    """Decode a Glassdoor star-bar CSS ``background`` value into a rating.

    The filled percentage is embedded in a linear-gradient, e.g.
    ``linear-gradient(90deg, rgb(12, 170, 65) 80%, rgb(222, 224, 227) ...``.
    Returns the value scaled to ``outOf`` stars (3 significant digits), the
    raw percentage if ``outOf`` is not a positive int, or None when ``cStr``
    is missing or doesn't match the expected gradient pattern.
    """
    str_bfr = 'linear-gradient(90deg, rgb(12, 170, 65) '
    str_aft = '%, rgb(222, 224, 227) '
    try:
        perc = float(cStr.split(str_bfr, 1)[1].split(str_aft)[0])
    except (AttributeError, IndexError, TypeError, ValueError):
        # narrowed from a bare except: None cStr, non-string, marker not
        # found, or a non-numeric percentage -> "no rating"
        return None
    if type(outOf) is int and outOf > 0:  # type() is int: deliberately excludes bool
        perc = (perc / 100) * outOf
    return float(f'{perc: .3}')
def linkToSoup_selenium(driver, tmout=10, isv=False, returnErr=False):
    # Wait until the review cards on the current page are visible, then
    # return (soup, star_css_map) where star_css_map maps formCssKey(...)
    # keys to the 'background' CSS of each sub-rating bar (decoded later
    # by cssToStars). On failure returns (str(error) if returnErr else
    # None, {}), printing the exception when isv is truthy.
    try:
        WebDriverWait(driver, tmout).until(
            EC.visibility_of_all_elements_located(
                (By.CSS_SELECTOR, 'li[id^="empReview_"]')))
        subRatSel = 'li[id^="empReview_"] div:has(> .ratingNumber) ~ aside ul > li div:nth-of-type(2)'
        bar_map = {}
        for bar in driver.find_elements(By.CSS_SELECTOR, subRatSel):
            bar_map[formCssKey(bar)] = bar.value_of_css_property('background')
        return BeautifulSoup(driver.page_source, 'html.parser'), bar_map
    except Exception as e:
        if isv:
            print(e)
        if returnErr:
            return str(e), {}
        return None, {}
def scrape_gdRevs(pgUrl, csvFn='empRevs.csv', constBreak=5, breaktime=5, maxScrapes=500):
    """Scrape Glassdoor employee reviews page-by-page starting at pgUrl.

    Results are appended to ``csvFn`` after EVERY page so a crash loses at
    most one page of work; reviewIds already present in ``csvFn`` are
    skipped, which makes the function resumable. Per-scrape stats are kept
    in ``scrapeLogs_{csvFn}``.

    pgUrl      -- URL of the first reviews page
    csvFn      -- output CSV (also read at startup to resume)
    constBreak -- fixed sleep (seconds) before clicking to the next page
    breaktime  -- backoff multiplier; waits breaktime*sci seconds on errors
    maxScrapes -- hard cap on page scrapes so the loop always terminates
    """
    # resume state: previously scraped ids/rows, if the CSV already exists
    try:
        prevDf = pandas.read_csv(csvFn)
        prevs = list(prevDf['reviewId'])
        empRevs = prevDf.to_dict('records')
    except: prevs, empRevs = [], []
    total_allTime = len(empRevs)
    total_current = 0
    ### JUST FOR STATS ###
    try: scrapeLogs = pandas.read_csv(f'scrapeLogs_{csvFn}').to_dict('records')
    except: scrapeLogs = []
    ######################
    totalRevs = 'UNKNOWN'  # site-reported total, parsed from the pagination footer
    # selectors for the pros/cons paragraphs inside a review card
    pcCon = 'div.px-std:has(h2 > a.reviewLink) + div.px-std'
    pcDiv = f'{pcCon} div.v2__EIReviewDetailsV2__fullWidth'
    # per-review output field -> CSS selector (relative to the review <li>)
    refDict = {
        'rating_num': 'span.ratingNumber',
        'emp_status': 'div:has(> div > span.ratingNumber) + span',
        'header': 'h2 > a.reviewLink',
        'subheader': 'h2:has(> a.reviewLink) + span',
        'pros': f'{pcDiv}:first-of-type > p.pb',
        'cons': f'{pcDiv}:nth-of-type(2) > p.pb'
    }
    # I copy chromedriver.exe to the same folder as this py file ## for selenium
    driverG = webdriver.Chrome() ## for selenium
    driverG.get('https://www.glassdoor.com') ## for selenium
    login_to_gd(driverG) # REMOVE if you want to login manually ## for selenium
    # input('Please LogIn and then press enter here') # manual login ## for selenium
    driverG.get(pgUrl) ## for selenium
    # sub-rating rows (work/life balance etc.) and the pagination footer text
    subRatSel = 'div:has(> .ratingNumber) ~ aside ul > li:has(div ~ div)'
    pgftSel = 'div[data-test="pagination-footer-text"]'
    for sci in range(maxScrapes):
        scn = f'[{sci + 1} of {maxScrapes}]'
        print('', end=f'\r{scn} scraping {pgUrl}')
        # soup = linkToSoup(pgUrl, isv=True, returnErr=True) ## without selenium
        soup, srDict = linkToSoup_selenium(driverG, isv=True, returnErr=True) ## for selenium
        # a str here is an error message -> log it, back off linearly, retry
        if type(soup) == str:
            scrapeLogs.append(
                {'scrapeNum': sci+1, 'errorMsg': soup, 'url': pgUrl}
            ) # JUST FOR STATS ###
            # break # if you want to stop at first error # OR take a break:
            waitMsg = f'!{soup}! {breaktime*sci}s break before retrying'
            print('', end=f'\r{scn} {waitMsg} {pgUrl}')
            time.sleep(breaktime*sci)
            continue
        ### JUST FOR STATS ###
        try: curPg = soup.select_one('li a.page.selected').get_text().strip()
        except: curPg = 'UNKNOWN'
        if curPg.isdigit(): curPg = int(curPg)
        try: ftrTxt = soup.select_one(pgftSel).get_text().strip()
        except: ftrTxt = 'reviewCount UNKNOWN'
        # footer reads like "Viewing 1 - 10 of 12,345 reviews" -> grab the total
        try: tRevs = ftrTxt.strip().strip().split('of')[-1].split()[0].replace(',', '')
        except: tRevs = 'UNKNOWN'
        if tRevs.isdigit(): totalRevs = int(tRevs)
        print('', end=f'\r{scn} scraping "{ftrTxt}" from page#{curPg} {pgUrl}')
        ######################
        newRevIds, pgRevIds = [], [] # JUST FOR STATS ###
        rSoups = soup.select('li[id^="empReview_"]')  # one <li> per review card
        for r in rSoups:
            rId = r.get('id')
            pgRevIds.append(rId) # JUST FOR STATS ###
            if rId in prevs: continue # skip duplicates
            newRevIds.append(rId) # JUST FOR STATS ###
            rDet = {'reviewId': rId}
            # sub-ratings: label text + star value decoded from the bar's CSS
            for sr in r.select(subRatSel):
                k = sr.select_one('div:first-of-type').get_text(' ').strip()
                # sval = getDECstars(sr.select_one('div:nth-of-type(2)'), soup) ## without selenium
                kc = formCssKey(sr.select_one('div:nth-of-type(2)').get('class', [])) ## for selenium
                sval = cssToStars(srDict[kc]) if kc in srDict else None ## for selenium
                rDet[f'[rating] {k}'] = sval
            # plain-text fields (rating, status, header, pros, cons, ...)
            for k, sel in refDict.items():
                sval = r.select_one(sel)
                if sval: sval = sval.get_text(' ').strip()
                rDet[k] = sval
            empRevs.append(rDet)
            prevs.append(rId)
        # persist after every page so progress survives a crash
        pandas.DataFrame(empRevs).to_csv(csvFn, index=False)
        total_current += len(newRevIds)
        total_allTime = len(empRevs)
        ### JUST FOR STATS ###
        for_sl = {
            'scrapeNum': sci+1, 'curPg': curPg, 'totalRevs': tRevs,
            'pgFooter': ftrTxt, 'allCt': len(pgRevIds),
            'uniqCt': len(set(pgRevIds)), 'newCt': len(newRevIds),
            'allRevs': pgRevIds, 'newRevs': newRevIds, 'url': pgUrl
        }
        if not rSoups:
            # keep a whitespace-flattened copy of the page HTML for diagnosis
            for_sl['errorMsg'] = 'No reviews found in ' + ''.join([
                ' '.join(w for w in l.split() if w)
                for l in str(soup).splitlines() if l
            ])
        scrapeLogs.append(for_sl)
        pandas.DataFrame(scrapeLogs).to_csv(f'scrapeLogs_{csvFn}', index=False)
        ######################
        rCt = len(rSoups)
        print(f'\r{scn} scraped {rCt} "{ftrTxt}" from page#{curPg} {pgUrl}')
        # nextPg = soup.select_one('li:has(a.page.selected) + li a.page[href]') ## without selenium
        nextPg = driverG.find_elements(By.CSS_SELECTOR, 'li:has(a.page.selected) + li a.page[href]') ## for selenium
        if nextPg:
            # pgUrl = 'https://www.glassdoor.com/' + nextPg.get('href') ## without selenium
            if constBreak > 0: time.sleep(constBreak)  # be polite between pages
            scrollElToBottom = 'arguments[0].scrollIntoView(false);' ## for selenium
            driverG.execute_script(scrollElToBottom, nextPg[0]) ## for selenium
            nextPg[0].click() ## for selenium
        elif not rSoups: # remove if you want to stop at first error
            # empty page AND no next link: back off and re-scrape the same page
            print('', end=f'\r{scn} {breaktime*sci}s break before retrying {pgUrl}')
            time.sleep(breaktime*sci)
        else: break # last page
    driverG.close() ## for selenium
    del driverG # (just in case) ## for selenium
    print('\n\n\n total reviews: ', totalRevs)
    print('total reviews scraped this run:', total_current)
    print('total reviews scraped over all time:', total_allTime)
- # startUrl = 'https://www.glassdoor.com/Reviews/Walmart-Reviews-E715.htm?filter.iso3Language=eng'
- # scrape_gdRevs(startUrl)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement