## for https://stackoverflow.com/q/74650912/6146136
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas
## PASTE getDECstars FROM https://pastebin.com/Q0GLwRv9 ## without selenium
## PASTE [some version of] linkToSoup FROM https://pastebin.com/rBTr06vy ## without selenium
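## If you go the "without selenium" route, the block below is only a minimal
## sketch of what a requests-based linkToSoup could look like (it is NOT the
## original helper from the pastebin link above); it assumes the helper should
## return a BeautifulSoup on success and the error string (or None) on failure,
## to match the commented-out `soup = linkToSoup(pgUrl, ...)` call further down.
## It is left commented out, like the rest of the no-selenium path.
# import requests
# def linkToSoup(url, isv=False, returnErr=False):
#     try:
#         r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
#         r.raise_for_status()  # raise if the response is an HTTP error
#         return BeautifulSoup(r.content, 'html.parser')
#     except Exception as e:
#         if isv: print(e)
#         return str(e) if returnErr else None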
def login_to_gd(driver, tmout=10, lEmail='YOUR_EMAIL', lPwd='YOUR_PASSWORD'):
    # not needed if you want to login manually
    try:
        WebDriverWait(driver, tmout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input#inlineUserEmail')))
        uemInp = driver.find_element(By.CSS_SELECTOR, 'input#inlineUserEmail')
        driver.execute_script("arguments[0].click();", uemInp)
        uemInp.send_keys(lEmail, Keys.ENTER)

        WebDriverWait(driver, tmout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input#inlineUserPassword')))
        pwdInp = driver.find_element(By.CSS_SELECTOR, 'input#inlineUserPassword')
        driver.execute_script("arguments[0].click();", pwdInp)
        pwdInp.send_keys(lPwd, Keys.ENTER)

        WebDriverWait(driver, tmout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-test="profile-container"]')))
    except Exception as e:
        print(e)
        input('Please login manually and then press ENTER here')
def formCssKey(scEl):
    if type(scEl) == list: c = [str(s) for s in scEl]
    else: c = str(scEl.get_attribute('class')).split()
    return ' '.join(sorted(w for w in c if w.startswith('css-')))

def cssToStars(cStr, outOf=5):
    try:
        str_bfr, str_aft = 'linear-gradient(90deg, rgb(12, 170, 65) ', '%, rgb(222, 224, 227) '
        perc = float(cStr.split(str_bfr, 1)[1].split(str_aft)[0])
        if type(outOf) == int and outOf > 0: perc = (perc/100)*outOf
        return float(f'{perc: .3}')
    except: return None
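## example of how the two helpers above combine (hypothetical class list and
## gradient string, assuming the green/grey star-bar style Glassdoor was using):
# formCssKey(['noti_cls', 'css-1nuumx7'])  # -> 'css-1nuumx7'
# cssToStars('linear-gradient(90deg, rgb(12, 170, 65) 80%, rgb(222, 224, 227) 80%) no-repeat')  # -> 4.0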
def linkToSoup_selenium(driver, tmout=10, isv=False, returnErr=False):
    try:
        WebDriverWait(driver, tmout).until(
            EC.visibility_of_all_elements_located((
                By.CSS_SELECTOR, 'li[id^="empReview_"]')))
        subRatSel = 'li[id^="empReview_"] div:has(> .ratingNumber) ~ aside ul > li div:nth-of-type(2)'
        starConts = driver.find_elements(By.CSS_SELECTOR, subRatSel)
        starConts = {
            formCssKey(s): s.value_of_css_property('background')
            for s in starConts
        }
        lSoup = BeautifulSoup(driver.page_source, 'html.parser')
        return lSoup, starConts
    except Exception as e:
        if isv: print(e)
        return (str(e) if returnErr else None), {}
def scrape_gdRevs(pgUrl, csvFn='empRevs.csv', constBreak=5, breaktime=5, maxScrapes=500):
    try:
        prevDf = pandas.read_csv(csvFn)
        prevs = list(prevDf['reviewId'])
        empRevs = prevDf.to_dict('records')
    except: prevs, empRevs = [], []
    total_allTime = len(empRevs)
    total_current = 0

    ### JUST FOR STATS ###
    try: scrapeLogs = pandas.read_csv(f'scrapeLogs_{csvFn}').to_dict('records')
    except: scrapeLogs = []
    ######################

    totalRevs = 'UNKNOWN'
    pcCon = 'div.px-std:has(h2 > a.reviewLink) + div.px-std'
    pcDiv = f'{pcCon} div.v2__EIReviewDetailsV2__fullWidth'
    refDict = {
        'rating_num': 'span.ratingNumber',
        'emp_status': 'div:has(> div > span.ratingNumber) + span',
        'header': 'h2 > a.reviewLink',
        'subheader': 'h2:has(> a.reviewLink) + span',
        'pros': f'{pcDiv}:first-of-type > p.pb',
        'cons': f'{pcDiv}:nth-of-type(2) > p.pb'
    }

    # I copy chromedriver.exe to the same folder as this py file ## for selenium
    driverG = webdriver.Chrome() ## for selenium
    driverG.get('https://www.glassdoor.com') ## for selenium
    login_to_gd(driverG) # REMOVE if you want to login manually ## for selenium
    # input('Please LogIn and then press enter here') # manual login ## for selenium
    driverG.get(pgUrl) ## for selenium

    subRatSel = 'div:has(> .ratingNumber) ~ aside ul > li:has(div ~ div)'
    pgftSel = 'div[data-test="pagination-footer-text"]'
    for sci in range(maxScrapes):
        scn = f'[{sci + 1} of {maxScrapes}]'
        print('', end=f'\r{scn} scraping {pgUrl}')
        # soup = linkToSoup(pgUrl, isv=True, returnErr=True) ## without selenium
        soup, srDict = linkToSoup_selenium(driverG, isv=True, returnErr=True) ## for selenium
        if type(soup) == str:
            scrapeLogs.append(
                {'scrapeNum': sci+1, 'errorMsg': soup, 'url': pgUrl}
            ) # JUST FOR STATS ###
            # break # if you want to stop at first error # OR take a break:
            waitMsg = f'!{soup}! {breaktime*sci}s break before retrying'
            print('', end=f'\r{scn} {waitMsg} {pgUrl}')
            time.sleep(breaktime*sci)
            continue

        ### JUST FOR STATS ###
        try: curPg = soup.select_one('li a.page.selected').get_text().strip()
        except: curPg = 'UNKNOWN'
        if curPg.isdigit(): curPg = int(curPg)
        try: ftrTxt = soup.select_one(pgftSel).get_text().strip()
        except: ftrTxt = 'reviewCount UNKNOWN'
        try: tRevs = ftrTxt.split('of')[-1].split()[0].replace(',', '')
        except: tRevs = 'UNKNOWN'
        if tRevs.isdigit(): totalRevs = int(tRevs)
        print('', end=f'\r{scn} scraping "{ftrTxt}" from page#{curPg} {pgUrl}')
        ######################

        newRevIds, pgRevIds = [], [] # JUST FOR STATS ###
        rSoups = soup.select('li[id^="empReview_"]')
        for r in rSoups:
            rId = r.get('id')
            pgRevIds.append(rId) # JUST FOR STATS ###
            if rId in prevs: continue # skip duplicates
            newRevIds.append(rId) # JUST FOR STATS ###

            rDet = {'reviewId': rId}
            for sr in r.select(subRatSel):
                k = sr.select_one('div:first-of-type').get_text(' ').strip()
                # sval = getDECstars(sr.select_one('div:nth-of-type(2)'), soup) ## without selenium
                kc = formCssKey(sr.select_one('div:nth-of-type(2)').get('class', [])) ## for selenium
                sval = cssToStars(srDict[kc]) if kc in srDict else None ## for selenium
                rDet[f'[rating] {k}'] = sval
            for k, sel in refDict.items():
                sval = r.select_one(sel)
                if sval: sval = sval.get_text(' ').strip()
                rDet[k] = sval
            empRevs.append(rDet)
            prevs.append(rId)
        pandas.DataFrame(empRevs).to_csv(csvFn, index=False)
        total_current += len(newRevIds)
        total_allTime = len(empRevs)

        ### JUST FOR STATS ###
        for_sl = {
            'scrapeNum': sci+1, 'curPg': curPg, 'totalRevs': tRevs,
            'pgFooter': ftrTxt, 'allCt': len(pgRevIds),
            'uniqCt': len(set(pgRevIds)), 'newCt': len(newRevIds),
            'allRevs': pgRevIds, 'newRevs': newRevIds, 'url': pgUrl
        }
        if not rSoups:
            for_sl['errorMsg'] = 'No reviews found in ' + ''.join([
                ' '.join(w for w in l.split() if w)
                for l in str(soup).splitlines() if l
            ])
        scrapeLogs.append(for_sl)
        pandas.DataFrame(scrapeLogs).to_csv(f'scrapeLogs_{csvFn}', index=False)
        ######################

        rCt = len(rSoups)
        print(f'\r{scn} scraped {rCt} "{ftrTxt}" from page#{curPg} {pgUrl}')
        # nextPg = soup.select_one('li:has(a.page.selected) + li a.page[href]') ## without selenium
        nextPg = driverG.find_elements(By.CSS_SELECTOR, 'li:has(a.page.selected) + li a.page[href]') ## for selenium
        if nextPg:
            # pgUrl = 'https://www.glassdoor.com/' + nextPg.get('href') ## without selenium
            if constBreak > 0: time.sleep(constBreak)
            scrollElToBottom = 'arguments[0].scrollIntoView(false);' ## for selenium
            driverG.execute_script(scrollElToBottom, nextPg[0]) ## for selenium
            nextPg[0].click() ## for selenium
        elif not rSoups: # remove if you want to stop at first error
            print('', end=f'\r{scn} {breaktime*sci}s break before retrying {pgUrl}')
            time.sleep(breaktime*sci)
        else: break # last page

    driverG.close() ## for selenium
    del driverG # (just in case) ## for selenium

    print('\n\n\n total reviews: ', totalRevs)
    print('total reviews scraped this run:', total_current)
    print('total reviews scraped over all time:', total_allTime)
# startUrl = 'https://www.glassdoor.com/Reviews/Walmart-Reviews-E715.htm?filter.iso3Language=eng'
# scrape_gdRevs(startUrl)
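## a fuller (still commented-out) usage sketch; the csvFn and maxScrapes values
## below are just examples, and automated login assumes you have replaced the
## lEmail/lPwd defaults in login_to_gd with your own credentials:
# scrape_gdRevs(startUrl, csvFn='walmart_empRevs.csv', constBreak=5, breaktime=5, maxScrapes=50)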
 