scrape Glassdoor Reviews so_q_74650912

Dec 2nd, 2022 (edited)
## for https://stackoverflow.com/q/74650912/6146136

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas

## PASTE getDECstars FROM https://pastebin.com/Q0GLwRv9 ## without selenium
## PASTE [some version of] linkToSoup FROM https://pastebin.com/rBTr06vy ## without selenium
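## (both pasted helpers are only needed for the requests-only variant marked
##  "## without selenium"; the selenium path used below does not require them)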

def login_to_gd(driver, tmout=10, lEmail='YOUR_EMAIL', lPwd='YOUR_PASSWORD'):
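    """Fill Glassdoor's inline email -> password login form; if anything
    fails (e.g. a captcha or changed selectors), fall back to prompting
    for a manual login in the opened browser."""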
    # not needed if you want to login manually
    try:
        WebDriverWait(driver, tmout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input#inlineUserEmail')))
        uemInp = driver.find_element(By.CSS_SELECTOR, 'input#inlineUserEmail')
        driver.execute_script("arguments[0].click();", uemInp)
        uemInp.send_keys(lEmail, Keys.ENTER)

        WebDriverWait(driver, tmout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input#inlineUserPassword')))
        pwdInp = driver.find_element(By.CSS_SELECTOR, 'input#inlineUserPassword')
        driver.execute_script("arguments[0].click();", pwdInp)
        pwdInp.send_keys(lPwd, Keys.ENTER)

        WebDriverWait(driver, tmout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-test="profile-container"]')))
    except Exception as e:
        print(e)
        input('Please login manually and then press ENTER here')


def formCssKey(scEl):
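    """Build a canonical, order-independent key from the css-* utility classes
    of a star-bar element (or from a raw class list), so elements parsed from
    page_source can be matched to the backgrounds captured live by selenium."""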
    if isinstance(scEl, list): c = [str(s) for s in scEl]
    else: c = str(scEl.get_attribute('class')).split()
    return ' '.join(sorted(w for w in c if w.startswith('css-')))


def cssToStars(cStr, outOf=5):
    try:
        str_bfr, str_aft = 'linear-gradient(90deg, rgb(12, 170, 65) ', '%, rgb(222, 224, 227) '
        perc = float(cStr.split(str_bfr, 1)[1].split(str_aft)[0])
        if isinstance(outOf, int) and outOf > 0: perc = (perc/100)*outOf
        return float(f'{perc: .3}')
    except Exception: return None
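## e.g.: the green stop percentage becomes the star score (here 80% -> 4.0):
## cssToStars('linear-gradient(90deg, rgb(12, 170, 65) 80%, rgb(222, 224, 227) 80%) ...') --> 4.0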


def linkToSoup_selenium(driver, tmout=10, isv=False, returnErr=False):
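    """Wait for the review cards to be visible, capture the computed
    'background' (the star gradient) of every sub-rating element keyed by
    its css-* classes, and return (soup of page_source, that lookup dict)."""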
    try:
        WebDriverWait(driver, tmout).until(
            EC.visibility_of_all_elements_located((
                By.CSS_SELECTOR, 'li[id^="empReview_"]')))

        subRatSel = 'li[id^="empReview_"] div:has(> .ratingNumber) ~ aside ul > li div:nth-of-type(2)'
        starConts = driver.find_elements(By.CSS_SELECTOR, subRatSel)
        starConts = {
            formCssKey(s): s.value_of_css_property('background')
            for s in starConts
        }

        lSoup = BeautifulSoup(driver.page_source, 'html.parser')
        return lSoup, starConts
    except Exception as e:
        if isv: print(e)
        return (str(e) if returnErr else None), {}


def scrape_gdRevs(pgUrl, csvFn='empRevs.csv', constBreak=5, breaktime=5, maxScrapes=500):
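    """Scrape up to maxScrapes pages of reviews starting from pgUrl, appending
    new rows to csvFn (and resuming past rows from it if it already exists),
    sleeping constBreak seconds between pages and breaktime*pageIndex seconds
    after an empty or failed scrape; per-page stats go to scrapeLogs_{csvFn}."""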
    try:
        prevDf = pandas.read_csv(csvFn)
        prevs = list(prevDf['reviewId'])
        empRevs = prevDf.to_dict('records')
    except Exception: prevs, empRevs = [], []
    total_allTime = len(empRevs)
    total_current = 0

    ### JUST FOR STATS ###
    try: scrapeLogs = pandas.read_csv(f'scrapeLogs_{csvFn}').to_dict('records')
    except Exception: scrapeLogs = []
    ######################

    totalRevs = 'UNKNOWN'
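
    # CSS selectors for the per-review text fields (applied within each card)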
    pcCon = 'div.px-std:has(h2 > a.reviewLink) + div.px-std'
    pcDiv = f'{pcCon} div.v2__EIReviewDetailsV2__fullWidth'
    refDict = {
        'rating_num': 'span.ratingNumber',
        'emp_status': 'div:has(> div > span.ratingNumber) + span',
        'header': 'h2 > a.reviewLink',
        'subheader': 'h2:has(> a.reviewLink) + span',
        'pros': f'{pcDiv}:first-of-type > p.pb',
        'cons': f'{pcDiv}:nth-of-type(2) > p.pb'
    }

    # I copy chromedriver.exe to the same folder as this py file ## for selenium
    driverG = webdriver.Chrome() ## for selenium
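    # (note: with selenium 4.6+, Selenium Manager can resolve a matching
    #  driver automatically, so copying chromedriver.exe may be unnecessary)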
    driverG.get('https://www.glassdoor.com') ## for selenium
    login_to_gd(driverG) # REMOVE if you want to login manually ## for selenium
    # input('Please LogIn and then press enter here') # manual login ## for selenium
    driverG.get(pgUrl)  ## for selenium

    subRatSel = 'div:has(> .ratingNumber) ~ aside ul > li:has(div ~ div)'
    pgftSel = 'div[data-test="pagination-footer-text"]'
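
    # main loop: parse the current page, append any new reviews to the CSV,
    # then click through to the next page until the last page (or maxScrapes)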
    for sci in range(maxScrapes):
        scn = f'[{sci + 1} of {maxScrapes}]'
        print('', end=f'\r{scn} scraping {pgUrl}')
        # soup = linkToSoup(pgUrl, isv=True, returnErr=True) ## without selenium
        soup, srDict = linkToSoup_selenium(driverG, isv=True, returnErr=True) ## for selenium

        if isinstance(soup, str):
            scrapeLogs.append(
                {'scrapeNum': sci+1, 'errorMsg': soup, 'url': pgUrl}
            )  # JUST FOR STATS ###

            # break # if you want to stop at first error # OR take a break:
            waitMsg = f'!{soup}! {breaktime*sci}s break before retrying'
            print('', end=f'\r{scn} {waitMsg} {pgUrl}')
            time.sleep(breaktime*sci)
            continue

        ### JUST FOR STATS ###
        try: curPg = soup.select_one('li a.page.selected').get_text().strip()
        except Exception: curPg = 'UNKNOWN'
        if curPg.isdigit(): curPg = int(curPg)

        try: ftrTxt = soup.select_one(pgftSel).get_text().strip()
        except Exception: ftrTxt = 'reviewCount UNKNOWN'
        try: tRevs = ftrTxt.split('of')[-1].split()[0].replace(',', '')
        except Exception: tRevs = 'UNKNOWN'
        if tRevs.isdigit(): totalRevs = int(tRevs)
        print('', end=f'\r{scn} scraping "{ftrTxt}" from page#{curPg} {pgUrl}')
        ######################
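
        # parse each review card: star sub-ratings via the css-gradient lookup
        # (srDict), the text fields via the refDict selectors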
        newRevIds, pgRevIds = [], []  # JUST FOR STATS ###
        rSoups = soup.select('li[id^="empReview_"]')
        for r in rSoups:
            rId = r.get('id')
            pgRevIds.append(rId)  # JUST FOR STATS ###
            if rId in prevs: continue  # skip duplicates

            newRevIds.append(rId)  # JUST FOR STATS ###
            rDet = {'reviewId': rId}
            for sr in r.select(subRatSel):
                k = sr.select_one('div:first-of-type').get_text(' ').strip()
                # sval = getDECstars(sr.select_one('div:nth-of-type(2)'), soup) ## without selenium
                kc = formCssKey(sr.select_one('div:nth-of-type(2)').get('class', [])) ## for selenium
                sval = cssToStars(srDict[kc]) if kc in srDict else None ## for selenium
                rDet[f'[rating] {k}'] = sval

            for k, sel in refDict.items():
                sval = r.select_one(sel)
                if sval: sval = sval.get_text(' ').strip()
                rDet[k] = sval

            empRevs.append(rDet)
            prevs.append(rId)
        pandas.DataFrame(empRevs).to_csv(csvFn, index=False)
        total_current += len(newRevIds)
        total_allTime = len(empRevs)

        ### JUST FOR STATS ###
        for_sl = {
            'scrapeNum': sci+1, 'curPg': curPg, 'totalRevs': tRevs,
            'pgFooter': ftrTxt, 'allCt': len(pgRevIds),
            'uniqCt': len(set(pgRevIds)), 'newCt': len(newRevIds),
            'allRevs': pgRevIds, 'newRevs': newRevIds, 'url': pgUrl
        }
        if not rSoups:
            for_sl['errorMsg'] = 'No reviews found in ' + ''.join([
                ' '.join(w for w in l.split() if w)
                for l in str(soup).splitlines() if l
            ])
        scrapeLogs.append(for_sl)
        pandas.DataFrame(scrapeLogs).to_csv(f'scrapeLogs_{csvFn}', index=False)
        ######################
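
        # locate the "next page" link (the sibling after the selected page number)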
        rCt = len(rSoups)
        print(f'\r{scn} scraped {rCt} "{ftrTxt}" from page#{curPg} {pgUrl}')
        # nextPg = soup.select_one('li:has(a.page.selected) + li a.page[href]')  ## without selenium
        nextPg = driverG.find_elements(By.CSS_SELECTOR, 'li:has(a.page.selected) + li a.page[href]')  ## for selenium
        if nextPg:
            # pgUrl = 'https://www.glassdoor.com/' + nextPg.get('href')  ## without selenium
            if constBreak > 0: time.sleep(constBreak)
            scrollElToBottom = 'arguments[0].scrollIntoView(false);' ## for selenium
            driverG.execute_script(scrollElToBottom, nextPg[0]) ## for selenium
            nextPg[0].click() ## for selenium
        elif not rSoups:  # remove if you want to stop at first error
            print('', end=f'\r{scn} {breaktime*sci}s break before retrying {pgUrl}')
            time.sleep(breaktime*sci)
        else: break  # last page

    driverG.quit()  # quit() (rather than close()) also shuts down chromedriver ## for selenium
    del driverG  # (just in case) ## for selenium

    print('\n\n\n total reviews: ', totalRevs)
    print('total reviews scraped this run:', total_current)
    print('total reviews scraped over all time:', total_allTime)

# startUrl = 'https://www.glassdoor.com/Reviews/Walmart-Reviews-E715.htm?filter.iso3Language=eng'
# scrape_gdRevs(startUrl)
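## e.g. for a short test run (the filename and page cap are just example values):
## scrape_gdRevs(startUrl, csvFn='walmartRevs.csv', maxScrapes=10)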