scrape Glassdoor Reviews so_q_74650912

Dec 2nd, 2022 (edited)
## for https://stackoverflow.com/q/74650912/6146136

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas

## PASTE getDECstars FROM https://pastebin.com/Q0GLwRv9 ## without selenium
## PASTE [some version of] linkToSoup FROM https://pastebin.com/rBTr06vy ## without selenium
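## (both pasted helpers are only needed for the requests-only variant marked
##  "## without selenium"; the selenium path used below does not require them)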

def login_to_gd(driver, tmout=10, lEmail='YOUR_EMAIL', lPwd='YOUR_PASSWORD'):
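    """Fill Glassdoor's inline email -> password login form; if anything
    fails (e.g. a captcha or changed selectors), fall back to prompting
    for a manual login in the opened browser."""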
    # not needed if you want to login manually
    try:
        WebDriverWait(driver, tmout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input#inlineUserEmail')))
        uemInp = driver.find_element(By.CSS_SELECTOR, 'input#inlineUserEmail')
        driver.execute_script("arguments[0].click();", uemInp)
        uemInp.send_keys(lEmail, Keys.ENTER)

        WebDriverWait(driver, tmout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'input#inlineUserPassword')))
        pwdInp = driver.find_element(By.CSS_SELECTOR, 'input#inlineUserPassword')
        driver.execute_script("arguments[0].click();", pwdInp)
        pwdInp.send_keys(lPwd, Keys.ENTER)

        WebDriverWait(driver, tmout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-test="profile-container"]')))
    except Exception as e:
        print(e)
        input('Please login manually and then press ENTER here')


def formCssKey(scEl):
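    """Build a canonical, order-independent key from the css-* utility classes
    of a star-bar element (or from a raw class list), so elements parsed from
    page_source can be matched to the backgrounds captured live by selenium."""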
    if isinstance(scEl, list): c = [str(s) for s in scEl]
    else: c = str(scEl.get_attribute('class')).split()
    return ' '.join(sorted(w for w in c if w.startswith('css-')))


def cssToStars(cStr, outOf=5):
    try:
        str_bfr, str_aft = 'linear-gradient(90deg, rgb(12, 170, 65) ', '%, rgb(222, 224, 227) '
        perc = float(cStr.split(str_bfr, 1)[1].split(str_aft)[0])
        if isinstance(outOf, int) and outOf > 0: perc = (perc/100)*outOf
        return float(f'{perc: .3}')
    except Exception: return None
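## e.g.: the green stop percentage becomes the star score (here 80% -> 4.0):
## cssToStars('linear-gradient(90deg, rgb(12, 170, 65) 80%, rgb(222, 224, 227) 80%) ...') --> 4.0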


def linkToSoup_selenium(driver, tmout=10, isv=False, returnErr=False):
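    """Wait for the review cards to be visible, capture the computed
    'background' (the star gradient) of every sub-rating element keyed by
    its css-* classes, and return (soup of page_source, that lookup dict)."""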
    try:
        WebDriverWait(driver, tmout).until(
            EC.visibility_of_all_elements_located((
                By.CSS_SELECTOR, 'li[id^="empReview_"]')))

        subRatSel = 'li[id^="empReview_"] div:has(> .ratingNumber) ~ aside ul > li div:nth-of-type(2)'
        starConts = driver.find_elements(By.CSS_SELECTOR, subRatSel)
        starConts = {
            formCssKey(s): s.value_of_css_property('background')
            for s in starConts
        }

        lSoup = BeautifulSoup(driver.page_source, 'html.parser')
        return lSoup, starConts
    except Exception as e:
        if isv: print(e)
        return (str(e) if returnErr else None), {}


def scrape_gdRevs(pgUrl, csvFn='empRevs.csv', constBreak=5, breaktime=5, maxScrapes=500):
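    """Scrape up to maxScrapes pages of reviews starting from pgUrl, appending
    new rows to csvFn (and resuming past rows from it if it already exists),
    sleeping constBreak seconds between pages and breaktime*pageIndex seconds
    after an empty or failed scrape; per-page stats go to scrapeLogs_{csvFn}."""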
    try:
        prevDf = pandas.read_csv(csvFn)
        prevs = list(prevDf['reviewId'])
        empRevs = prevDf.to_dict('records')
    except Exception: prevs, empRevs = [], []
    total_allTime = len(empRevs)
    total_current = 0

    ### JUST FOR STATS ###
    try: scrapeLogs = pandas.read_csv(f'scrapeLogs_{csvFn}').to_dict('records')
    except Exception: scrapeLogs = []
    ######################

    totalRevs = 'UNKNOWN'
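
    # CSS selectors for the per-review text fields (applied within each card)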
    pcCon = 'div.px-std:has(h2 > a.reviewLink) + div.px-std'
    pcDiv = f'{pcCon} div.v2__EIReviewDetailsV2__fullWidth'
    refDict = {
        'rating_num': 'span.ratingNumber',
        'emp_status': 'div:has(> div > span.ratingNumber) + span',
        'header': 'h2 > a.reviewLink',
        'subheader': 'h2:has(> a.reviewLink) + span',
        'pros': f'{pcDiv}:first-of-type > p.pb',
        'cons': f'{pcDiv}:nth-of-type(2) > p.pb'
    }

    # I copy chromedriver.exe to the same folder as this py file ## for selenium
    driverG = webdriver.Chrome() ## for selenium
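    # (note: with selenium 4.6+, Selenium Manager can resolve a matching
    #  driver automatically, so copying chromedriver.exe may be unnecessary)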
    driverG.get('https://www.glassdoor.com') ## for selenium
    login_to_gd(driverG) # REMOVE if you want to login manually ## for selenium
    # input('Please LogIn and then press enter here') # manual login ## for selenium
    driverG.get(pgUrl)  ## for selenium

    subRatSel = 'div:has(> .ratingNumber) ~ aside ul > li:has(div ~ div)'
    pgftSel = 'div[data-test="pagination-footer-text"]'
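
    # main loop: parse the current page, append any new reviews to the CSV,
    # then click through to the next page until the last page (or maxScrapes)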
    for sci in range(maxScrapes):
        scn = f'[{sci + 1} of {maxScrapes}]'
        print('', end=f'\r{scn} scraping {pgUrl}')
        # soup = linkToSoup(pgUrl, isv=True, returnErr=True) ## without selenium
        soup, srDict = linkToSoup_selenium(driverG, isv=True, returnErr=True) ## for selenium

        if isinstance(soup, str):
            scrapeLogs.append(
                {'scrapeNum': sci+1, 'errorMsg': soup, 'url': pgUrl}
            )  # JUST FOR STATS ###

            # break # if you want to stop at first error # OR take a break:
            waitMsg = f'!{soup}! {breaktime*sci}s break before retrying'
            print('', end=f'\r{scn} {waitMsg} {pgUrl}')
            time.sleep(breaktime*sci)
            continue

        ### JUST FOR STATS ###
        try: curPg = soup.select_one('li a.page.selected').get_text().strip()
        except Exception: curPg = 'UNKNOWN'
        if curPg.isdigit(): curPg = int(curPg)

        try: ftrTxt = soup.select_one(pgftSel).get_text().strip()
        except Exception: ftrTxt = 'reviewCount UNKNOWN'
        try: tRevs = ftrTxt.split('of')[-1].split()[0].replace(',', '')
        except Exception: tRevs = 'UNKNOWN'
        if tRevs.isdigit(): totalRevs = int(tRevs)
        print('', end=f'\r{scn} scraping "{ftrTxt}" from page#{curPg} {pgUrl}')
        ######################
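
        # parse each review card: star sub-ratings via the css-gradient lookup
        # (srDict), the text fields via the refDict selectors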
        newRevIds, pgRevIds = [], []  # JUST FOR STATS ###
        rSoups = soup.select('li[id^="empReview_"]')
        for r in rSoups:
            rId = r.get('id')
            pgRevIds.append(rId)  # JUST FOR STATS ###
            if rId in prevs: continue  # skip duplicates

            newRevIds.append(rId)  # JUST FOR STATS ###
            rDet = {'reviewId': rId}
            for sr in r.select(subRatSel):
                k = sr.select_one('div:first-of-type').get_text(' ').strip()
                # sval = getDECstars(sr.select_one('div:nth-of-type(2)'), soup) ## without selenium
                kc = formCssKey(sr.select_one('div:nth-of-type(2)').get('class', [])) ## for selenium
                sval = cssToStars(srDict[kc]) if kc in srDict else None ## for selenium
                rDet[f'[rating] {k}'] = sval

            for k, sel in refDict.items():
                sval = r.select_one(sel)
                if sval: sval = sval.get_text(' ').strip()
                rDet[k] = sval

            empRevs.append(rDet)
            prevs.append(rId)
        pandas.DataFrame(empRevs).to_csv(csvFn, index=False)
        total_current += len(newRevIds)
        total_allTime = len(empRevs)

        ### JUST FOR STATS ###
        for_sl = {
            'scrapeNum': sci+1, 'curPg': curPg, 'totalRevs': tRevs,
            'pgFooter': ftrTxt, 'allCt': len(pgRevIds),
            'uniqCt': len(set(pgRevIds)), 'newCt': len(newRevIds),
            'allRevs': pgRevIds, 'newRevs': newRevIds, 'url': pgUrl
        }
        if not rSoups:
            for_sl['errorMsg'] = 'No reviews found in ' + ''.join([
                ' '.join(w for w in l.split() if w)
                for l in str(soup).splitlines() if l
            ])
        scrapeLogs.append(for_sl)
        pandas.DataFrame(scrapeLogs).to_csv(f'scrapeLogs_{csvFn}', index=False)
        ######################
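
        # locate the "next page" link (the sibling after the selected page number)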
        rCt = len(rSoups)
        print(f'\r{scn} scraped {rCt} "{ftrTxt}" from page#{curPg} {pgUrl}')
        # nextPg = soup.select_one('li:has(a.page.selected) + li a.page[href]')  ## without selenium
        nextPg = driverG.find_elements(By.CSS_SELECTOR, 'li:has(a.page.selected) + li a.page[href]')  ## for selenium
        if nextPg:
            # pgUrl = 'https://www.glassdoor.com/' + nextPg.get('href')  ## without selenium
            if constBreak > 0: time.sleep(constBreak)
            scrollElToBottom = 'arguments[0].scrollIntoView(false);' ## for selenium
            driverG.execute_script(scrollElToBottom, nextPg[0]) ## for selenium
            nextPg[0].click() ## for selenium
        elif not rSoups:  # remove if you want to stop at first error
            print('', end=f'\r{scn} {breaktime*sci}s break before retrying {pgUrl}')
            time.sleep(breaktime*sci)
        else: break  # last page

    driverG.quit()  # quit() (rather than close()) also shuts down chromedriver ## for selenium
    del driverG  # (just in case) ## for selenium

    print('\n\n\n total reviews: ', totalRevs)
    print('total reviews scraped this run:', total_current)
    print('total reviews scraped over all time:', total_allTime)

# startUrl = 'https://www.glassdoor.com/Reviews/Walmart-Reviews-E715.htm?filter.iso3Language=eng'
# scrape_gdRevs(startUrl)
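## e.g. for a short test run (the filename and page cap are just example values):
## scrape_gdRevs(startUrl, csvFn='walmartRevs.csv', maxScrapes=10)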