Try95th

open+scrape new tab [wallapop] for so_q_75426987

Feb 13th, 2023 (edited)
## for https://stackoverflow.com/q/75426987/6146136
## used in https://stackoverflow.com/a/75442484/6146136

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd

saveTo = 'monitor.csv' ## adjust as preferred
kSearch, maxItems = 'monitor', 1500 ## adjust as preferred
url = f'https://es.wallapop.com/app/search?keywords={"+".join(kSearch.split())}'
url = f'{url}&filters_source=search_box&latitude=39.46895&longitude=-0.37686'
################################################################################


############################## REQUIRED FUNCTIONS ##############################

## scroll to an element and click [targetEl can be an element or a selector] ##
def scrollClick(driverX, targetEl, maxWait=5, scroll2Top=False, printErr=True):
    try:
        xWait = WebDriverWait(driverX, maxWait)
        if isinstance(targetEl, str): ## resolve a selector to an element first
            xWait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, targetEl)))
            targetEl = driverX.find_element(By.CSS_SELECTOR, targetEl)
        xWait.until(EC.element_to_be_clickable(targetEl))
        driverX.execute_script('''
            arguments[0].scrollIntoView(arguments[1]);
        ''', targetEl, bool(scroll2Top)) ## execute js to scroll
        targetEl.click()
    except Exception as e:
        if printErr: print(repr(e), '\nFailed to click', targetEl)

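## usage sketch [selector and element names are illustrative, not a real page's]:
##   scrollClick(browser, 'button#accept-all')      ## pass a CSS selector, or
##   scrollClick(browser, cardEl, scroll2Top=True)  ## an already-found element
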
## find a nextSibling of refEl that matches selector [if specified by sel] ##
def selectNextSib(driverX, refEl, sel=False, printError=False):
    sel = sel.strip() if isinstance(sel, str) and sel.strip() else False
    try: ## execute js code to find next card
        return driverX.execute_script('''
            var sibling = arguments[0].nextElementSibling;
            while (sibling && arguments[1]) {
                if (sibling.matches(arguments[1])) break;
                sibling = sibling.nextElementSibling; }
            return sibling;''', refEl, sel)
    except Exception as e:
        if printError: print(f'Error finding next "{sel}":', repr(e))

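## usage sketch [names illustrative]: selectNextSib(browser, cardEl, 'a.someCard')
## returns cardEl's next sibling that matches 'a.someCard', or None if there is
## none [with sel unset, it just returns the immediate next sibling]
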
## [bs4] extract text or attribute from a tag inside tagSoup ##
def selectGet(tagSoup, selector='', ta='', defaultVal=None):
    el = tagSoup.select_one(selector) if selector else tagSoup
    if el is None: return defaultVal
    return el.get(ta, defaultVal) if ta else el.get_text(' ', strip=True)

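## usage sketch [hypothetical markup]:
##   selectGet(soup, 'span.price')       ## -> text of first span.price, or None
##   selectGet(soup, 'img.thumb', 'src') ## -> that img's src attribute, or None
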
## parse product page html and extract product details ##
def getProductDetails(prodPgHtml: str, prodUrl=None):
    pSoup = BeautifulSoup(prodPgHtml, 'lxml')
    detsDiv = pSoup.select_one('div.detail-item')
    detKeys = ['category_id', 'is_bulky', 'is_bumped',
               'is_free_shipping_allowed', 'item_id', 'item_uuid',
               'main_image_thumbnail', 'mine', 'sell_price',
               'seller_user_id', 'subcategory_id', 'title']
    ## map the div's data-* attributes to the keys above
    ## [str.lstrip strips a character set, not a prefix, hence removeprefix]
    pDets = {} if detsDiv is None else {
        k.removeprefix('data-').replace('-', '_'): v
        for k, v in sorted(detsDiv.attrs.items(), key=lambda x: x[0])
        if k.removeprefix('data-').replace('-', '_') in detKeys
    }
    pDets['description'] = selectGet(pSoup, 'div.card-product-detail-top>p')
    pDets['date_posted'] = selectGet(pSoup, 'div[class$="published"]')
    pDets['views_count'] = selectGet(pSoup, 'i.ico-eye+span')
    pDets['likes_count'] = selectGet(pSoup, 'i.ico-coounter_favourites+span')
    pDets['seller_name'] = selectGet(pSoup, 'h2.card-user-detail-name')
    uLink = selectGet(pSoup, 'a.card-user-right[href]', 'href')
    if uLink: pDets['seller_link'] = urljoin(prodUrl, uLink)

    ### EXTRACT ANY OTHER DETAILS YOU WANT ###

    pDets['product_link'] = prodUrl
    return pDets
################################################################################

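## a returned dict is shaped roughly like this [keys taken from the function
## above; values elided - illustration only]:
##   {'category_id': ..., 'item_id': ..., 'sell_price': ..., 'title': ...,
##    'description': ..., 'date_posted': ..., 'views_count': ...,
##    'likes_count': ..., 'seller_name': ..., 'seller_link': ..., 'product_link': ...}
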
################################## MAIN  CODE ##################################
browser = webdriver.Chrome()
browser.get(url)
browser.maximize_window()

scrollClick(browser, 'button[id="onetrust-accept-btn-handler"]') ## accept cookies
scrollClick(browser, 'tsl-button[id="btn-load-more"]') ## load more [then ∞-scroll]

itemCt, scrapedLinks, products = 0, [], [] ## initiate
itemSel, nextItem = 'a.ItemCardList__item[title]', None
try: nextItem = browser.find_element(By.CSS_SELECTOR, itemSel) ## first card
except Exception as e: print('No items found:', repr(e))
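
## the loop below walks the result cards one by one: click a card [opens the
## product in a new tab], scrape that tab, close it, then hop to the next
## sibling card [scrolling to it also keeps the infinite scroll loading]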

while nextItem:
    itemCt += 1 # counter
    cpHtml, cpTxt = '', '' # clear/initiate
    resultsTab = browser.current_window_handle # to go back

    try: # click card -> open new tab -> scrape product details
        cpHtml, cpTxt = nextItem.get_attribute('outerHTML'), nextItem.text
        scrollClick(browser, nextItem) ## click current card
        # add wait ?
        browser.switch_to.window(browser.window_handles[1]) ## go to 2nd tab
        WebDriverWait(browser, 5).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.detail-item'))) ## wait to load details
        pLink = browser.current_url ## product URL
        if pLink not in scrapedLinks: # skip duplicates [just in case]
            products.append(getProductDetails(browser.page_source, pLink))
        scrapedLinks.append(pLink)
    except Exception as e:
        print('!', [itemCt], ' '.join(cpTxt.split()), repr(e)) ## print error
        pSoup = BeautifulSoup(cpHtml, 'lxml')
        products.append({
            'title': selectGet(pSoup, 'a[title]', 'title'), ## card root is the <a>
            'price': selectGet(pSoup, 'span.ItemCard__price'),
            'errorMsg': f'{type(e)} {e}'
        }) ## [ make do with info in card ]

    try: # close all tabs other than results tab
        for w in browser.window_handles:
            if w != resultsTab:
                browser.switch_to.window(w)
                browser.close()
            browser.switch_to.window(resultsTab)
    except Exception as e:
        print('Failed to restore results-tab-only window:', repr(e))
        break

    # print('', end=f"\r[{itemCt} of {maxItems}] {' '.join(cpTxt.split())} {repr(e)}")

    if isinstance(maxItems, int) and itemCt >= maxItems:
        break ## stop once maxItems cards have been processed

    nextItem = selectNextSib(browser, nextItem, itemSel) # get next result card
################################################################################

pd.DataFrame(products).to_csv(saveTo, index=False) ## SAVE RESULTS ##
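
## [optional] minimal sketch for running without a visible window, assuming a
## Chrome version that supports the `--headless=new` flag [swap into MAIN CODE]:
##   from selenium.webdriver.chrome.options import Options
##   opts = Options()
##   opts.add_argument('--headless=new')
##   browser = webdriver.Chrome(options=opts)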