## for https://stackoverflow.com/q/75426987/6146136
## used in https://stackoverflow.com/a/75442484/6146136

from urllib.parse import urljoin  ## needed by getProductDetails
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

saveTo = 'monitor.csv'  ## adjust as preferred
kSearch, maxItems = 'monitor', 1500  ## adjust as preferred
url = f'https://es.wallapop.com/app/search?keywords={"+".join(kSearch.split())}'
url = f'{url}&filters_source=search_box&latitude=39.46895&longitude=-0.37686'
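## with the defaults above, url ends up as
## https://es.wallapop.com/app/search?keywords=monitor&filters_source=search_box&latitude=39.46895&longitude=-0.37686
## [the latitude/longitude pair appears to anchor the search to a location;
##  adjust it along with kSearch if you want results for another area]
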
################################################################################
############################## REQUIRED FUNCTIONS ##############################

## scroll to an element and click [targetEl can be an element or a selector] ##
def scrollClick(driverX, targetEl, maxWait=5, scroll2Top=False, printErr=True):
    try:
        xWait = WebDriverWait(driverX, maxWait)
        if isinstance(targetEl, str):  ## resolve selector string to an element
            xWait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, targetEl)))
            targetEl = driverX.find_element(By.CSS_SELECTOR, targetEl)
        xWait.until(EC.element_to_be_clickable(targetEl))
        driverX.execute_script('''
            arguments[0].scrollIntoView(arguments[1]);
        ''', targetEl, bool(scroll2Top))  ## execute js to scroll
        targetEl.click()
    except Exception as e:
        if printErr: print(repr(e), '\nFailed to click', targetEl)

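## illustrative usage [hypothetical selectors, not from the target page]:
##   scrollClick(browser, 'button.submit-btn')       # locate by CSS selector
##   scrollClick(browser, someWebElement)            # or pass an element directly
##   scrollClick(browser, '#hdr', scroll2Top=True)   # align element to viewport top
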
## find a nextSibling of refEl that matches selector [if specified by sel] ##
def selectNextSib(driverX, refEl, sel=False, printError=False):
    sel = sel.strip() if isinstance(sel, str) and sel.strip() else False
    try:  ## execute js code to find next card
        return driverX.execute_script('''
            var sibling = arguments[0].nextElementSibling;
            while (sibling && arguments[1]) {
                if (sibling.matches(arguments[1])) break;
                sibling = sibling.nextElementSibling; }
            return sibling;''', refEl, sel)
    except Exception as e:
        if printError: print(f'Error finding next "{sel}":', repr(e))

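## the JS above walks forward through nextElementSibling until one matches sel
## [via Element.matches], so e.g. selectNextSib(browser, cardEl, 'a.result-card')
## [hypothetical selector] would skip any ads/dividers interleaved between result
## cards; it returns None once the sibling chain runs out
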
## [bs4] extract text or attribute from a tag inside tagSoup ##
def selectGet(tagSoup, selector='', ta='', defaultVal=None):
    el = tagSoup.select_one(selector) if selector else tagSoup
    if el is None: return defaultVal
    return el.get(ta, defaultVal) if ta else el.get_text(' ', strip=True)

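## illustrative behavior [hypothetical markup]:
##   soup = BeautifulSoup('<div id="x"><a href="/p">Hi there</a></div>', 'lxml')
##   selectGet(soup, 'div#x a')                    # -> 'Hi there' [text]
##   selectGet(soup, 'div#x a', 'href')            # -> '/p' [attribute]
##   selectGet(soup, 'div#x b', defaultVal='n/a')  # -> 'n/a' [no match]
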
## parse product page html and extract product details ##
def getProductDetails(prodPgHtml: str, prodUrl=None):
    pSoup = BeautifulSoup(prodPgHtml.encode('utf-8'), 'lxml')
    detsDiv = pSoup.select_one('div.detail-item')
    detKeys = ['category_id', 'is_bulky', 'is_bumped',
               'is_free_shipping_allowed', 'item_id', 'item_uuid',
               'main_image_thumbnail', 'mine', 'sell_price',
               'seller_user_id', 'subcategory_id', 'title']
    pDets = {} if detsDiv is None else {
        k.removeprefix('data-').replace('-', '_'): v  ## strip exact 'data-' prefix [py3.9+]
        for k, v in sorted(detsDiv.attrs.items(), key=lambda x: x[0])
        if k.removeprefix('data-').replace('-', '_') in detKeys
    }
    pDets['description'] = selectGet(pSoup, 'div.card-product-detail-top>p')
    pDets['date_posted'] = selectGet(pSoup, 'div[class$="published"]')
    pDets['views_count'] = selectGet(pSoup, 'i.ico-eye+span')
    pDets['likes_count'] = selectGet(pSoup, 'i.ico-coounter_favourites+span')
    pDets['seller_name'] = selectGet(pSoup, 'h2.card-user-detail-name')
    uLink = selectGet(pSoup, 'a.card-user-right[href]', 'href')
    if uLink: pDets['seller_link'] = urljoin(prodUrl, uLink)
    ### EXTRACT ANY OTHER DETAILS YOU WANT ###
    pDets['product_link'] = prodUrl
    return pDets

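## the returned dict looks roughly like this [values illustrative, not real data]:
##   {'category_id': '...', 'item_id': '...', 'sell_price': '...', 'title': '...',
##    'description': '...', 'date_posted': '...', 'views_count': '...',
##    'likes_count': '...', 'seller_name': '...', 'seller_link': '...',
##    'product_link': prodUrl}
## [exact keys depend on which data-* attributes and selectors match the page]
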
################################################################################
################################## MAIN CODE ##################################

browser = webdriver.Chrome()
browser.get(url)
browser.maximize_window()
scrollClick(browser, 'button[id="onetrust-accept-btn-handler"]')  ## accept cookies
scrollClick(browser, 'tsl-button[id="btn-load-more"]')  ## load more [then ∞-scroll]

itemCt, scrapedLinks, products = 0, [], []  ## initiate
itemSel, nextItem = 'a.ItemCardList__item[title]', None
try: nextItem = browser.find_element(By.CSS_SELECTOR, itemSel)  ## first card
except Exception as e: print('No items found:', repr(e))
while nextItem:
    itemCt += 1  # counter
    cpHtml, cpTxt = '', ''  # clear/initiate
    resultsTab = browser.current_window_handle  # to go back
    try:  # click card -> open new tab -> scrape product details
        cpHtml, cpTxt = nextItem.get_attribute('outerHTML'), nextItem.text
        scrollClick(browser, nextItem)  ## click current card
        # add wait ?
        browser.switch_to.window(browser.window_handles[1])  ## go to 2nd tab
        WebDriverWait(browser, 5).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.detail-item')))  ## wait to load details
        pLink = browser.current_url  ## product URL
        if pLink not in scrapedLinks:  # skip duplicates [just in case]
            products.append(getProductDetails(browser.page_source, pLink))
            scrapedLinks.append(pLink)
    except Exception as e:
        print('!', [itemCt], ' '.join(cpTxt.split()), repr(e))  ## print error
        pSoup = BeautifulSoup(cpHtml.encode('utf-8'), 'lxml')
        products.append({
            'title': selectGet(pSoup, 'a[title]', 'title'),  ## title attr is on the card's root <a>
            'price': selectGet(pSoup, 'span.ItemCard__price'),
            'errorMsg': f'{type(e)} {e}'
        })  ## [ make do with info in card ]
    try:  # close all tabs other than results tab
        for w in browser.window_handles:
            if w != resultsTab:
                browser.switch_to.window(w)
                browser.close()
        browser.switch_to.window(resultsTab)
    except Exception as e:
        print('Failed to restore results-tab-only window:', repr(e))
        break
    # print('', end=f"\r[{itemCt} of {maxItems}] {' '.join(cpTxt.split())} {repr(e)}")
    if isinstance(maxItems, int):
        if maxItems <= itemCt: break  ## stop once maxItems cards have been processed
    nextItem = selectNextSib(browser, nextItem, itemSel)  # get next result card
################################################################################

pd.DataFrame(products).to_csv(saveTo, index=False)  ## SAVE RESULTS ##
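## to inspect the saved results later [assuming the same saveTo path]:
##   df = pd.read_csv(saveTo)
##   print(df.head())  ## rows mix full product details and error-fallback card info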
 