Try95th

open+scrape new tab [wallapop] for so_q_75426987

Feb 13th, 2023 (edited)
## for https://stackoverflow.com/q/75426987/6146136
## used in https://stackoverflow.com/a/75442484/6146136

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd

saveTo = 'monitor.csv' ## adjust as preferred
kSearch, maxItems = 'monitor', 1500 ## adjust as preferred
url = f'https://es.wallapop.com/app/search?keywords={"+".join(kSearch.split())}'
url = f'{url}&filters_source=search_box&latitude=39.46895&longitude=-0.37686'
################################################################################


############################## REQUIRED FUNCTIONS ##############################

## scroll to an element and click [targetEl can be an element or a selector] ##
def scrollClick(driverX, targetEl, maxWait=5, scroll2Top=False, printErr=True):
    try:
        xWait = WebDriverWait(driverX, maxWait)
        if isinstance(targetEl, str): ## resolve a selector to an element first
            xWait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, targetEl)))
            targetEl = driverX.find_element(By.CSS_SELECTOR, targetEl)
        xWait.until(EC.element_to_be_clickable(targetEl))
        driverX.execute_script('''
            arguments[0].scrollIntoView(arguments[1]);
        ''', targetEl, bool(scroll2Top)) ## execute js to scroll
        targetEl.click()
    except Exception as e:
        if printErr: print(repr(e), '\nFailed to click', targetEl)

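## usage sketch [selector and element names are illustrative, not a real page's]:
##   scrollClick(browser, 'button#accept-all')      ## pass a CSS selector, or
##   scrollClick(browser, cardEl, scroll2Top=True)  ## an already-found element
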
## find a nextSibling of refEl that matches selector [if specified by sel] ##
def selectNextSib(driverX, refEl, sel=False, printError=False):
    sel = sel.strip() if isinstance(sel, str) and sel.strip() else False
    try: ## execute js code to find next card
        return driverX.execute_script('''
            var sibling = arguments[0].nextElementSibling;
            while (sibling && arguments[1]) {
                if (sibling.matches(arguments[1])) break;
                sibling = sibling.nextElementSibling; }
            return sibling;''', refEl, sel)
    except Exception as e:
        if printError: print(f'Error finding next "{sel}":', repr(e))

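## usage sketch [names illustrative]: selectNextSib(browser, cardEl, 'a.someCard')
## returns cardEl's next sibling that matches 'a.someCard', or None if there is
## none [with sel unset, it just returns the immediate next sibling]
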
## [bs4] extract text or attribute from a tag inside tagSoup ##
def selectGet(tagSoup, selector='', ta='', defaultVal=None):
    el = tagSoup.select_one(selector) if selector else tagSoup
    if el is None: return defaultVal
    return el.get(ta, defaultVal) if ta else el.get_text(' ', strip=True)

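## usage sketch [hypothetical markup]:
##   selectGet(soup, 'span.price')       ## -> text of first span.price, or None
##   selectGet(soup, 'img.thumb', 'src') ## -> that img's src attribute, or None
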
## parse product page html and extract product details ##
def getProductDetails(prodPgHtml: str, prodUrl=None):
    pSoup = BeautifulSoup(prodPgHtml, 'lxml')
    detsDiv = pSoup.select_one('div.detail-item')
    detKeys = ['category_id', 'is_bulky', 'is_bumped',
               'is_free_shipping_allowed', 'item_id', 'item_uuid',
               'main_image_thumbnail', 'mine', 'sell_price',
               'seller_user_id', 'subcategory_id', 'title']
    ## map the div's data-* attributes to the keys above
    ## [str.lstrip strips a character set, not a prefix, hence removeprefix]
    pDets = {} if detsDiv is None else {
        k.removeprefix('data-').replace('-', '_'): v
        for k, v in sorted(detsDiv.attrs.items(), key=lambda x: x[0])
        if k.removeprefix('data-').replace('-', '_') in detKeys
    }
    pDets['description'] = selectGet(pSoup, 'div.card-product-detail-top>p')
    pDets['date_posted'] = selectGet(pSoup, 'div[class$="published"]')
    pDets['views_count'] = selectGet(pSoup, 'i.ico-eye+span')
    pDets['likes_count'] = selectGet(pSoup, 'i.ico-coounter_favourites+span')
    pDets['seller_name'] = selectGet(pSoup, 'h2.card-user-detail-name')
    uLink = selectGet(pSoup, 'a.card-user-right[href]', 'href')
    if uLink: pDets['seller_link'] = urljoin(prodUrl, uLink)

    ### EXTRACT ANY OTHER DETAILS YOU WANT ###

    pDets['product_link'] = prodUrl
    return pDets
################################################################################

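## a returned dict is shaped roughly like this [keys taken from the function
## above; values elided - illustration only]:
##   {'category_id': ..., 'item_id': ..., 'sell_price': ..., 'title': ...,
##    'description': ..., 'date_posted': ..., 'views_count': ...,
##    'likes_count': ..., 'seller_name': ..., 'seller_link': ..., 'product_link': ...}
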
################################## MAIN  CODE ##################################
browser = webdriver.Chrome()
browser.get(url)
browser.maximize_window()

scrollClick(browser, 'button[id="onetrust-accept-btn-handler"]') ## accept cookies
scrollClick(browser, 'tsl-button[id="btn-load-more"]') ## load more [then ∞-scroll]

itemCt, scrapedLinks, products = 0, [], [] ## initiate
itemSel, nextItem = 'a.ItemCardList__item[title]', None
try: nextItem = browser.find_element(By.CSS_SELECTOR, itemSel) ## first card
except Exception as e: print('No items found:', repr(e))
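
## the loop below walks the result cards one by one: click a card [opens the
## product in a new tab], scrape that tab, close it, then hop to the next
## sibling card [scrolling to it also keeps the infinite scroll loading]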

while nextItem:
    itemCt += 1 # counter
    cpHtml, cpTxt = '', '' # clear/initiate
    resultsTab = browser.current_window_handle # to go back

    try: # click card -> open new tab -> scrape product details
        cpHtml, cpTxt = nextItem.get_attribute('outerHTML'), nextItem.text
        scrollClick(browser, nextItem) ## click current card
        # add wait ?
        browser.switch_to.window(browser.window_handles[1]) ## go to 2nd tab
        WebDriverWait(browser, 5).until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, 'div.detail-item'))) ## wait to load details
        pLink = browser.current_url ## product URL
        if pLink not in scrapedLinks: # skip duplicates [just in case]
            products.append(getProductDetails(browser.page_source, pLink))
        scrapedLinks.append(pLink)
    except Exception as e:
        print('!', [itemCt], ' '.join(cpTxt.split()), repr(e)) ## print error
        pSoup = BeautifulSoup(cpHtml, 'lxml')
        products.append({
            'title': selectGet(pSoup, 'a[title]', 'title'), ## card root is the <a>
            'price': selectGet(pSoup, 'span.ItemCard__price'),
            'errorMsg': f'{type(e)} {e}'
        }) ## [ make do with info in card ]

    try: # close all tabs other than results tab
        for w in browser.window_handles:
            if w != resultsTab:
                browser.switch_to.window(w)
                browser.close()
            browser.switch_to.window(resultsTab)
    except Exception as e:
        print('Failed to restore results-tab-only window:', repr(e))
        break

    # print('', end=f"\r[{itemCt} of {maxItems}] {' '.join(cpTxt.split())} {repr(e)}")

    if isinstance(maxItems, int) and itemCt >= maxItems:
        break ## stop once maxItems cards have been processed

    nextItem = selectNextSib(browser, nextItem, itemSel) # get next result card
################################################################################

pd.DataFrame(products).to_csv(saveTo, index=False) ## SAVE RESULTS ##
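
## [optional] minimal sketch for running without a visible window, assuming a
## Chrome version that supports the `--headless=new` flag [swap into MAIN CODE]:
##   from selenium.webdriver.chrome.options import Options
##   opts = Options()
##   opts.add_argument('--headless=new')
##   browser = webdriver.Chrome(options=opts)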