Try95th

stackoverflow_q_74379884

Nov 16th, 2022 (edited)
149
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.34 KB | None | 0 0
  1. ## Edited from https://stackoverflow.com/questions/74379884/scraping-produce-image-by-beautifulsoup
  2.  
  3. import time
  4. from linkToSoup_selenium import * # paste from https://pastebin.com/kEC9gPC8
  5.  
  6. ## Instead of explicit for-loops, the two functions below are called from comprehensions ##
  7.  
  8. def getTagAttr(tagSoup, targetAttr, defVal=None):
  9.     if 'bs4.element.ResultSet' in str(type(tagSoup)):
  10.         return [getTagAttr(t, targetAttr, defVal) for t in tagSoup]
  11.     if tagSoup is None:  
  12.         attrVal = defVal
  13.     elif targetAttr == '':
  14.         attrVal = tagSoup.get_text(' ', strip=True)
  15.     elif targetAttr == '"str1"':
  16.         strings = [s for s in tagSoup.strings]
  17.         attrVal = strings[0] if strings else defVal
  18.     else: attrVal = tagSoup.get(targetAttr, defVal)
  19.     return attrVal
  20.    
  21. def getProductData(product_link, selRef):
  22.     # Get product information from page
  23.     #Moreinfo = requests.get(product_link, headers=headers)
  24.     #Product_Details = BeautifulSoup(Moreinfo.content, "html.parser")
  25.     #time.sleep(1)
  26.  
  27.     Product_Details = linktosoup_selenium(product_link)
  28.     if Product_Details is None:
  29.         return {
  30.             'product_link': product_link,
  31.             'errorMessage': 'Failed to scrape'
  32.         }
  33.  
  34.     pDets = {k: getTagAttr((
  35.         Product_Details.select(sel) if k[-5:] == '_list'
  36.         else Product_Details.select_one(sel)
  37.     ), tattr) for k, (sel, tattr) in selRef.items()}
  38.  
  39.     ### only for this site ###
  40.     if 'product_id' in pDets and pDets['product_id']:
  41.         pDets['product_id'] = 'N' + pDets['product_id'].split('-N')[-1]
  42.     pDets['product_link'] = product_link
  43.     # print(Product_Details.select('.swiper-wrapper .lazyload-wrapper div'))
  44.     ##########################
  45.    
  46.     return pDets
  47.  
  48.  
  49. rootUrl = 'https://www.noon.com'
  50. page_numbers = 1
  51. selectorsRef = {
  52.     'product_id': ('*[data-qa*="-N"]', 'data-qa'),
  53.     'product_brand': ('div[data-qa^="pdp-brand-"]', ''),
  54.     'product_name': ('h1[data-qa^="pdp-name"]', ''),
  55.     'product_price_text': ('div[data-qa^="pdp-price"]', ''),
  56.     'current_price': ('div[data-qa^="pdp-price"] .priceNow', '"str1"'),  
  57.     'imageLinks_list': ('.swiper-wrapper img', 'src')
  58. }
  59.  
  60. productPages = []
  61.  
  62. while True:
  63.     URL = f"{rootUrl}/egypt-ar/beauty-and-health/beauty/eg-nov22-clearance-2/?limit=50&page={page_numbers}"
  64.  
  65.     #page = requests.get(URL, headers=headers)
  66.     #soup = BeautifulSoup(page.text, "html.parser")    
  67.     soup = linktosoup_selenium(URL)
  68.     if soup is None: break
  69.  
  70.     product_Source = soup.find("div", {"data-qa": "searchHeader"})
  71.     if product_Source is None:
  72.         break # break infinite loop if page has no searchHeader
  73.    
  74.     product_String = getTagAttr(product_Source, '')
  75.     products_Numbers = getTagAttr(product_Source, '"str1"')
  76.     if products_Numbers.split()[0].isdigit():
  77.         products_Numbers = int(products_Numbers.split()[0])
  78.     print(f'page {page_numbers} - {products_Numbers} products @ {URL}')
  79.    
  80.     time.sleep(1)
  81.     productPage = {
  82.         'page_number': page_numbers, 'page_link': URL,
  83.         'products_count': products_Numbers,
  84.         'products': [
  85.             getProductData(rootUrl + a.get('href'), selectorsRef)
  86.             for a in soup.select("span.productContainer > a[href]")
  87.         ] ## INSTEAD OF FOR LOOP ##
  88.     }
  89.     productPages.append(productPage)
  90.     page_numbers += 1
Advertisement
Add Comment
Please, Sign In to add comment