Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ## Edited from https://stackoverflow.com/questions/74379884/scraping-produce-image-by-beautifulsoup
- import time
- from linkToSoup_selenium import * # paste from https://pastebin.com/kEC9gPC8
- ## Instead of for-loop, the 2 functions below are used with list comprehension ##
def getTagAttr(tagSoup, targetAttr, defVal=None):
    """Extract an attribute (or text) from a bs4 tag, with a default fallback.

    Args:
        tagSoup: a bs4 Tag, a ResultSet/list of tags, or None.
        targetAttr: attribute name to read; '' returns the tag's full text;
            the sentinel '"str1"' returns the tag's first string fragment.
        defVal: value returned when the tag is missing or lacks the attribute.

    Returns:
        The attribute value, the tag text, or defVal; for a list/ResultSet
        input, a list with one such value per element.
    """
    # bs4's ResultSet subclasses list, so this also covers plain lists of tags.
    if isinstance(tagSoup, list):
        return [getTagAttr(t, targetAttr, defVal) for t in tagSoup]
    if tagSoup is None:
        return defVal
    if targetAttr == '':
        return tagSoup.get_text(' ', strip=True)
    if targetAttr == '"str1"':
        # First navigable string inside the tag, or defVal if there are none.
        return next(iter(tagSoup.strings), defVal)
    return tagSoup.get(targetAttr, defVal)
def getProductData(product_link, selRef):
    """Scrape one product page and extract the fields named in selRef.

    Args:
        product_link: absolute URL of the product page.
        selRef: dict mapping output key -> (CSS selector, attribute name);
            keys ending in '_list' select all matches, other keys only the
            first match.

    Returns:
        dict of extracted values plus 'product_link'; if the page could not
        be scraped, a dict with 'product_link' and an 'errorMessage'.
    """
    Product_Details = linktosoup_selenium(product_link)
    if Product_Details is None:
        return {
            'product_link': product_link,
            'errorMessage': 'Failed to scrape'
        }
    pDets = {k: getTagAttr((
        Product_Details.select(sel) if k.endswith('_list')
        else Product_Details.select_one(sel)
    ), tattr) for k, (sel, tattr) in selRef.items()}
    ### only for this site ###
    # The scraped data-qa value embeds the id after '-N'; normalize to 'N...'.
    if pDets.get('product_id'):
        pDets['product_id'] = 'N' + pDets['product_id'].split('-N')[-1]
    pDets['product_link'] = product_link
    ##########################
    return pDets
rootUrl = 'https://www.noon.com'
page_numbers = 1

# Output key -> (CSS selector, attribute); '' means tag text, '"str1"' means
# the first string fragment; keys ending in '_list' collect every match.
selectorsRef = {
    'product_id': ('*[data-qa*="-N"]', 'data-qa'),
    'product_brand': ('div[data-qa^="pdp-brand-"]', ''),
    'product_name': ('h1[data-qa^="pdp-name"]', ''),
    'product_price_text': ('div[data-qa^="pdp-price"]', ''),
    'current_price': ('div[data-qa^="pdp-price"] .priceNow', '"str1"'),
    'imageLinks_list': ('.swiper-wrapper img', 'src')
}

productPages = []
while True:
    URL = f"{rootUrl}/egypt-ar/beauty-and-health/beauty/eg-nov22-clearance-2/?limit=50&page={page_numbers}"
    soup = linktosoup_selenium(URL)
    if soup is None:
        break
    searchHeader = soup.find("div", {"data-qa": "searchHeader"})
    if searchHeader is None:
        # No result header on the page -> we paged past the last listing.
        break
    headerText = getTagAttr(searchHeader, '')
    productCount = getTagAttr(searchHeader, '"str1"')
    countWord = productCount.split()[0]
    if countWord.isdigit():
        productCount = int(countWord)
    print(f'page {page_numbers} - {productCount} products @ {URL}')
    time.sleep(1)
    productPages.append({
        'page_number': page_numbers, 'page_link': URL,
        'products_count': productCount,
        'products': [
            getProductData(rootUrl + anchor.get('href'), selectorsRef)
            for anchor in soup.select("span.productContainer > a[href]")
        ]  # list comprehension instead of an explicit for-loop
    })
    page_numbers += 1
Advertisement
Add Comment
Please, Sign In to add comment