## stackoverflow_q_74379884

## Edited from https://stackoverflow.com/questions/74379884/scraping-produce-image-by-beautifulsoup

import time
from linkToSoup_selenium import * # paste from https://pastebin.com/kEC9gPC8

## Instead of for-loop, the 2 functions below are used with list comprehension ##

def getTagAttr(tagSoup, targetAttr, defVal=None):
    """Extract one value from a tag, or a list of values from a collection of tags.

    Args:
        tagSoup: A tag-like object (anything exposing ``get``, ``get_text``
            and ``strings`` the way they are used below), ``None``, or a
            list/tuple of those. ``bs4.element.ResultSet`` is a ``list``
            subclass, so results of ``select()`` / ``find_all()`` are
            handled element by element.
        targetAttr: Selects what to extract:
            ``''`` -> the tag's text, joined with single spaces and stripped;
            ``'"str1"'`` -> the first item yielded by ``tagSoup.strings``;
            anything else -> the attribute with that name.
        defVal: Returned when the tag is ``None``, when it yields no strings
            (``'"str1"'`` mode), or when the attribute is missing.

    Returns:
        The extracted value, or a list with one extracted value per element
        when ``tagSoup`` is a list/tuple.
    """
    # isinstance instead of matching on the type's repr: also accepts plain
    # lists/tuples of tags (e.g. a sliced ResultSet) and survives renames.
    if isinstance(tagSoup, (list, tuple)):
        return [getTagAttr(t, targetAttr, defVal) for t in tagSoup]
    if tagSoup is None:
        return defVal
    if targetAttr == '':
        return tagSoup.get_text(' ', strip=True)
    if targetAttr == '"str1"':
        # Only the first string is needed, so don't materialise all of them.
        return next(iter(tagSoup.strings), defVal)
    return tagSoup.get(targetAttr, defVal)

def getProductData(product_link, selRef):
    """Scrape a single product page into a flat dict of fields.

    Args:
        product_link: URL of the product page; fetched through
            ``linktosoup_selenium``.
        selRef: Mapping of ``field name -> (CSS selector, target attribute)``.
            Fields whose name ends in ``'_list'`` collect every match of the
            selector; all other fields use only the first match. The target
            attribute is interpreted by ``getTagAttr``.

    Returns:
        A dict with one entry per key of ``selRef`` plus ``'product_link'``.
        If the page could not be loaded (``linktosoup_selenium`` returned
        ``None``), a dict holding only ``'product_link'`` and
        ``'errorMessage'``.
    """
    page_soup = linktosoup_selenium(product_link)
    if page_soup is None:
        return {'product_link': product_link, 'errorMessage': 'Failed to scrape'}

    details = {}
    for field, (css, attr_name) in selRef.items():
        if field.endswith('_list'):
            matched = page_soup.select(css)
        else:
            matched = page_soup.select_one(css)
        details[field] = getTagAttr(matched, attr_name)

    ### only for this site ###
    # Keep just the part of the matched value after its last '-N' and put the
    # 'N' back in front of it.
    raw_id = details.get('product_id')
    if raw_id:
        details['product_id'] = 'N' + raw_id.rpartition('-N')[2]
    details['product_link'] = product_link
    ##########################

    return details


# Site root; the hrefs found on a listing page are appended to it.
rootUrl = 'https://www.noon.com'
# Listing page currently being scraped; incremented after each page.
page_numbers = 1
# field name -> (CSS selector, target attribute), consumed by getProductData.
# Target attribute '' means the tag's text, '"str1"' its first string
# (see getTagAttr); names ending in '_list' collect every match.
selectorsRef = {
    'product_id': ('*[data-qa*="-N"]', 'data-qa'),
    'product_brand': ('div[data-qa^="pdp-brand-"]', ''),
    'product_name': ('h1[data-qa^="pdp-name"]', ''),
    'product_price_text': ('div[data-qa^="pdp-price"]', ''),
    'current_price': ('div[data-qa^="pdp-price"] .priceNow', '"str1"'),
    'imageLinks_list': ('.swiper-wrapper img', 'src')
}

# One dict per listing page scraped, in page order.
productPages = []

while True:
    URL = f"{rootUrl}/egypt-ar/beauty-and-health/beauty/eg-nov22-clearance-2/?limit=50&page={page_numbers}"

    soup = linktosoup_selenium(URL)
    if soup is None:
        break  # page could not be loaded

    product_Source = soup.find("div", {"data-qa": "searchHeader"})
    if product_Source is None:
        break  # break infinite loop if page has no searchHeader

    product_String = getTagAttr(product_Source, '')
    products_Numbers = getTagAttr(product_Source, '"str1"')
    # The header's first string is expected to start with the product count.
    # It can be None (header without text) or blank, so guard before indexing;
    # when no leading number is found the raw value is kept as-is.
    count_tokens = products_Numbers.split() if products_Numbers else []
    if count_tokens and count_tokens[0].isdigit():
        products_Numbers = int(count_tokens[0])
    print(f'page {page_numbers} - {products_Numbers} products @ {URL}')

    time.sleep(1)
    product_anchors = soup.select("span.productContainer > a[href]")
    productPage = {
        'page_number': page_numbers, 'page_link': URL,
        'products_count': products_Numbers,
        'products': [
            getProductData(rootUrl + a.get('href'), selectorsRef)
            for a in product_anchors
        ]
    }
    productPages.append(productPage)

    if not product_anchors:
        # A page that still renders the search header but lists no products
        # would otherwise keep this loop requesting pages forever.
        break
    page_numbers += 1