getUniqLinks [tap_az] for q_74598056

############ PRINTED OUTPUT AT BOTTOM ############
## for https://stackoverflow.com/q/74598056/6146136

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd


rootUrl = 'https://tap.az'
fullUrl = f'{rootUrl}/elanlar/dasinmaz-emlak/menziller'

def getUniqLinks(URL, rUrl, max_scrolls=40, maxFails=5, tmo=3):
    elsViz = EC.visibility_of_all_elements_located
    scrollElToTop = "arguments[0].scrollIntoView(true);"

    driverX = webdriver.Chrome()

    driverX.get(URL)
    print(driverX.current_url)

    # the bookmark anchor inside each product card; its href is the
    # product URL plus a '/bookmark' suffix
    plXpath = '//div[@class="products-bookmarking"]//a[@href]'
    # space-padded class test so 'products-i' only matches as a whole token
    piXclas = "concat(' ',normalize-space(@class),' '),' products-i '"
    # product cards that actually contain such a link
    piXpath = f"//div[contains({piXclas})][{plXpath[2:]}]"
    # cards that appear after a given card (used to detect lazy-loaded batches)
    nxtp_xpath = f'/following-sibling::{piXpath[2:]}'
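    # for reference, the assembled XPaths expand to:
    #   piXpath:    //div[contains(concat(' ',normalize-space(@class),' '),
    #               ' products-i ')][div[@class="products-bookmarking"]//a[@href]]
    #   nxtp_xpath: /following-sibling::div[contains(...)][div[...]//a[@href]]
    #               (abbreviated; same predicates as piXpath)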
  29.  
  30.     WebDriverWait(driverX, tmo).until(elsViz((By.XPATH, piXpath)))
  31.     failCt = 0
  32.     prodLinks = []  # remove
  33.     for sci in range(max_scrolls):
  34.         prods = driverX.find_elements(By.XPATH, piXpath)
  35.  
  36.         # remove below ################################### # remove
  37.         if sci == 0:  # remove
  38.             prodLinks += [p.find_element(  # remove
  39.                 By.XPATH, plXpath # remove
  40.             ).get_attribute('href') for p in prods]  # remove
  41.         # remove above ################################### # remove
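        # NOTE: plXpath begins with '//', so p.find_element still searches the
        # whole document and returns the first bookmark link on the page for
        # every product; that is why the Selenium-collected list ends up with
        # just 1 unique link in the printed output. Prefixing it as
        # f'.{plXpath}' would scope the search to each product card instead.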

        # scroll the last loaded product into view to trigger lazy-loading
        driverX.execute_script(scrollElToTop, prods[-1])

        # wait for product cards that follow the current second-to-last card
        nxtp_xful = f'{piXpath}[{len(prods) - 1}]{nxtp_xpath}'
        try:
            WebDriverWait(driverX, tmo).until(elsViz((By.XPATH, nxtp_xful)))
        except TimeoutException:
            failCt += 1
            print(f'\n[Failed to load more fail#{failCt} (max: {maxFails})]')
            if failCt < maxFails:
                continue
            break
        nprods = driverX.find_elements(By.XPATH, nxtp_xful)

        # remove below ################################### # remove
        prodLinks += [p.find_element(  # remove
            By.XPATH, plXpath  # remove
        ).get_attribute('href') for p in nprods]  # remove
        # remove above ################################### # remove

        # '\r' keeps the progress display on a single overwritten line
        print('', end=(f'\r[{sci+1} of {max_scrolls}]' + (
            f'  [started with] {len(prods)} products --> '
        ) + f'  {len(nprods)} new products'))

    # remove below ######################################### # remove
    uniqLinks = list(set([l.removesuffix('/bookmark') for l in prodLinks]))  # remove
    print(  # remove
        f'\nWITH SELENIUM found {len(prodLinks)} product links',  # remove
        f'[{len(uniqLinks)} unique]'  # remove
    )  # remove
    print('\n'.join(uniqLinks[:5]))  # remove
    # remove above ######################################### # remove

    print('', end='\n\nParsing page_source...')
    # same product/link selection as the XPaths above, as CSS selectors
    soup = BeautifulSoup(driverX.page_source, 'html.parser')
    plSel = '.products-bookmarking a[href]'
    piSel = f'div.products-i:has({plSel})'
    prodLinks_parsed = [(
        rUrl + p.select_one(plSel).get('href')
    ) for p in soup.select(piSel)]
    # .removesuffix trims the exact '/bookmark' ending [Python 3.9+];
    # str.strip('/bookmark') would treat its argument as a character set
    # and could over-trim from either end
    uniqLinks_parsed = list(set([
        l.removesuffix('/bookmark') for l in prodLinks_parsed
    ]))
    print(
        '\rPARSED PAGE_SOURCE  --->  found ',
        f' {len(prodLinks_parsed)} product links ',
        f' [{len(uniqLinks_parsed)} unique]'
    )
    print('\n'.join(uniqLinks_parsed[:5]))
    driverX.quit()  # just in case
    del driverX  # just in case

    # return uniqLinks # remove
    return uniqLinks_parsed


prodUrls = getUniqLinks(fullUrl, rootUrl, max_scrolls=250, tmo=1)
pd.DataFrame({'links': prodUrls}).to_csv('prodLinks.csv', index=False)
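
# A possible follow-up (not in the original paste): fetch one of the collected
# product pages with requests and parse it with BeautifulSoup. Left commented
# out so the printed output documented below stays accurate; the User-Agent
# header and the 'h1' selector are assumptions that would need checking
# against the real tap.az page structure.
# import requests
# if prodUrls:
#     resp = requests.get(prodUrls[0], headers={'User-Agent': 'Mozilla/5.0'})
#     resp.raise_for_status()  # fail loudly on a blocked/failed request
#     pSoup = BeautifulSoup(resp.content, 'html.parser')
#     h1 = pSoup.select_one('h1')  # hypothetical selector for the listing title
#     print(prodUrls[0], '-->', h1.get_text(strip=True) if h1 else '[no h1]')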



'''######################## PRINTED OUTPUT ########################
https://tap.az/elanlar/dasinmaz-emlak/menziller
[176 of 250]  [started with] 8455 products -->   42 new products
[Failed to load more fail#1 (max: 5)]
[194 of 250]  [started with] 9271 products -->   42 new products
[Failed to load more fail#2 (max: 5)]
[235 of 250]  [started with] 11863 products -->   42 new products
[Failed to load more fail#3 (max: 5)]
[250 of 250]  [started with] 12535 products -->   42 new products
WITH SELENIUM found 10957 product links [1 unique]
https://tap.az/elanlar/dasinmaz-emlak/menziller/35903364


PARSED PAGE_SOURCE  --->  found   12583 product links   [12576 unique]
https://tap.az/elanlar/dasinmaz-emlak/menziller/35828514
https://tap.az/elanlar/dasinmaz-emlak/menziller/35833080
https://tap.az/elanlar/dasinmaz-emlak/menziller/35898353
https://tap.az/elanlar/dasinmaz-emlak/menziller/35837943
https://tap.az/elanlar/dasinmaz-emlak/menziller/35720966
################################################################'''