Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
############ PRINTED OUTPUT AT BOTTOM ############
## for https://stackoverflow.com/q/74598056/6146136
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# import requests
# from bs4 import BeautifulSoup
# import pandas as pd
# Site root; passed to getUniqLinks as `rUrl`, where it is prefixed onto the
# hrefs parsed out of the page source.
rootUrl = 'https://tap.az'
# Start page that getUniqLinks opens and scrolls through.
fullUrl = f'{rootUrl}/elanlar/dasinmaz-emlak/menziller'
_BOOKMARK_SUFFIX = '/bookmark'


def _stripBookmark(link):
    """Return *link* without a trailing ``/bookmark`` segment, if present."""
    # NOTE: str.strip('/bookmark') would treat its argument as a *set of
    # characters* and eat any of "/bokmar" from both ends of the string,
    # so the suffix is removed explicitly instead.
    if link.endswith(_BOOKMARK_SUFFIX):
        return link[:-len(_BOOKMARK_SUFFIX)]
    return link


def getUniqLinks(URL, rUrl, max_scrolls=40, maxFails=5, tmo=3):
    """Scroll a product listing in Chrome and return its unique product links.

    Opens ``URL``, repeatedly scrolls the last product card into view so the
    page loads more cards, then parses the final page source with
    BeautifulSoup and collects the bookmark link of every product card.

    Args:
        URL: Address of the listing page to open.
        rUrl: Prefix prepended to every href parsed from the page source.
        max_scrolls: Maximum number of scroll iterations.
        maxFails: Scrolling stops once this many waits for new cards failed.
        tmo: Timeout handed to each ``WebDriverWait``.

    Returns:
        A list of de-duplicated links (``rUrl`` + href, with any trailing
        ``/bookmark`` removed). It is built from a set, so order is arbitrary.

    Raises:
        Whatever the initial page load / first wait raises (for example when
        no product card becomes visible within ``tmo``). The browser is quit
        in every case.
    """
    elsViz = EC.visibility_of_all_elements_located
    scrollElToTop = "arguments[0].scrollIntoView(true);"

    # plXpath: bookmark link; piXpath: product card containing such a link.
    plXpath = '//div[@class="products-bookmarking"]//a[@href]'
    piXclas = "concat(' ',normalize-space(@class),' '),' products-i '"
    piXpath = f"//div[contains({piXclas})][{plXpath[2:]}]"
    nxtp_xpath = f'/following-sibling::{piXpath[2:]}'
    # A leading '//' searches the whole document even when called on an
    # element, which returned the same first link for every card; the
    # leading '.' scopes the search to the card itself.
    relPlXpath = f'.{plXpath}'

    def cardHrefs(cards):
        """Return the bookmark href of each product-card element."""
        return [
            card.find_element(By.XPATH, relPlXpath).get_attribute('href')
            for card in cards
        ]

    driverX = webdriver.Chrome()
    try:
        driverX.get(URL)
        print(driverX.current_url)
        WebDriverWait(driverX, tmo).until(elsViz((By.XPATH, piXpath)))

        failCt = 0
        prodLinks = []  # debug-only tally; marked "remove" by the author
        for sci in range(max_scrolls):
            prods = driverX.find_elements(By.XPATH, piXpath)
            if sci == 0:
                prodLinks += cardHrefs(prods)  # debug-only ("remove")

            driverX.execute_script(scrollElToTop, prods[-1])
            # Cards that follow the previously known ones, i.e. the ones
            # expected to appear after scrolling.
            nxtp_xful = f'{piXpath}[{len(prods) - 1}]{nxtp_xpath}'
            try:
                WebDriverWait(driverX, tmo).until(
                    elsViz((By.XPATH, nxtp_xful)))
            except Exception:
                # Best effort: any failed wait counts as a miss. Catching
                # Exception (not a bare except) keeps Ctrl-C working.
                failCt += 1
                print(
                    f'\n[Failed to load more fail#{failCt} (max: {maxFails})]')
                if failCt >= maxFails:
                    break
                continue

            nprods = driverX.find_elements(By.XPATH, nxtp_xful)
            prodLinks += cardHrefs(nprods)  # debug-only ("remove")
            progress = (
                f'\r[{sci+1} of {max_scrolls}]'
                f' [started with] {len(prods)} products --> '
                f' {len(nprods)} new products'
            )
            print(progress, end='')

        # Debug-only summary of the Selenium-side links ("remove").
        uniqLinks = list({_stripBookmark(link) for link in prodLinks})
        print(
            f'\nWITH SELENIUM found {len(prodLinks)} product links',
            f'[{len(uniqLinks)} unique]'
        )
        print('\n'.join(uniqLinks[:5]))

        print('\n\nParsing page_source...', end='')
        pageHtml = driverX.page_source
    finally:
        # Always release the browser, including when a step above raised.
        driverX.quit()

    soup = BeautifulSoup(pageHtml, 'html.parser')
    plSel = '.products-bookmarking a[href]'
    piSel = f'div.products-i:has({plSel})'
    prodLinks_parsed = [
        rUrl + card.select_one(plSel).get('href')
        for card in soup.select(piSel)
    ]
    uniqLinks_parsed = list(
        {_stripBookmark(link) for link in prodLinks_parsed})
    print(
        f'\rPARSED PAGE_SOURCE ---> found ',
        f' {len(prodLinks_parsed)} product links ',
        f' [{len(uniqLinks_parsed)} unique]'
    )
    print('\n'.join(uniqLinks_parsed[:5]))

    return uniqLinks_parsed
# Collect the links: up to 250 scroll iterations, with tmo=1 forwarded to
# every WebDriverWait inside getUniqLinks.
prodUrls = getUniqLinks(fullUrl, rootUrl, max_scrolls=250, tmo=1)
# Save as a single-column CSV ('links'); index=False omits the row index.
pd.DataFrame({'links': prodUrls}).to_csv('prodLinks.csv', index=False)
'''######################## PRINTED OUTPUT ########################
https://tap.az/elanlar/dasinmaz-emlak/menziller
[176 of 250] [started with] 8455 products --> 42 new products
[Failed to load more fail#1 (max: 5)]
[194 of 250] [started with] 9271 products --> 42 new products
[Failed to load more fail#2 (max: 5)]
[235 of 250] [started with] 11863 products --> 42 new products
[Failed to load more fail#3 (max: 5)]
[250 of 250] [started with] 12535 products --> 42 new products
WITH SELENIUM found 10957 product links [1 unique]
https://tap.az/elanlar/dasinmaz-emlak/menziller/35903364
PARSED PAGE_SOURCE ---> found 12583 product links [12576 unique]
https://tap.az/elanlar/dasinmaz-emlak/menziller/35828514
https://tap.az/elanlar/dasinmaz-emlak/menziller/35833080
https://tap.az/elanlar/dasinmaz-emlak/menziller/35898353
https://tap.az/elanlar/dasinmaz-emlak/menziller/35837943
https://tap.az/elanlar/dasinmaz-emlak/menziller/35720966
################################################################'''
Advertisement
Add Comment
Please, Sign In to add comment