getUniqLinks [tap_az] for q_74598056

############ PRINTED OUTPUT AT BOTTOM ############
## for https://stackoverflow.com/q/74598056/6146136

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd


rootUrl = 'https://tap.az'
fullUrl = f'{rootUrl}/elanlar/dasinmaz-emlak/menziller'

def getUniqLinks(URL, rUrl, max_scrolls=40, maxFails=5, tmo=3):
    elsViz = EC.visibility_of_all_elements_located
    scrollElToTop = "arguments[0].scrollIntoView(true);"

    driverX = webdriver.Chrome()

    driverX.get(URL)
    print(driverX.current_url)

    # the bookmark anchor inside each product card; its href is the
    # product URL plus a '/bookmark' suffix
    plXpath = '//div[@class="products-bookmarking"]//a[@href]'
    # space-padded class test so 'products-i' only matches as a whole token
    piXclas = "concat(' ',normalize-space(@class),' '),' products-i '"
    # product cards that actually contain such a link
    piXpath = f"//div[contains({piXclas})][{plXpath[2:]}]"
    # cards that appear after a given card (used to detect lazy-loaded batches)
    nxtp_xpath = f'/following-sibling::{piXpath[2:]}'
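    # for reference, the assembled XPaths expand to:
    #   piXpath:    //div[contains(concat(' ',normalize-space(@class),' '),
    #               ' products-i ')][div[@class="products-bookmarking"]//a[@href]]
    #   nxtp_xpath: /following-sibling::div[contains(...)][div[...]//a[@href]]
    #               (abbreviated; same predicates as piXpath)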
  29.  
  30.     WebDriverWait(driverX, tmo).until(elsViz((By.XPATH, piXpath)))
  31.     failCt = 0
  32.     prodLinks = []  # remove
  33.     for sci in range(max_scrolls):
  34.         prods = driverX.find_elements(By.XPATH, piXpath)
  35.  
  36.         # remove below ################################### # remove
  37.         if sci == 0:  # remove
  38.             prodLinks += [p.find_element(  # remove
  39.                 By.XPATH, plXpath # remove
  40.             ).get_attribute('href') for p in prods]  # remove
  41.         # remove above ################################### # remove
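        # NOTE: plXpath begins with '//', so p.find_element still searches the
        # whole document and returns the first bookmark link on the page for
        # every product; that is why the Selenium-collected list ends up with
        # just 1 unique link in the printed output. Prefixing it as
        # f'.{plXpath}' would scope the search to each product card instead.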

        # scroll the last loaded product into view to trigger lazy-loading
        driverX.execute_script(scrollElToTop, prods[-1])

        # wait for product cards that follow the current second-to-last card
        nxtp_xful = f'{piXpath}[{len(prods) - 1}]{nxtp_xpath}'
        try:
            WebDriverWait(driverX, tmo).until(elsViz((By.XPATH, nxtp_xful)))
        except TimeoutException:
            failCt += 1
            print(f'\n[Failed to load more fail#{failCt} (max: {maxFails})]')
            if failCt < maxFails:
                continue
            break
        nprods = driverX.find_elements(By.XPATH, nxtp_xful)

        # remove below ################################### # remove
        prodLinks += [p.find_element(  # remove
            By.XPATH, plXpath  # remove
        ).get_attribute('href') for p in nprods]  # remove
        # remove above ################################### # remove

        # '\r' keeps the progress display on a single overwritten line
        print('', end=(f'\r[{sci+1} of {max_scrolls}]' + (
            f'  [started with] {len(prods)} products --> '
        ) + f'  {len(nprods)} new products'))

    # remove below ######################################### # remove
    uniqLinks = list(set([l.removesuffix('/bookmark') for l in prodLinks]))  # remove
    print(  # remove
        f'\nWITH SELENIUM found {len(prodLinks)} product links',  # remove
        f'[{len(uniqLinks)} unique]'  # remove
    )  # remove
    print('\n'.join(uniqLinks[:5]))  # remove
    # remove above ######################################### # remove

    print('', end='\n\nParsing page_source...')
    # same product/link selection as the XPaths above, as CSS selectors
    soup = BeautifulSoup(driverX.page_source, 'html.parser')
    plSel = '.products-bookmarking a[href]'
    piSel = f'div.products-i:has({plSel})'
    prodLinks_parsed = [(
        rUrl + p.select_one(plSel).get('href')
    ) for p in soup.select(piSel)]
    # .removesuffix trims the exact '/bookmark' ending [Python 3.9+];
    # str.strip('/bookmark') would treat its argument as a character set
    # and could over-trim from either end
    uniqLinks_parsed = list(set([
        l.removesuffix('/bookmark') for l in prodLinks_parsed
    ]))
    print(
        '\rPARSED PAGE_SOURCE  --->  found ',
        f' {len(prodLinks_parsed)} product links ',
        f' [{len(uniqLinks_parsed)} unique]'
    )
    print('\n'.join(uniqLinks_parsed[:5]))
    driverX.quit()  # just in case
    del driverX  # just in case

    # return uniqLinks # remove
    return uniqLinks_parsed


prodUrls = getUniqLinks(fullUrl, rootUrl, max_scrolls=250, tmo=1)
pd.DataFrame({'links': prodUrls}).to_csv('prodLinks.csv', index=False)
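
# A possible follow-up (not in the original paste): fetch one of the collected
# product pages with requests and parse it with BeautifulSoup. Left commented
# out so the printed output documented below stays accurate; the User-Agent
# header and the 'h1' selector are assumptions that would need checking
# against the real tap.az page structure.
# import requests
# if prodUrls:
#     resp = requests.get(prodUrls[0], headers={'User-Agent': 'Mozilla/5.0'})
#     resp.raise_for_status()  # fail loudly on a blocked/failed request
#     pSoup = BeautifulSoup(resp.content, 'html.parser')
#     h1 = pSoup.select_one('h1')  # hypothetical selector for the listing title
#     print(prodUrls[0], '-->', h1.get_text(strip=True) if h1 else '[no h1]')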



'''######################## PRINTED OUTPUT ########################
https://tap.az/elanlar/dasinmaz-emlak/menziller
[176 of 250]  [started with] 8455 products -->   42 new products
[Failed to load more fail#1 (max: 5)]
[194 of 250]  [started with] 9271 products -->   42 new products
[Failed to load more fail#2 (max: 5)]
[235 of 250]  [started with] 11863 products -->   42 new products
[Failed to load more fail#3 (max: 5)]
[250 of 250]  [started with] 12535 products -->   42 new products
WITH SELENIUM found 10957 product links [1 unique]
https://tap.az/elanlar/dasinmaz-emlak/menziller/35903364


PARSED PAGE_SOURCE  --->  found   12583 product links   [12576 unique]
https://tap.az/elanlar/dasinmaz-emlak/menziller/35828514
https://tap.az/elanlar/dasinmaz-emlak/menziller/35833080
https://tap.az/elanlar/dasinmaz-emlak/menziller/35898353
https://tap.az/elanlar/dasinmaz-emlak/menziller/35837943
https://tap.az/elanlar/dasinmaz-emlak/menziller/35720966
################################################################'''