Advertisement
Guest User

Amazon Site Scraper

a guest
Aug 26th, 2024
609
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.95 KB | Source Code | 0 0
  1. import requests
  2. import bs4
  3.  
# Search term sent to Amazon. Keep it specific — instead of "books" choose
# "books about trees" or something similarly narrow, so the (up to 100-page)
# scan below yields relevant results.
__QUERY__ = 'stickers'
  6.  
  7.  
  8. class SessionData:
  9.     def __init__(self, headers=None):
  10.         if headers is None:
  11.             headers = {
  12.                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"}
  13.         initial = requests.get(url="https://amazon.com/home", headers=headers)
  14.  
  15.         cookies_dict = initial.cookies.get_dict()
  16.         session_id = cookies_dict['session-id']
  17.         session_id_time = cookies_dict['session-id-time']
  18.         i18n_prefs = cookies_dict['i18n-prefs']
  19.  
  20.         self.session_id = session_id
  21.         self.session_id_time = session_id_time
  22.         self.i18n_prefs = i18n_prefs
  23.  
  24.         self.__dict__ = {'session-id': self.session_id, 'session-id-time': self.session_id_time,
  25.                          'i18n-prefs': self.i18n_prefs}
  26.  
  27.  
  28. class AmazonSession:
  29.     def __init__(self):
  30.         self.headers = {
  31.             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"}
  32.         self.sessionData = SessionData()
  33.  
  34.     def getRawSearchHTML(self, query: str, page: int = 1) -> requests.Response:
  35.         return requests.get(
  36.             url=f'https://www.amazon.com/s?k={query}&page={page}',
  37.             headers=self.headers,
  38.             cookies=self.sessionData.__dict__
  39.         )
  40.  
  41.     def getPaginationAmount(self, query: str) -> int:
  42.         return 100  # amazon does not have an actual system to find the maximum amount, https://amazaon.com/s?k=cord&page=100 xd
  43.  
  44.  
  45. s = AmazonSession()
  46.  
  47. for i in range(s.getPaginationAmount((__QUERY__))):
  48.     html_ = s.getRawSearchHTML(__QUERY__, i + 1).text
  49.  
  50.     soup = bs4.BeautifulSoup(html_, 'html.parser')
  51.     product_containers = soup.find_all(name='div', attrs={'data-component-type': 's-search-result'})
  52.  
  53.     print(f'found {len(product_containers)} results for the query: {__QUERY__} (page {i})')
  54.     if len(product_containers) == 0:
  55.         print('end of pagination detected')
  56.         break
  57.  
  58.     for product in product_containers:
  59.         title_card = product.find(name='div', attrs={'data-cy': 'title-recipe'})
  60.         if title_card:
  61.             title = title_card.find(name='span')
  62.             if title:
  63.                 print(f'* Found product: {title.text}')
  64.                 desc_ = product.find(name='span', attrs={'class': 'a-size-base-plus a-color-base a-text-normal'})
  65.                 if desc_:
  66.                     print(f'\tdescription -> {desc_.text}')
  67.                     img_ = product.find(name='img', attrs={'class': 's-image'})
  68.                     if img_ and img_['src']:
  69.                         print(f'\t\tthumbnail image => {img_['src']}')
  70.             else:
  71.                 print('[no title found]')
  72.         else:
  73.             print('product is invalid]')
  74.  
  75.         print("\n")
  76.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement