Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- import bs4
- # Make sure the query isn't too general: instead of "books", choose something like "books about trees".
- __QUERY__ = 'stickers'
class SessionData:
    """Cookies of an anonymous Amazon session, obtained from one bootstrap request.

    Constructing an instance performs a GET on ``https://amazon.com/home`` and
    keeps the ``session-id``, ``session-id-time`` and ``i18n-prefs`` cookies
    from the response.

    The instance ``__dict__`` is deliberately replaced by the cookie mapping
    (hyphenated cookie names as keys), so ``instance.__dict__`` can be handed
    directly to ``requests`` as the ``cookies`` argument. As a consequence the
    instance has no ordinary attributes.

    Args:
        headers: HTTP headers for the bootstrap request. Defaults to a desktop
            Chrome ``User-Agent``.
        timeout: Seconds to wait for the bootstrap request before ``requests``
            raises a timeout error instead of blocking forever.

    Raises:
        KeyError: If the response does not set every required cookie.
    """

    _BOOTSTRAP_URL = "https://amazon.com/home"
    _REQUIRED_COOKIES = ("session-id", "session-id-time", "i18n-prefs")
    _DEFAULT_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"}

    def __init__(self, headers=None, timeout=10):
        if headers is None:
            # Copy so the shared class-level default can never be mutated.
            headers = dict(self._DEFAULT_HEADERS)
        initial = requests.get(url=self._BOOTSTRAP_URL, headers=headers, timeout=timeout)
        # Class attributes live on the class, not in the instance dict, so the
        # mapping below contains the cookies and nothing else.
        self.__dict__ = self._required_cookies(initial.cookies.get_dict())

    @classmethod
    def _required_cookies(cls, cookies_dict):
        """Return only the required cookies from *cookies_dict*.

        Raises:
            KeyError: Naming every required cookie that is absent.
        """
        missing = [name for name in cls._REQUIRED_COOKIES if name not in cookies_dict]
        if missing:
            raise KeyError(
                f"bootstrap response did not set required cookie(s): {', '.join(missing)}"
            )
        return {name: cookies_dict[name] for name in cls._REQUIRED_COOKIES}
class AmazonSession:
    """Thin client for fetching Amazon search-result pages.

    On construction it creates a ``SessionData`` (which performs a network
    request) and reuses that object's cookies for every search request.
    """

    # Seconds to wait for a search request before requests raises a timeout
    # error; without it a stalled connection would block indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"}
        # Bootstrap with the same headers that the search requests will send.
        self.sessionData = SessionData(headers=self.headers)

    def getRawSearchHTML(self, query: str, page: int = 1) -> requests.Response:
        """Fetch one page of search results.

        Args:
            query: Search phrase; sent as the ``k`` query parameter.
            page: 1-based result page number; sent as the ``page`` parameter.

        Returns:
            The raw ``requests.Response`` for the search page.
        """
        return requests.get(
            url='https://www.amazon.com/s',
            # Let requests build the query string so spaces, '&', '#' and
            # other reserved characters in the query are percent-encoded.
            params={'k': query, 'page': page},
            headers=self.headers,
            # SessionData exposes its cookies as its instance __dict__.
            cookies=self.sessionData.__dict__,
            timeout=self.REQUEST_TIMEOUT,
        )

    def getPaginationAmount(self, query: str) -> int:
        """Return the number of result pages to try for *query*.

        Always returns 100: amazon does not have an actual system to find the
        maximum amount (e.g. https://amazon.com/s?k=cord&page=100), so callers
        are expected to stop once a page comes back empty.
        """
        return 100
def _report_product(product):
    """Print the title, description and thumbnail URL of one search-result container."""
    title_card = product.find(name='div', attrs={'data-cy': 'title-recipe'})
    if not title_card:
        print('[product is invalid]')
        return
    title = title_card.find(name='span')
    if not title:
        print('[no title found]')
        return
    print(f'* Found product: {title.text}')
    desc_ = product.find(name='span', attrs={'class': 'a-size-base-plus a-color-base a-text-normal'})
    if desc_:
        print(f'\tdescription -> {desc_.text}')
    img_ = product.find(name='img', attrs={'class': 's-image'})
    # .get() returns None instead of raising KeyError when the tag has no src.
    thumbnail = img_.get('src') if img_ else None
    if thumbnail:
        # Interpolating a plain variable avoids reusing the f-string's own
        # quote character inside the braces, which only parses on Python 3.12+.
        print(f'\t\tthumbnail image => {thumbnail}')


# Walk the result pages in order and stop at the first page with no results.
s = AmazonSession()
for page in range(1, s.getPaginationAmount(__QUERY__) + 1):
    html_ = s.getRawSearchHTML(__QUERY__, page).text
    soup = bs4.BeautifulSoup(html_, 'html.parser')
    product_containers = soup.find_all(name='div', attrs={'data-component-type': 's-search-result'})
    # Report the page number that was actually requested (1-based).
    print(f'found {len(product_containers)} results for the query: {__QUERY__} (page {page})')
    if not product_containers:
        print('end of pagination detected')
        break
    for product in product_containers:
        _report_product(product)
        # NOTE(review): the nesting level of this separator could not be
        # recovered from the source; assumed to be once per product - confirm.
        print("\n")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement