Advertisement
daniilak

Untitled

Oct 28th, 2021
205
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.85 KB | None | 0 0
  1. from requests import get
  2. from requests import Session
  3. from json import loads as json_loads, dumps as json_dumps
  4. from seleniumwire import webdriver
  5. from selenium.webdriver.chrome.options import Options
  6. from pickle import dump as pickle_dump, load as pickle_load
  7. from time import sleep
  8.  
  9. from conf import SEARCH_HEAD, proxy_options, executable_chrome_path, cookies_file, categories
  10.  
  11. def work_chrome(category, is_captcha = 0):
  12.     print('Open Browser')
  13.     chrome_options = Options()
  14.     chrome_options.add_extension(r'anitcaptcha_plugin.zip')
  15.     d = webdriver.Chrome(executable_path=executable_chrome_path, options=chrome_options)
  16.     d.get('https://www.ozon.ru'+category)
  17.     # поставить проверку, что не капча
  18.     if is_captcha == 1:
  19.         sleep(30)
  20.     pickle_dump(d.get_cookies(), open(cookies_file, "wb"))
  21.     print("Close Browser")
  22.     d.quit()
  23.  
  24. def debugging(text, name):
  25.     f = open('jsons/'+str(name)+'.json', 'w+', encoding='utf-8')
  26.     f.write(text)
  27.     f.close()
  28.  
  29. def make_session():
  30.     cookies = pickle_load(open(cookies_file, 'rb'))
  31.     session = Session()
  32.     for cookie in cookies:
  33.         session.cookies.set(cookie['name'], cookie['value'])
  34.     return session
  35.  
  36. def make_request(session, URL):
  37.     res = session.get(URL, headers=SEARCH_HEAD)
  38.     return res.text
  39.  
  40. def request_product(session, URL, category):
  41.     df = json_loads(make_request(session, URL))
  42.     if 'widgetStates' not in df:
  43.         print(f"Need captcha in request_product() url={URL}")
  44.         work_chrome(category, 1)
  45.         session = make_session()
  46.         df = json_loads(make_request(session, URL))
  47.     df = df['widgetStates']
  48.     for r in df:
  49.         debugging(json_dumps(json_loads(df[r])), r)
  50.         if 'addToFavorite' in r:
  51.             info = json_loads(df[r])['cellTrackingInfo']['product']
  52.             print(f"""Info product: \n URL={URL} \n ID={info['id']} \n title={info['title']} \n finalPrice={info['finalPrice']} \n price={info['price']} \n discount={info['discount']} \n brandName={info['brandName']} \n brandId={info['brandId']} \n categoryId={info['categoryId']} \n """)
  53.         if 'webCharacteristics' in r:
  54.             print("characteristics:")
  55.             if 'characteristics' in df[r]:
  56.                 for char in json_loads(df[r])['characteristics'][0]['short']:
  57.                     print(f""" key={char['key']} \n name={char['name']} \n value={char['values'][0]['text']} \n """)
  58.         if 'webGallery' in r:
  59.             for image in json_loads(df[r])['images']:
  60.                 print(f"image={image['src']}")
  61.         if 'webAspects' in r:
  62.             asp = json_loads(df[r])['aspects']
  63.             aspects_0 = asp[0]
  64.             print(aspects_0['type']+':')
  65.             for variant in aspects_0['variants']:
  66.                 print(f""" availability={variant['availability']} size={variant['data']['textRs'][0]['content']} coverImage={variant['data']['coverImage']}""")
  67.             if len(asp) == 2:
  68.                 aspects_1 = asp[1]
  69.                 print(aspects_0['type']+':')
  70.                 for variant in aspects_1['variants']:
  71.                     print(f""" availability={variant['availability']} size={variant['data']['textRs'][0]['content']} coverImage={variant['data']['coverImage']}""")
  72.     print("\n\n")
  73.  
  74. def request_category(category, index):
  75.     URL = "https://www.ozon.ru/api/composer-api.bx/page/json/v2?url="+category+"?page="+str(index)+"&page_changed=true"
  76.     if index == 1:
  77.         URL = "https://www.ozon.ru/api/composer-api.bx/page/json/v2?page_changed=true&url="+category
  78.    
  79.     session = make_session()
  80.     df = json_loads(make_request(session, URL))
  81.  
  82.     debugging(json_dumps(df), 'category_info')
  83.    
  84.     if 'widgetStates' not in df:
  85.         print(f"Need captcha in request_category() category={category} index={index}")
  86.         work_chrome(category, 1)
  87.         session = make_session()
  88.         df = json_loads(make_request(session, URL))
  89.         debugging(json_dumps(df), 'category_info')
  90.    
  91.     df = df['widgetStates']
  92.    
  93.     number = 0
  94.  
  95.     for el in df:
  96.         if 'searchResultsV2' not in el:
  97.             continue
  98.        
  99.         items = json_loads(df[el])['items']
  100.         for item in items:
  101.             link = 'https://www.ozon.ru/api/composer-api.bx/page/json/v2?url=' + item['action']['link'].split('/?asb')[0]+'/'
  102.             print(f"\n link product #{number}: {link}")
  103.             number += 1
  104.             # Если нужен перерыв между заходом в каждый товар, то ставим sleep
  105.             # sleep(1)
  106.             request_product(session, link, category)
  107.     return number
  108.  
  109. number = 0
  110. for category in categories:
  111.     # work_chrome(category)
  112.     number += request_category(category, 1)
  113.     number += request_category(category, 2)
  114.     number += request_category(category, 3)
  115.     number += request_category(category, 4)
  116. exit()
  117.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement