dEN5

python | parse yandex images | requests | json | params

Aug 18th, 2021 (edited)
7,297
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.00 KB | None | 0 0
#created dEN5#7360 (DISCORD)
#USE https://curl.trillworks.com/

import json
import re
import time
from random import choice

import requests
from bs4 import BeautifulSoup as bs
  8. type_img_d= {
  9.     "gif":"gifan",
  10.     "png":"png",
  11.     "jpg":"jpg"
  12.  
  13. }
  14.  
  15. type_img_size= {
  16.     "Большие":"large",
  17.     "Средние":"medium",
  18.     "Маленькие":"small"
  19.  
  20. }
  21.  
  22. def get_req_img_whith_yandex(query_mn,start_=0,limit=1,type_="choice",add_page = True):
  23.     img_size,type_img,recent = False,False,False
  24.     headers = {
  25.         'authority': 'yandex.ru',
  26.         'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
  27.         'device-memory': '4',
  28.         'rtt': '250',
  29.         'sec-ch-ua-mobile': '?0',
  30.         'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
  31.         'viewport-width': '791',
  32.         'accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
  33.         'x-requested-with': 'XMLHttpRequest',
  34.         'dpr': '1',
  35.         'downlink': '4.6',
  36.         'ect': '4g',
  37.         'sec-fetch-site': 'same-origin',
  38.         'sec-fetch-mode': 'cors',
  39.         'sec-fetch-dest': 'empty',
  40.         'referer': 'https://yandex.ru/images/search?from=tabbar&text=google%20search%20api',
  41.         'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5',}
  42.     list_links = []
  43.     list_dict = []
  44.     iter = 0
  45.     iter+=start_
  46.     start = time.monotonic()
  47.     end = float()
  48.     pager = []
  49.  
  50.     while add_page:
  51.         params = [
  52.             ('format', 'json'),
  53.             ('request', '{"blocks":[{"block":"extra-content","params":{},"version":2},{"block":"serp-controller","params":{},"version":2},{"block":"serp-list_infinite_yes","params":{"initialPageNum":0},"version":2},{"block":"more_direction_next","params":{},"version":2},{"block":"gallery__items:ajax","params":{},"version":2}],"metadata":{"bundles":{"lb":"jCgK5?b*G$Xvb>:BUOR$"},"assets":{"las":"justifier-height=1;thumb-underlay=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;ca993f.0=1;d30d05.0=1;105ac6.0=1;bed1df.0=1"},"version":"0x0f74f9d0500","extraContent":{"names":["i-react-ajax-adapter"]}},"bmt":{"lb":"jCgK5?b*G$Xvb>:BUOR$"},"amt":{"las":"justifier-height=1;thumb-underlay=1;justifier-setheight=1;fitimages-height=1;justifier-fitincuts=1;react-with-dom=1;ca993f.0=1;d30d05.0=1;105ac6.0=1;bed1df.0=1"}}'),
  54.             ('yu', '1656658121627748886'),
  55.             ('p', iter),
  56.             ('from', 'tabbar'),
  57.             ('text', query_mn),
  58.             ('rpt', 'image'),
  59.             ('serpid', 'a7tbQ4lJOYyrChOZD000iQ'),
  60.             ('serpListType', 'horizontal'),
  61.             ('thumbSnippet', '0'),
  62.  
  63.         ]
  64.         if type_img:
  65.             params.append(("itype",type_img_d[type_img]))
  66.         if recent:
  67.             params.append(("recent","7D"))
  68.         if img_size:
  69.             try:
  70.                 params.append(("isize",type_img_size[img_size]))
  71.             except:
  72.                 size_offset = [("isize","eq"),("iw",img_size[0]),("ih",img_size[1])]
  73.                 for i in size_offset:
  74.                     params.append(i)
  75.  
  76.  
  77.         response = requests.get('https://yandex.ru/images/search', headers=headers, params=params)
  78.         json_data = json.dumps(response.text)
  79.         json_without_slash = json.loads(json_data)
  80.         try:
  81.             data_json = json.loads(json_without_slash)["blocks"][2]['html']
  82.         except json.decoder.JSONDecodeError:
  83.             break
  84.         soup = bs(data_json, 'html.parser')
  85.         list_json = soup.find_all("div", class_=re.compile("serp-item serp-item_type_search serp-item_group_search serp-item_pos_.* serp-item_scale_yes justifier__item i-bem"))
  86.         list_links_t = []
  87.         for i in list_json:
  88.             items = i.get("data-bem")
  89.             item = json.loads(items)
  90.             serp_item = item["serp-item"]
  91.             list_links.append(serp_item["preview"][0]["url"])
  92.             list_links_t.append(serp_item["preview"][0]["url"])
  93.             list_dict.append(serp_item)
  94.         if limit>1:
  95.             iter+=1
  96.         print(iter)
  97.         pager.append({f"{iter}":list_links_t})
  98.  
  99.         if iter==limit+start_:
  100.             print(iter)
  101.             end =  time.monotonic()
  102.             break
  103.         print(len(list_links))
  104.     print(len(list_links),end-start)
  105.     if type_=="all":
  106.         return list_links
  107.     if type_=="choice":
  108.         return choice(list_links),len(list_links)
  109.     if type_=="dic_ch":
  110.         return choice(list_dict)
  111.     if type_=="p_dict":
  112.         return pager
Add Comment
Please, Sign In to add comment