dEN5

python | parse yandex images | requests | json | params | search by image | search by word

Jan 7th, 2022
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Scrape Yandex Images: search by keyword, by image URL, or by an uploaded image (reverse image search).
import json
from glob import glob

import lxml.html
import requests
from lxml.html.clean import Cleaner

# Browser-like headers shared by the Yandex Images requests below.
headers = {
    'authority': 'yandex.ru',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
    'device-memory': '8',
    'rtt': '150',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
    'viewport-width': '575',
    'dpr': '1',
    'downlink': '4.15',
    'ect': '4g',
    'sec-ch-ua-platform': '"Windows"',
    'accept': 'application/json',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5'
}


def info(doc_id):
    # Fetch related-image data for one document id via the images-apphost/rim endpoint.
    params = (
        ('docid', f'{doc_id}'),
        ('lang', 'ru'),
        ('mt', '1'),
        ('family', '0'),
        ('pornowhitelist', '1'),
        ('ipnd', '1'),
    )

    response = requests.get('https://yandex.ru/images-apphost/rim',
                            headers=headers, params=params).json()
    return response


def load_image(byte):
    # Upload raw image bytes and return the CBIR descriptor Yandex assigns to them.
    headers = {
        'authority': 'yandex.ru',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'device-memory': '8',
        'rtt': '200',
        'sec-ch-ua-mobile': '?0',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'viewport-width': '794',
        'content-type': 'image/jpeg',
        'dpr': '1',
        'downlink': '2.65',
        'ect': '4g',
        'sec-ch-ua-platform': '"Windows"',
        'accept': '*/*',
        'origin': 'https://yandex.ru',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'cors',
        'sec-fetch-dest': 'empty',
        'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5'
    }

    params = (
        ('cbird', '37'),
        ('images_avatars_size', 'preview'),
        ('images_avatars_namespace', 'images-cbir'),
    )

    response = requests.post('https://yandex.ru/images-apphost/image-download',
                             headers=headers, params=params, data=byte).json()
    print(response)
    return response


def getInfoImage(url):
    # Run an images search (by upload descriptor, plain text or image URL)
    # and yield the rimId of every result found in the page's data-bem blocks.
    headers = {
        'authority': 'yandex.ru',
        'cache-control': 'max-age=0',
        'device-memory': '8',
        'dpr': '1',
        'viewport-width': '1280',
        'rtt': '200',
        'downlink': '2.2',
        'ect': '4g',
        'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="96", "Google Chrome";v="96"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'accept-language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6,zh;q=0.5'
    }
    if isinstance(url, dict):
        # Upload descriptor from load_image(): swap the trailing size alias
        # ("preview") for "orig" and pull the cbir id out of the preview URL.
        params = (
            ('url', "/".join(url['url'].split("/")[:-1]) + "/orig"),
            ('cbir_id', url['url'].split("get-images-cbir/")
             [-1].split("/preview")[0]),
            ('cbir_page', 'similar'),
            ('rpt', 'imageview'),
            ('family', '0'),
            ('pornowhitelist', '1'),
            ('ipnd', '1'),
        )
    elif not url.startswith("http"):
        # A plain word or phrase: ordinary text search.
        params = (
            ('text', url),
            ('from', 'tabbar'),
            ('family', '0'),
            ('pornowhitelist', '1'),
            ('ipnd', '1'),
        )
    else:
        # A direct image URL: reverse image search.
        params = (
            ('url', url),
            ('cbir_page', 'similar'),
            ('rpt', 'imageview'),
            ('family', '0'),
            ('pornowhitelist', '1'),
            ('ipnd', '1'),
        )
    response = requests.get('https://yandex.ru/images/search',
                            headers=headers, params=params)
    print(response.url)
    root = lxml.html.fromstring(response.content)
    data = list(root.xpath('//*[@id]/@data-bem'))
    for i in data:
        i = json.loads(i)
        if "serp-item" in i:
            if "rimId" in i["serp-item"]:
                yield i["serp-item"]["rimId"]


def sJson(response, name):
    # Debug helper: save a cleaned copy of an HTML page to <name>.html.
    with open(f"{name}.html", "w", encoding="utf-8") as f:
        cleaner = Cleaner(style=True, scripts=True, javascript=True, inline_style=True, links=True, add_nofollow=False,
                          page_structure=True, safe_attrs_only=False)
        f.write(cleaner.clean_html(response))


# Accumulator for every direct image URL found during a search.
all_links = []


def vldc(elem):
    # Optional link check: True if the URL is reachable, False otherwise.
    try:
        requests.get(elem)
        return True
    except requests.RequestException:
        return False


def map_append(elem):
    all_links.append(elem["iu"])


def get_from_dict(all_links, response):
    # Pull the direct image URL ("iu") out of every hit in a /rim response.
    for i in response["rld"]:
        for elem in i["s"]:
            map_append(elem)


def Glob_matching(src):
    # True if src names a file in the current working directory.
    return src in glob("*.*")


def collect_links(responses):
    # Walk a sequence of /rim responses and gather every direct image URL.
    for resp in responses:
        for group in resp["rld"]:
            for hit in group["s"]:
                all_links.append(hit["iu"])
            # Groups that carry their own document id get expanded one level deeper.
            if "id" in group:
                get_from_dict(all_links, info(group["id"]))
    return all_links


def links_yd(uri):
    # Entry point: accepts a local file path, raw image bytes,
    # an image URL or a plain search phrase.
    if isinstance(uri, str) and (uri.startswith("C:") or Glob_matching(uri)):
        # Local file: upload its bytes, then search by the returned descriptor.
        with open(uri, "rb") as image:
            f = image.read()
        try:
            return collect_links(map(info, getInfoImage(load_image(f))))
        except Exception:
            return None
    elif isinstance(uri, bytes):
        # Raw image bytes.
        return collect_links(map(info, getInfoImage(load_image(uri))))
    else:
        # Image URL or text query.
        return collect_links(map(info, getInfoImage(uri)))


print(links_yd("cats"))
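The paste ends with a single search by word. A minimal usage sketch for the other input modes links_yd() accepts follows; the image URL and file paths are illustrative placeholders, not part of the original paste, and all_links is a module-level list, so repeated calls keep appending to the same list.

# Usage sketch: the URL and paths below are placeholders for illustration only.
print(links_yd("https://example.com/some-photo.jpg"))   # search by image URL
print(links_yd(r"C:\pictures\photo.jpg"))                # search by a local file path
with open("photo.jpg", "rb") as fh:
    print(links_yd(fh.read()))                           # search by raw image bytes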