Advertisement
Ksenia_C

Untitled

Nov 17th, 2022
962
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.01 KB | None | 0 0
  1. import datetime
  2. import json
  3. from selectolax.parser import HTMLParser
  4. import subprocess
  5. import urllib.parse
  6. from dataclasses import dataclass
  7.  
# Maps a city name, as it arrives in the ``city`` filter, to the value sent
# as the ``region`` query parameter of the cian.ru search URL.
COMMON_CITY_ID = {
    'Москва': 1,
    'Санкт-Петербург': 2,
}
# Maps a metro station name, as it arrives in the ``metro`` filter, to the
# value sent as the ``metro[0]`` query parameter of the search URL.
# NOTE(review): the name suggests these are stations on a route to HSE --
# confirm with the author.
WAY_TO_HSE = {
    'Славянский бульвар': 229,
    'Парк Победы': 87,
    'Киевская': 46,
    'Смоленская': 115,
    'Арбатская': 8,
    'Площадь Революции': 96,
    'Курская': 61,
}
  21.  
  22.  
@dataclass
class HouseFilter:
    """Search filters taken from a request query string.

    ``get_house_filters`` creates the instance with every field set to
    ``None`` and then stores the query-string values, which are strings,
    for the keys it finds. The annotations below therefore describe the
    intended types rather than what is guaranteed at runtime.
    """
    city: str  # looked up in COMMON_CITY_ID when the search URL is built
    # There is no ``type`` field: only flats are handled for now (the
    # original note says houses would need dynamic filters).
    room_cnt: int  # becomes the ``room<N>=1`` URL parameter
    floor: int  # used as both ``minfloor`` and ``maxfloor``
    price_min: int  # becomes ``minprice``
    price_max: int  # becomes ``maxprice``
    metro: str  # looked up in WAY_TO_HSE when the search URL is built
  32.  
  33.  
@dataclass
class HouseInfo:
    """Display data for a single offer."""
    pic_url: str  # URL of a picture of the offer
    price: float
    address: str
    title: str
    description: str
    # NOTE: spelled "contacs" (sic). Renaming it would change the
    # constructor's keyword interface, so the spelling is kept.
    # NOTE(review): presumably a contact link for the offer -- confirm.
    contacs: str
  42.  
  43.  
  44. def get_house_filters(path):
  45.     path_arr = path[path.find('?') + 1:].split('&')
  46.     result = HouseFilter(None, None, None, None, None, None)
  47.     for path_el in path_arr:
  48.         key, value = path_el.split('=')
  49.         if key in result.__dict__:
  50.             if key == 'city' or key == 'metro':
  51.                 result.__dict__[key] = urllib.parse.unquote(value)
  52.             else:
  53.                 result.__dict__[key] = value
  54.  
  55.     return result
  56.  
  57.  
  58. def get_house_info(info_jsoin):
  59.     house = HouseInfo(
  60.         info_jsoin['photos'][0]['fullUrl'],
  61.         info_jsoin['bargainTerms']['price'],
  62.         info_jsoin['title'],
  63.         info_jsoin['geo']['userInput'],
  64.         info_jsoin['description'],
  65.         info_jsoin['fullUrl'],
  66.     )
  67.     return {
  68.         'pic_url': house.pic_url,
  69.         'price': house.price,
  70.         'address': house.address,
  71.         'title': house.title,
  72.         'description': house.description,
  73.     }
  74.  
  75.  
  76. class Parser:
  77.     def __init__(self):
  78.         pass
  79.  
  80.     def get_response(self, filters):  # request.path
  81.         external_request = self.get_external_request(
  82.             get_house_filters(filters))
  83.         # print(external_request)
  84.         try:
  85.             result = subprocess.run(
  86.                 external_request, stdout=subprocess.PIPE, check=True)
  87.             response = result.stdout.decode()
  88.             return self.common(response)
  89.         except Exception:
  90.             return {}
  91.  
  92.     def get_debug_response(self):
  93.         response = None
  94.         with open('/home/ksenia/code/ux_parse/response.txt', 'r') as reponse_file:
  95.             response = reponse_file.read()
  96.         return self.common(response)
  97.  
  98.     def common(self, response):
  99.         tree = HTMLParser(response)
  100.         last_script = None
  101.         # find data section
  102.         for i in tree.css('script'):
  103.             if 'address' not in i.html:
  104.                 continue
  105.             last_script = i.html
  106.         with open(f'response_{datetime.now()}.txt', 'w') as file:
  107.             print("hwat the hell", file=file)
  108.             print(last_script, file=file)
  109.  
  110.         # get only json from <script> tags
  111.         last_script = last_script[last_script.find(
  112.             '.concat(') + len('.concat('):]
  113.         last_script = last_script[:last_script.find(');')]
  114.  
  115.         # find all offers
  116.         json_help = json.loads(last_script)
  117.         offers = []
  118.         for json_h in json_help:
  119.             if json_h['key'] != 'initialState':
  120.                 continue
  121.             offers = json_h['value']['results']['offers']
  122.             break
  123.  
  124.  
  125.  
  126.         # make result with offers info to show
  127.         results = []
  128.         for offer in offers:
  129.             results.append(get_house_info(offer))
  130.  
  131.         return results
  132.  
  133.     def add_browser_like_data(self, request):
  134.         headers = [
  135.             # smh hide. I don't know how right now
  136.             "'authority: www.cian.ru'",
  137.             "'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'",
  138.             "'accept-language: en-US,en;q=0.7'",
  139.             "'cache-control: max-age=0'",
  140.             # "'cookie: '",
  141.             "'cookie: _CIAN_GK=160223d5-f0b4-4a96-aed4-4663aea841dd; session_region_id=1; session_main_town_region_id=1; __cf_bm=ilHDpgHYhL2dbUv1qTEktV8FOZIi_N3w4gTA6YTRmg8-1667436585-0-AbFj/TkKElVMA0qfh9z8RUpnHbJLDhAYjXiFPXEkyTsn8f9r2ygIdAGpqq0ptt/7CrZzonWrTYJ583EEGKJhOG4=; adb=1; login_mro_popup=1; sopr_utm=%7B%22utm_source%22%3A+%22direct%22%2C+%22utm_medium%22%3A+%22None%22%7D; sopr_session=a9b2e3d890284873'",
  142.             # "'if-none-match: '",
  143.             "'if-none-match: W/\"2350d5-qQBbuDMMifONQ+WFAH1pGlrFEcs\"'",
  144.             "'referer: https://www.cian.ru/'",
  145.             "'sec-fetch-dest: document'",
  146.             "'sec-fetch-mode: navigate'",
  147.             "'sec-fetch-site: same-origin'",
  148.             # "'sec-fetch-user: '",
  149.             "'sec-fetch-user: ?1'",
  150.             # "'sec-gpc: '",
  151.             "'sec-gpc: 1'",
  152.             # "'upgrade-insecure-requests: 1'",
  153.             "'upgrade-insecure-requests: 1'",
  154.             # "'user-agent: '",
  155.             "'user-agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'",
  156.         ]
  157.         for header in headers:
  158.             request += ["-H", header]
  159.         request.append("--compressed")
  160.         return request
  161.  
  162.     def get_external_request(self, filters: HouseFilter):
  163.         url = 'https://www.cian.ru/cat.php?deal_type=rent&engine_version=2&offer_type=flat&type=2'
  164.         if filters.city:
  165.             url += f'&region={COMMON_CITY_ID[filters.city]}'
  166.             if COMMON_CITY_ID[filters.city] == 2:
  167.                 url.replace('www', 'spb', 1)
  168.         if filters.room_cnt:
  169.             url += f'&room{filters.room_cnt}=1'
  170.         if filters.floor:
  171.             url += f'&maxfloor={filters.floor}&minfloor={filters.floor}'
  172.         if filters.price_min:
  173.             url += f'&minprice={filters.price_min}'
  174.         if filters.price_max:
  175.             url += f'&maxprice={filters.price_max}'
  176.         if filters.metro:
  177.             url += f'&metro%5B0%5D={WAY_TO_HSE[filters.metro]}'
  178.         curl = ["curl", url]
  179.         return self.add_browser_like_data(curl)
  180.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement