daniilak

Untitled

Dec 22nd, 2021 (edited)
170
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.45 KB | None | 0 0
import json
import os
import time
from random import choice

import requests
from bs4 import BeautifulSoup
  6.  
  7. ##########################
  8. FOLDER_NAME_HTML = 'html_kadastres'
  9.  
  10. ##########################
  11.  
  12. SEARCH_HEAD={
  13.     'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
  14.     'accept-encoding': 'gzip, deflate, br',
  15.     'accept-language': 'ru,en;q=0.9',
  16.     'cache-control': 'no-cache',
  17.     'dnt': '1',
  18.     'pragma': 'no-cache',
  19.     'referer': 'https:/kadbase.ru',
  20.     'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
  21.     'sec-ch-ua-mobile': '?0',
  22.     'sec-ch-ua-platform': '"Windows"',
  23.     'sec-fetch-dest': 'document',
  24.     'sec-fetch-mode': 'navigate',
  25.     'sec-fetch-site': 'same-origin',
  26.     'sec-fetch-user': '?1',
  27.     'upgrade-insecure-requests': '1',
  28.     'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
  29. }
  30. def getProxy():
  31.     proxies = [
  32.             '192.168.0.1',
  33.         ]
  34.     proxies = choice(proxies)
  35.     proxyDict = {
  36.         'http': f'http://login:pass@{proxies}:8000',
  37.         'https':  f'https://login:pass@{proxies}:8000',
  38.     }
  39.     return proxyDict
  40. def getKadArr():
  41.     try:
  42.         page = requests.get('https://kadbase.ru/cadastre/', headers=SEARCH_HEAD, proxies=getProxy())
  43.     except Exception as e:
  44.         print(str(e))
  45.         exit()
  46.     df = page.text
  47.     # f = open('kadastre.html','r', encoding='utf-8')
  48.     # df = f.read()
  49.     # f.close()
  50.     soup = BeautifulSoup(df, 'html.parser')
  51.     list_kadastres = soup.find('ul', {'class':'ul_list_open'})
  52.     answer = []
  53.     for row in list_kadastres.findAll('a'):
  54.         print(row)
  55.         tab_lists = soup.find('div', {'id':str(row['number'])})
  56.         ins = []
  57.         for link in tab_lists.findAll('a'):
  58.             print(link)
  59.             if '01:00-uslovnyj-kadastrovyj-rajon' in link['href']:
  60.                 continue
  61.             ins.append({
  62.                 'href':link['href'],
  63.                 'number': link['number'],
  64.                 'text': link.text,
  65.             })
  66.         answer.append( {
  67.             'href':row['href'],
  68.             'number': row['number'],
  69.             'text': row.text,
  70.             'in': ins
  71.         })
  72.     with open('kadastres.json', 'w', encoding='utf-8') as f:
  73.         json.dump(answer, f, ensure_ascii=False, indent=4)
  74.     return answer
  75.  
  76.  
  77. def req(URL, page_index, is_start = 1, index = 0):
  78.     try:
  79.         page = requests.get(URL, headers=SEARCH_HEAD, proxies=getProxy())
  80.     except:
  81.         print("ERR", URL)
  82.         exit()
  83.     if 'Ваш IP-адрес заблокирован.' in page.text:
  84.         print('Ваш IP-адрес заблокирован.')
  85.         index = index + 1
  86.         if index > 5:
  87.             exit()
  88.         return req(URL, page_index, is_start, index)
  89.     try:
  90.         htmlfilename = FOLDER_NAME_HTML+'/'+URL.replace('https://kadbase.ru/cadastre/','').replace('/','_').replace(':','#').replace('?','&')+'.html'
  91.         f = open(htmlfilename, 'w+')
  92.         f.write(page.text)
  93.         f.close()
  94.     except Exception as e:
  95.         print(str(e))
  96.     soup = BeautifulSoup(page.text, 'html.parser')
  97.     list_kadastres = soup.findAll('div', {'class':'kad_num', 'attrth':'Кадастровый квартал'})
  98.     list_kadastres_array  = []
  99.     for el in list_kadastres:
  100.         list_kadastres_array.append(el.text)
  101.     if is_start == 0:
  102.         try:
  103.             total_records = soup.find('div', {'class':'total_records'}).text.split('(страниц ')[1].split(')')[0]
  104.         except:
  105.             total_records = 1
  106.         return list_kadastres_array, int(int(total_records) + 1)
  107.     return list_kadastres_array, None
  108.  
  109. def parserPage(URL):
  110.     list_kadastres_array, total_records = req(URL, 0, 0)
  111.     for i in range(1, total_records):
  112.         tmp, _ = req(URL+'?page='+str(i), i)
  113.         list_kadastres_array = list_kadastres_array + tmp
  114.         time.sleep(2)
  115.         exit()
  116.     return list_kadastres_array
  117.  
  118. a = getKadArr()
  119. # f = open('kadastres.json', 'r', encoding='utf-8')
  120. # a = json.loads(f.read())
  121. # f.close()
  122. for el in a:
  123.     for e in el['in']:
  124.         list_kadastres = parserPage('https://kadbase.ru'+e['href'])
  125.         # e['list'] = list_kadastres
  126.         # with open('kadastres.json', 'w', encoding='utf-8') as f:
  127.         #     json.dump(a, f, ensure_ascii=False, indent=4)
  128.        
  129.  
Add Comment
Please, Sign In to add comment