Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- from bs4 import BeautifulSoup
- import json
- import requests
- from random import choice
- import time
##########################
# Directory where each fetched listing page is cached as an .html file.
FOLDER_NAME_HTML = 'html_kadastres'
##########################
# Browser-like request headers so kadbase.ru serves the regular HTML pages
# instead of blocking the scraper outright.
SEARCH_HEAD = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'ru,en;q=0.9',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    # FIX: original value was the malformed URL 'https:/kadbase.ru' (single slash).
    'referer': 'https://kadbase.ru',
    'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36',
}
def getProxy():
    """Pick a random proxy host and return a requests-compatible proxy mapping.

    Returns:
        dict with 'http' and 'https' keys pointing at the chosen host on
        port 8000, with hard-coded login:pass credentials embedded in the URL.
    """
    hosts = [
        '192.168.0.1',
    ]
    host = choice(hosts)
    return {
        'http': f'http://login:pass@{host}:8000',
        'https': f'https://login:pass@{host}:8000',
    }
def getKadArr():
    """Download the cadastre index page and build the region/district tree.

    Parses the region list (<ul class="ul_list_open">) and, for each region,
    the matching district tab (<div id=...>), skipping the synthetic
    '01:00' district. The result is dumped to kadastres.json and returned.

    Returns:
        list of dicts: {'href', 'number', 'text', 'in': [district dicts]}.
    """
    try:
        page = requests.get('https://kadbase.ru/cadastre/',
                            headers=SEARCH_HEAD, proxies=getProxy())
    except Exception as e:
        print(str(e))
        exit()
    markup = page.text
    # Offline debugging alternative: read a previously saved copy instead.
    # with open('kadastre.html', 'r', encoding='utf-8') as f:
    #     markup = f.read()
    soup = BeautifulSoup(markup, 'html.parser')
    region_list = soup.find('ul', {'class': 'ul_list_open'})
    answer = []
    for region in region_list.findAll('a'):
        print(region)
        district_tab = soup.find('div', {'id': str(region['number'])})
        districts = []
        for link in district_tab.findAll('a'):
            print(link)
            # Skip the synthetic "conditional cadastral district" entry.
            if '01:00-uslovnyj-kadastrovyj-rajon' in link['href']:
                continue
            districts.append({
                'href': link['href'],
                'number': link['number'],
                'text': link.text,
            })
        answer.append({
            'href': region['href'],
            'number': region['number'],
            'text': region.text,
            'in': districts,
        })
    with open('kadastres.json', 'w', encoding='utf-8') as f:
        json.dump(answer, f, ensure_ascii=False, indent=4)
    return answer
def req(URL, page_index, is_start=1, index=0):
    """Fetch one listing page, cache its HTML to disk, and return the
    cadastral quarter numbers found on it.

    Parameters:
        URL: the listing page to fetch.
        page_index: page number (kept for signature compatibility; unused).
        is_start: pass 0 for a district's first page to also parse the
            total page count from the 'total_records' element.
        index: internal retry counter for IP-block retries (max 5).

    Returns:
        (numbers, total) where numbers is a list of quarter-number strings
        and total is int(page count) + 1 when is_start == 0, else None.
    """
    try:
        page = requests.get(URL, headers=SEARCH_HEAD, proxies=getProxy())
    except Exception:  # narrowed from a bare except; still abort on any request failure
        print("ERR", URL)
        exit()
    # The site answers with this text (instead of an HTTP error) when it
    # blocks the client; retry with a fresh random proxy up to 5 times.
    if 'Ваш IP-адрес заблокирован.' in page.text:
        print('Ваш IP-адрес заблокирован.')
        index = index + 1
        if index > 5:
            exit()
        return req(URL, page_index, is_start, index)
    try:
        htmlfilename = (FOLDER_NAME_HTML + '/'
                        + URL.replace('https://kadbase.ru/cadastre/', '')
                             .replace('/', '_').replace(':', '#').replace('?', '&')
                        + '.html')
        # Explicit utf-8: the page text contains Cyrillic, and the platform
        # default encoding (e.g. cp1251/cp1252 on Windows) would raise.
        # 'with' guarantees the handle is closed even if write() fails.
        with open(htmlfilename, 'w', encoding='utf-8') as f:
            f.write(page.text)
    except Exception as e:
        # Caching is best-effort; parsing continues even if the write fails.
        print(str(e))
    soup = BeautifulSoup(page.text, 'html.parser')
    numbers = [el.text for el in
               soup.findAll('div', {'class': 'kad_num', 'attrth': 'Кадастровый квартал'})]
    if is_start == 0:
        try:
            total_records = soup.find('div', {'class': 'total_records'}) \
                                .text.split('(страниц ')[1].split(')')[0]
        except (AttributeError, IndexError):
            # No pager element found -- assume a single page.
            total_records = 1
        # +1 so callers can use range(1, total) to walk pages 1..count.
        return numbers, int(total_records) + 1
    return numbers, None
def parserPage(URL):
    """Collect cadastral quarter numbers from every page of a district listing.

    Fetches the first page (which also reports the page count), then walks
    the remaining pages via '?page=N', sleeping 2 seconds between requests
    to stay polite to the server.

    Returns:
        list of quarter-number strings accumulated across all pages.
    """
    numbers, total_pages = req(URL, 0, 0)
    for page_index in range(1, total_pages):
        extra, _ = req(URL + '?page=' + str(page_index), page_index)
        numbers = numbers + extra
        time.sleep(2)
        # BUG FIX: the original called exit() here, terminating the whole
        # process after the first extra page and making the return below
        # unreachable for multi-page listings -- a debug leftover.
    return numbers
# Build (or reload) the region/district index, then crawl every district.
a = getKadArr()
# Resume from a previous run instead of re-scraping the index:
# with open('kadastres.json', 'r', encoding='utf-8') as f:
#     a = json.loads(f.read())
for region in a:
    for district in region['in']:
        list_kadastres = parserPage('https://kadbase.ru' + district['href'])
        # Persist results incrementally (disabled):
        # district['list'] = list_kadastres
        # with open('kadastres.json', 'w', encoding='utf-8') as f:
        #     json.dump(a, f, ensure_ascii=False, indent=4)
Add Comment
Please sign in to add a comment.