Advertisement
Not a member of Pastebin yet?
Sign Up — it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
# Target article: Russian Wikipedia page for the "Commissioner for the
# Protection of Entrepreneurs' Rights in Russia" (percent-encoded Cyrillic title).
URL = 'https://ru.wikipedia.org/wiki/%D0%A3%D0%BF%D0%BE%D0%BB%D0%BD%D0%BE%D0%BC%D0%BE%D1%87%D0%B5%D0%BD%D0%BD%D1%8B%D0%B9_%D0%BF%D0%BE_%D0%B7%D0%B0%D1%89%D0%B8%D1%82%D0%B5_%D0%BF%D1%80%D0%B0%D0%B2_%D0%BF%D1%80%D0%B5%D0%B4%D0%BF%D1%80%D0%B8%D0%BD%D0%B8%D0%BC%D0%B0%D1%82%D0%B5%D0%BB%D0%B5%D0%B9_%D0%B2_%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D0%B8'
# Browser-like headers so Wikipedia serves the normal desktop page.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
    'accept': '*/*'}
# Site root, prepended to the relative hrefs extracted from the infobox.
HOST = 'https://ru.wikipedia.org'
def get_dig_date(str, mode=1):
    """Parse a human-readable date string into numeric parts.

    The input may use spaces or hyphens as separators; the month may be a
    Russian genitive name ('июня'), an English name ('june'), an
    abbreviation, or a zero-padded / plain number. Month lookup is
    case-insensitive.

    :param str: date string, e.g. '8 июня 1962' or '1962-06-08'.
        NOTE(review): the parameter shadows the builtin ``str``; kept for
        backward compatibility with existing keyword callers.
    :param mode: 1 (default) for day-month-year order, 2 for year-month-day.
    :return: dict with int values under keys 'day', 'month', 'year'.
    :raises KeyError: if the month token is not recognized.
    :raises ValueError: if the day/year tokens are not integers.
    """
    months = {'января': 1, 'янв': 1, 'jan': 1, 'january': 1, '1': 1, '01': 1,
              'февраля': 2, 'фев': 2, 'feb': 2, 'february': 2, '2': 2, '02': 2,
              'марта': 3, 'march': 3, '3': 3, '03': 3,
              'апреля': 4, 'april': 4, '4': 4, '04': 4,
              'мая': 5, 'may': 5, '5': 5, '05': 5,
              'июня': 6, 'june': 6, '6': 6, '06': 6,
              'июля': 7, 'july': 7, '7': 7, '07': 7,
              'августа': 8, 'august': 8, '8': 8, '08': 8,
              'сентября': 9, 'september': 9, '9': 9, '09': 9,
              'октября': 10, 'october': 10, '10': 10,
              'ноября': 11, 'november': 11, '11': 11,
              'декабря': 12, 'december': 12, '12': 12,
              }
    # Normalize hyphenated dates ('1962-06-08') into space-separated tokens.
    parts = str.replace('-', ' ').split()
    if mode == 2:
        ind_d, ind_m, ind_y = 2, 1, 0
    else:
        ind_d, ind_m, ind_y = 0, 1, 2
    # .lower() generalizes the lookup to capitalized month names ('June');
    # all table keys are already lowercase, so existing inputs are unaffected.
    return {'day': int(parts[ind_d]),
            'month': months[parts[ind_m].lower()],
            'year': int(parts[ind_y])}
def get_html(url, params=None):
    """Fetch *url* with the module-wide browser-like HEADERS.

    :param url: absolute URL to request.
    :param params: optional query-string parameters passed to requests.
    :return: the ``requests.Response`` object (not just its text).
    """
    return requests.get(url, headers=HEADERS, params=params)
def get_content(html, mode=None):
    """Extract the ombudsman's data from the Wikipedia infobox.

    Follows the person link found in the article's infobox to the personal
    page and scrapes the birth date from its Wikidata-backed infobox cell.

    :param html: ``requests.Response`` for the list/article page.
    :param mode: unused; given a default so the existing caller, which
        passes no value, no longer raises TypeError. TODO(review): remove
        the parameter once all callers are confirmed positional-free.
    :return: dict with keys 'name', 'p_link', 'b_date', 'ef_date',
        'image_link' (all strings; dates still human-readable).
    """
    soup = BeautifulSoup(html.text, 'html.parser')
    table = soup.find('table', class_='infobox')
    image_box = table.find('td', class_='infobox-image')
    # The 'no-wikidata' span holds the person's name; its next <a> links
    # to the personal article.
    name_span = table.find('span', class_='no-wikidata')
    p_link = HOST + name_span.find_next('a').get('href')
    p_soup = BeautifulSoup(get_html(p_link).text, 'html.parser')
    p_box = p_soup.find('td', class_='plainlist')
    # Birth date: first two consecutive anchors after the wikidata-claim span.
    b_anchor = p_box.find_next('span', class_='wikidata-claim').find_next('a')
    # Effective ("in office since") date: second and third anchors after the name.
    ef_anchor = name_span.find_next('a').find_next('a')
    person = {
        'name': name_span.get_text(),
        'p_link': p_link,
        'b_date': b_anchor.get_text() + " " + b_anchor.find_next('a').get_text(),
        'ef_date': ef_anchor.get_text() + " " + ef_anchor.find_next('a').get_text(),
        'image_link': 'https:' + image_box.find('img').get('src')
    }
    return person
def parse():
    """Scrape the ombudsman article and print a structured summary dict.

    Fetches URL, extracts the person's name/links/dates via get_content,
    converts the dates to numeric form, and prints the result.
    """
    html = get_html(URL)
    # BUG FIX: the original call passed no second argument, but
    # get_content declares a required `mode` parameter -> TypeError.
    # The value is not used inside get_content; 1 is passed for safety.
    person = get_content(html, 1)
    # Split the full name once instead of three times.
    first, middle, last = person['name'].split()[:3]
    name = {'first_name': first,
            'middle_name': middle,
            'last_name': last
            }
    content = {
        'name': name,
        'p_link': person['p_link'],
        'b_date': get_dig_date(person['b_date']),
        # Strip the trailing 'года' ("of the year") before numeric parsing.
        'ef_date': get_dig_date(person['ef_date'].replace('года', '')),
        'image_link': person['image_link']
    }
    print(content)
- parse()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement