Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup as bs
def articleParse(url, newFileName):
    """Download the resource at ``url`` and save its raw bytes to a file.

    Note: a later definition in this file reuses the name ``articleParse``
    with a different signature and replaces this one at import time.

    Args:
        url: Address of the resource to fetch; any scheme understood by
            ``urllib.request.urlopen`` is accepted.
        newFileName: Path of the file to create (or overwrite) with the
            downloaded bytes.

    Raises:
        urllib.error.URLError: If the resource cannot be fetched.
        OSError: If the destination file cannot be written.
    """
    # Fetch first so a failed download does not leave an empty output file.
    with urllib.request.urlopen(url) as response:
        data = response.read()
    # Binary mode: the payload is written exactly as received.
    with open(newFileName, "wb") as f:
        f.write(data)
# HTTP headers sent with every request made by the crawler.
headers = {
    'accept': '*/*',
    'user-agent': 'Mozilla/5.0(X11;Linux x86_64...)Geco/20100101 Firefox/60.0',
}
# Listing URL prefix; the page number and a trailing '/' are appended to it.
base_url = 'https://habr.com/ru/all/top100/page'
# Path of the shared output file opened by habrParse.
out = 'Lib.txt'
# Disabled in the original (``newpath`` is not defined anywhere visible):
# os.makedirs(newpath)
# Running count of parsed articles; also used to name per-article files.
articleNum = 0
# Page elements of interest (CSS class names on an article page):
#   'post__title-text' - the title (handled)
#   'post__time' - publication date and time, stored in the
#       data-time_published attribute (handled)
#   'inline-list__item inline-list__item_hub' - tags (element innerText)
#   'post__text post__text-html js-mediator-article' - the article text itself
def articleParse(url, headers, session, library):
    """Fetch one article page and record its title and publication time.

    On an HTTP 200 response the page is parsed, a ``'---'`` separator line
    is written to ``library``, and a new file named ``<articleNum>.txt`` is
    created holding the article title and the value of the
    ``data-time_published`` attribute, one per line. The module-level
    counter ``articleNum`` is then incremented.

    Pages that do not answer with status 200, or that lack the expected
    title/time elements, are skipped without touching any file.

    Args:
        url: Address of the article page.
        headers: HTTP headers passed to ``session.get``.
        session: Object exposing ``get(url, headers=...)`` (the caller in
            this file passes a ``requests`` session).
        library: Writable text file that receives the separator line.

    Returns:
        None.
    """
    global articleNum
    response = session.get(url, headers=headers)
    if response.status_code != 200:
        return

    soup = bs(response.content, 'html.parser')
    title_tag = soup.find('span', attrs={'class': 'post__title-text'})
    time_tag = soup.find('span', attrs={'class': 'post__time'})
    if title_tag is None or time_tag is None:
        # Layout differs from what the parser expects; skip instead of
        # aborting the whole crawl with an IndexError.
        return

    title = title_tag.text
    published = time_tag.get('data-time_published', '')

    library.write('---\n')
    # str() is required: articleNum is an int and cannot be concatenated
    # with '.txt' directly. The shared ``library`` handle is left untouched
    # so the caller can keep writing to it and close it.
    with open(str(articleNum) + '.txt', 'w', encoding='utf-8') as article_file:
        article_file.write(title + '\n')
        article_file.write(published + '\n')
    articleNum = articleNum + 1
def habrPageParse(base_url, headers, session, pageNum, library):
    """Fetch one listing page and parse every article it links to.

    The page URL is ``base_url`` followed by ``pageNum`` and a trailing
    ``'/'``. Each ``<a class="post__title_link">`` on the page is passed to
    ``articleParse`` via its ``href``.

    Args:
        base_url: Listing URL prefix to which the page number is appended.
        headers: HTTP headers passed to ``session.get``.
        session: Object exposing ``get(url, headers=...)``.
        pageNum: Page number (int or str).
        library: Writable text file forwarded to ``articleParse``.

    Returns:
        0 if the listing page answered with HTTP 200, otherwise 1.
    """
    # f-string formatting accepts an int page number; plain ``+`` would
    # raise TypeError when concatenating str and int.
    page_url = f"{base_url}{pageNum}/"
    response = session.get(page_url, headers=headers)
    if response.status_code != 200:
        return 1

    soup = bs(response.content, 'html.parser')
    for link in soup.find_all('a', attrs={'class': 'post__title_link'}):
        articleParse(link['href'], headers, session, library)
    return 0
def habrParse(base_url, headers):
    """Crawl listing pages 1, 2, 3, ... until one fails to load.

    Opens the shared output file named by the module-level ``out`` and a
    ``requests`` session, then calls ``habrPageParse`` for consecutive page
    numbers starting at 1. The loop stops at the first page for which
    ``habrPageParse`` returns a non-zero value.

    Args:
        base_url: Listing URL prefix to which the page number is appended.
        headers: HTTP headers sent with every request.
    """
    # ``with`` guarantees both the file and the session are closed even if
    # parsing a page raises.
    with open(out, 'w', encoding='utf-8') as library, requests.session() as session:
        page = 0
        while True:
            page += 1
            if habrPageParse(base_url, headers, session, page, library) != 0:
                break
# Script entry point: start the crawl with the module-level settings.
habrParse(base_url, headers)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement