Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from sys import argv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def write(data, filename):
    """Save the string form of ``data`` to the file ``filename``.

    The file is opened for writing in text mode with UTF-8 encoding, so an
    existing file with the same name is overwritten.
    """
    with open(filename, mode='w', encoding='UTF-8') as out_file:
        out_file.write(str(data))
def _text_of(node, strip=False):
    """Return the text of a parsed tag, or '' when the tag was not found."""
    if node is None:
        return ''
    text = node.text
    return text.strip() if strip else text


def _safe_filename(title):
    """Turn a story title into a name that can safely be passed to open()."""
    # These characters are reserved in Windows file names, and '/' is the
    # POSIX path separator: left inside the title they make open() fail or
    # point into a directory that does not exist.
    reserved = '\\/:*?"<>|'
    cleaned = ''.join(
        '_' if ch in reserved or ord(ch) < 32 else ch
        for ch in title
    )
    # Same trimming of leading/trailing punctuation as before.
    cleaned = cleaned.strip('?.-=()!@#$%^&*_')
    # A title made only of punctuation (or a missing title) must not
    # produce the bare file name '.txt'.
    return cleaned or 'untitled'


def get_story_data(stories_links):
    """Download every story page and save its fields to a text file.

    For each URL in ``stories_links`` the page is fetched and parsed, the
    title, author, date/time, rating, comment count, tag and body text are
    collected into a dict, and the dict is written via ``write`` to
    ``<sanitised title>.txt`` in the current directory.

    An element that is absent from the page yields an empty string for
    that field instead of aborting the whole run. Network errors raised by
    ``requests`` (including the timeout) still propagate to the caller.

    Args:
        stories_links: iterable of story page URLs.
    """
    for story in stories_links:
        # A timeout keeps one unresponsive page from hanging the scraper.
        response = requests.get(story, timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')

        # The rating value sits in a <span> nested inside the rating <li>.
        rating = soup.find('li', class_='topic-rating js-vote')
        likes_node = rating.find('span') if rating is not None else None

        title = _text_of(soup.find('h2', class_='topic-title accent'), strip=True)
        author_node = soup.find('a', class_='userlogo link link-dual link-lead link-clear')
        date_part = _text_of(soup.find('span', class_='topic-date'))
        time_part = _text_of(soup.find('span', class_='topic-time'))

        data = {
            'title': title,
            'author': _text_of(author_node, strip=True),
            'time': date_part + ' ' + time_part,
            'likes': _text_of(likes_node, strip=True),
            'comments_count': _text_of(soup.find('span', id='count-comments')),
            'tag': _text_of(soup.find('a', class_='link link-lead link-blue')),
            'text': _text_of(soup.find('div', class_='topic-text'), strip=True),
        }
        write(data, _safe_filename(title) + '.txt')
def get_page_links(pages_count):
    """Collect the URLs of at most ``pages_count`` listing pages.

    The first entry is always the fixed first-page URL. Further entries are
    found by following the "next page" anchor (class ``js-paging-next-page``),
    starting from the site's home page, until either ``pages_count`` links
    have been collected or a page has no usable next-page link.

    Args:
        pages_count: maximum number of page URLs to return.

    Returns:
        A list of page URLs; empty when ``pages_count`` is zero or negative.
    """
    # The previous ``counter != pages_count`` test could never become false
    # for a count below 1, which turned such input into an unbounded crawl.
    if pages_count <= 0:
        return []

    links = ['http://sramo.org/index/page1/']
    current_url = 'http://sramo.org'
    # Fetch a page only when one more link is still wanted, so no request
    # is spent on a page whose "next" link would be ignored anyway.
    while len(links) < pages_count:
        html = requests.get(current_url, timeout=30).text
        next_anchor = BeautifulSoup(html, 'lxml').find('a', class_='js-paging-next-page')
        href = next_anchor.get('href') if next_anchor is not None else None
        if not href:
            break
        # Resolve against the page just fetched in case the href is relative;
        # an absolute href is returned unchanged by urljoin.
        current_url = urljoin(current_url, href)
        links.append(current_url)
    return links
def get_stories_links(page_links):
    """Gather the story URLs linked from each listing page.

    Every page in ``page_links`` is downloaded and parsed, and the ``href``
    of each anchor with class ``link link-lead link-clear link-dark`` is
    collected in document order.

    Args:
        page_links: iterable of listing page URLs.

    Returns:
        A list of story URLs; empty when ``page_links`` is empty.
    """
    stories_links = []
    for page_link in page_links:
        # A timeout keeps one unresponsive page from hanging the scraper.
        html = requests.get(page_link, timeout=30).text
        soup = BeautifulSoup(html, 'lxml')
        for anchor in soup.find_all('a', class_='link link-lead link-clear link-dark'):
            href = anchor.get('href')
            # An anchor without href would put None in the result, which
            # later fails when the caller tries to download it.
            if href:
                # Resolve relative links; absolute ones pass through unchanged.
                stories_links.append(urljoin(page_link, href))
    return stories_links
def main(args=None):
    """Run the scraper for the number of listing pages given on the command line.

    Args:
        args: command-line arguments without the program name. When omitted,
            ``argv[1:]`` is used. The first argument is the number of listing
            pages to scrape; any further arguments are ignored.

    Raises:
        SystemExit: with a usage message when the page count is missing,
            is not an integer, or is smaller than 1.
    """
    if args is None:
        args = argv[1:]

    usage = 'usage: PAGES_COUNT is required and must be a positive integer'
    if not args:
        raise SystemExit(usage)
    try:
        pages_count = int(args[0])
    except ValueError:
        # Show the usage text instead of a raw int() traceback.
        raise SystemExit(usage) from None
    if pages_count < 1:
        raise SystemExit(usage)

    page_links = get_page_links(pages_count)
    stories_links = get_stories_links(page_links)
    get_story_data(stories_links)
# Script entry point: start scraping only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement