Advertisement
Guest User

Untitled

a guest
Jun 10th, 2018
187
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.36 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from sys import argv
  4.  
  5.  
  6. def write(data, filename):
  7.     with open(filename, 'w', encoding='UTF-8') as f:
  8.         f.write(str(data))
  9.  
  10.  
  11. def get_story_data(stories_links):
  12.     for story in stories_links:
  13.         soup = BeautifulSoup(requests.get(story).text, 'lxml')
  14.         title = soup.find('h2', class_='topic-title accent').text.strip()  # type: str
  15.         author = soup.find('a', class_='userlogo link link-dual link-lead link-clear').text.strip()
  16.         time = soup.find('span', class_='topic-date').text + ' ' + soup.find('span', class_='topic-time').text
  17.         likes = soup.find('li', class_='topic-rating js-vote').find('span').text.strip()
  18.         comments_count = soup.find('span', id='count-comments').text
  19.         tag = soup.find('a', class_='link link-lead link-blue').text
  20.         text = soup.find('div', class_='topic-text').text.strip()
  21.         data = {
  22.             'title': title,
  23.             'author': author,
  24.             'time': time,
  25.             'likes': likes,
  26.             'comments_count': comments_count,
  27.             'tag': tag,
  28.             'text': text
  29.         }
  30.         title = title.strip('?.-=()!@#$%^&*_')
  31.         write(data, title + '.txt')
  32.  
  33.  
  34. def get_page_links(pages_count):
  35.     counter = 1
  36.     links = ['http://sramo.org/index/page1/']
  37.     response = requests.get('http://sramo.org')
  38.     html = response.text
  39.     soup = BeautifulSoup(html, 'lxml')
  40.     while soup.find('a', class_='js-paging-next-page') is not None and counter != pages_count:
  41.         link = soup.find('a', class_='js-paging-next-page').get('href')
  42.         links.append(link)
  43.         counter += 1
  44.         response = requests.get(link)
  45.         html = response.text
  46.         soup = BeautifulSoup(html, 'lxml')
  47.     return links
  48.  
  49.  
  50. def get_stories_links(page_links):
  51.     stories_links = []
  52.     for page_link in page_links:
  53.         page = requests.get(page_link).text
  54.         soup = BeautifulSoup(page, 'lxml')
  55.         for data in soup.find_all('a', class_='link link-lead link-clear link-dark'):
  56.             s_link = data.get('href')
  57.             stories_links.append(s_link)
  58.     return stories_links
  59.  
  60.  
  61. def main():
  62.     pages_count = int(argv[1])
  63.     page_links = get_page_links(pages_count)
  64.     stories_links = get_stories_links(page_links)
  65.     get_story_data(stories_links)
  66.  
  67.  
  68. if __name__ == '__main__':
  69.     main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement