Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
- #!/bin/python3
- import bs4
- import requests
- import sys
- import random
- import pandas as pd
- from user_agents import *
- from requests.api import request
- #python3 litero_downloader.py [base_url] [starting_page] [ending_page]
# --- Command-line parameters ---
# Usage: python3 litero_downloader.py [base_url] [starting_page] [ending_page]
BASE_URL = sys.argv[1]
STARTING_PAGE = int(sys.argv[2])
ENDING_PAGE = int(sys.argv[3])

# --- Parallel accumulators: one list per output column, one entry per story ---
(link_arr, title_arr, descr_arr, author_arr,
 date_arr, rating_arr, hot_arr, story_arr) = ([] for _ in range(8))
def get_random_user_agent():
    """Return one User-Agent string picked uniformly at random.

    USER_AGENTS is supplied by the star-import from ``user_agents`` at the
    top of the file.
    """
    # random.choice is the idiomatic equivalent of
    # USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)].
    return random.choice(USER_AGENTS)
# Markup of the embedded "report story" widget; it is stripped out of every
# captured story body.  (Hoisted once — byte-identical to the inline literal
# it replaces.)
_REPORT_WIDGET = (
    '<div class="aa_ht"><div style="display:contents"><div class=""></div>'
    '</div></div><div class="aa_hv aa_hy"><a class="aa_hz" '
    'data-action="ReportStoryToModerator-open" href="#" title="Report story">'
    '<i class="icon icon-exclamation-circle aa_hw"></i>'
    '<span class="aa_hA">report</span></a></div></div>'
)

# Walk every listing page.  NOTE(review): range() excludes ENDING_PAGE
# itself — presumably intentional, but confirm against the intended CLI
# semantics.
for cur_page in range(STARTING_PAGE, ENDING_PAGE):
    print('Page {}'.format(cur_page))
    page_data = requests.get(
        BASE_URL + '/{}-page'.format(cur_page),
        headers={'User-Agent': get_random_user_agent()})
    bs0 = bs4.BeautifulSoup(page_data.text, features='lxml')

    # One listing item per story on the page.
    for cur_entry in bs0.find_all(class_='b-sl-item-r w-34t'):
        # Link and title live on the same anchor — look it up once.
        anchor = cur_entry.find(class_='r-34i')
        entry_link = anchor['href']
        entry_title = anchor.text

        # Description: drop the leading "nbsp—nbsp" separator; '-' if empty.
        descr = cur_entry.find(class_='b-sli-description p-57u').text.replace('\xa0—\xa0', '')
        entry_descr = descr if descr != '' else '-'

        entry_author = (cur_entry.find(class_='b-sli-author').text
                        .replace('by', '').replace('\n', '')
                        .replace('\t', '').replace('\xa0', ''))
        entry_date = cur_entry.find(class_='b-sli-date').text

        # Rating may be absent or non-numeric; default to 0.0 in that case.
        try:
            entry_rating = float(cur_entry.find(class_='b-sli-rating').text)
        except (AttributeError, TypeError, ValueError):
            entry_rating = 0.0

        # "HOT" badge flag (str(None) == 'None' keeps the missing case False).
        entry_hot = 'HOT' in str(cur_entry.find(class_='b-sli-stat-icons'))

        print('Getting {}'.format(entry_title))

        # Fetch the story's first page once; the pager link count tells us
        # how many parts it has (two of the l_bJ links are prev/next).
        first_page_entry_data = requests.get(
            entry_link,
            headers={'User-Agent': get_random_user_agent()})
        first_soup = bs4.BeautifulSoup(first_page_entry_data.text, features='lxml')
        number_of_part = len(first_soup.find_all(class_='l_bJ')) - 2

        story_text = ''
        if number_of_part > 0:
            # Multi-part story: fetch each ?page=K part in order.
            for cur_part in range(number_of_part):
                print('Getting part {} / {}'.format(cur_part + 1, number_of_part))
                try:
                    part_data = requests.get(
                        entry_link + '?page={}'.format(cur_part + 1),
                        headers={'User-Agent': get_random_user_agent()})
                except requests.RequestException:
                    # Best-effort: skip a part we cannot download.
                    continue
                bs1 = bs4.BeautifulSoup(part_data.text, features='lxml')
                story_text += str(bs1.find(class_='panel article aa_eQ')).replace(_REPORT_WIDGET, '')
        else:
            # Single-part story: reuse the soup we already parsed.
            story_text += str(first_soup.find(class_='panel article aa_eQ')).replace(_REPORT_WIDGET, '')

        # Append one value per column, keeping the lists in lockstep.
        link_arr.append(entry_link)
        title_arr.append(entry_title)
        descr_arr.append(entry_descr)
        author_arr.append(entry_author)
        date_arr.append(entry_date)
        rating_arr.append(entry_rating)
        hot_arr.append(entry_hot)
        story_arr.append(story_text)
# Assemble one DataFrame column per accumulator, blank-fill any missing
# cells with '-', and write the result out as out.csv.
columns = {
    'link_arr': link_arr,
    'title_arr': title_arr,
    'descr_arr': descr_arr,
    'author_arr': author_arr,
    'date_arr': date_arr,
    'rating_arr': rating_arr,
    'hot_arr': hot_arr,
    'story_arr': story_arr,
}
df = pd.DataFrame(columns).fillna('-')
df.to_csv('out.csv')
Advertisement
Add Comment
Please sign in to add a comment.
Advertisement