Advertisement
Guest User

literotica downloader

a guest
Sep 23rd, 2022
47
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.92 KB | None | 0 0
  1. #!/bin/python3
  2.  
  3. import bs4
  4. import requests
  5. import sys
  6. import random
  7. import pandas as pd
  8. from user_agents import *
  9.  
  10. from requests.api import request
  11.  
  12. #python3 litero_downloader.py [base_url] [starting_page] [ending_page]
  13.  
  14. #parameters
  15. BASE_URL = sys.argv[1]
  16. STARTING_PAGE = int(sys.argv[2])
  17. ENDING_PAGE = int(sys.argv[3])
  18.  
  19. #arrays
  20. link_arr = []
  21. title_arr = []
  22. descr_arr = []
  23. author_arr = []
  24. date_arr = []
  25. rating_arr = []
  26. hot_arr = []
  27. story_arr = []
  28.  
  29. def get_random_user_agent():
  30.     return USER_AGENTS[random.randint(0, len(USER_AGENTS) - 1)]
  31.  
  32.  
  33. #creating session
  34. # sess = requests.session()
  35.  
  36. #for all the pages
  37. for cur_page in range(STARTING_PAGE, ENDING_PAGE):
  38.     print('Page {}'.format(str(cur_page)))
  39.     #for each story
  40.     page_data = requests.get(
  41.         BASE_URL +
  42.         '/{}-page'.format(str(cur_page)),
  43.     headers={'User-Agent' : get_random_user_agent()})
  44.     bs0 = bs4.BeautifulSoup(page_data.text, features='lxml')
  45.     all_entries_in_page = bs0.find_all(class_='b-sl-item-r w-34t')
  46.  
  47.     for cur_entry in all_entries_in_page:
  48.         entry_link = cur_entry.find_all(class_='r-34i')[0]['href']
  49.         entry_title = cur_entry.find_all(class_='r-34i')[0].text
  50.         entry_descr = cur_entry.find_all(class_='b-sli-description p-57u')[0].text.replace('\xa0—\xa0', '') if(cur_entry.find_all(class_='b-sli-description p-57u')[0].text.replace('\xa0—\xa0', '') != '') else '-'
  51.         entry_author = cur_entry.find_all(class_='b-sli-author')[0].text.replace('by','').replace('\n','').replace('\t','').replace('\xa0','')
  52.         entry_date = cur_entry.find(class_='b-sli-date').text
  53.         #rating
  54.         entry_rating = 0.0
  55.         try:
  56.             entry_rating = float(cur_entry.find(class_='b-sli-rating').text)
  57.         except:
  58.             entry_rating = 0.0
  59.        
  60.         #is hot?
  61.         entry_hot = False
  62.         if(str(cur_entry.find(class_='b-sli-stat-icons')).find('HOT') >= 0):
  63.             entry_hot = True
  64.  
  65.         print('Getting {}'.format(entry_title))
  66.         #now going into the particular entry's page
  67.         first_page_entry_data = requests.get(
  68.             entry_link,
  69.             headers={'User-Agent' : get_random_user_agent()}
  70.         )
  71.  
  72.         story_text = ''
  73.  
  74.         number_of_part = len(bs4.BeautifulSoup(first_page_entry_data.text, features='lxml').find_all(class_='l_bJ')) - 2
  75.  
  76.         if(number_of_part > 0):
  77.         #looping of all the parts
  78.             for cur_part in range(0, number_of_part):
  79.                 print('Getting part {} / {}'.format(str(cur_part + 1), str(number_of_part)))
  80.                 try:
  81.                     part_data = requests.get(
  82.                         entry_link + '?page={}'.format(cur_part + 1),
  83.                         headers={'User-Agent' : get_random_user_agent()})
  84.                 except:
  85.                     continue
  86.                 bs1 = bs4.BeautifulSoup(part_data.text, features='lxml')
  87.                 story_text += str(bs1.find(class_='panel article aa_eQ')).replace('<div class="aa_ht"><div style="display:contents"><div class=""></div></div></div><div class="aa_hv aa_hy"><a class="aa_hz" data-action="ReportStoryToModerator-open" href="#" title="Report story"><i class="icon icon-exclamation-circle aa_hw"></i><span class="aa_hA">report</span></a></div></div>','')
  88.         else:
  89.             story_text += str(bs4.BeautifulSoup(first_page_entry_data.text, features='lxml').find(class_='panel article aa_eQ')).replace('<div class="aa_ht"><div style="display:contents"><div class=""></div></div></div><div class="aa_hv aa_hy"><a class="aa_hz" data-action="ReportStoryToModerator-open" href="#" title="Report story"><i class="icon icon-exclamation-circle aa_hw"></i><span class="aa_hA">report</span></a></div></div>','')
  90.  
  91.         #adding everything to arrays
  92.         link_arr.append(entry_link)
  93.         title_arr.append(entry_title)
  94.         descr_arr.append(entry_descr)
  95.         author_arr.append(entry_author)
  96.         date_arr.append(entry_date)
  97.         rating_arr.append(entry_rating)
  98.         hot_arr.append(entry_hot)
  99.         story_arr.append(story_text)
  100.  
  101.  
  102. df = pd.DataFrame(
  103. {
  104. 'link_arr' : link_arr,
  105. 'title_arr' : title_arr,
  106. 'descr_arr' : descr_arr,
  107. 'author_arr' : author_arr,
  108. 'date_arr' : date_arr,
  109. 'rating_arr' : rating_arr,
  110. 'hot_arr' : hot_arr,
  111. 'story_arr' : story_arr
  112. }
  113. )
  114. df = df.fillna('-')
  115. df.to_csv('out.csv')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement