Parser_2022Practic
umi_0193 | Jun 27th, 2022 (edited) | Python
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
BASE_URL = 'http://lib.ru/'
category = 'Детективы'        # "Detectives" -- label stored in the dataset
category_url = 'DETEKTIWY/'   # path of the category index on lib.ru

# Step 1: collect author links from the category index page.
# The markers 'Авторы' ("Authors") and 'Свидетельство о регистрации'
# ("Registration certificate") bracket the useful part of the page.
linked_data = []
response = requests.get(BASE_URL + category_url, headers=HEADERS)
result = response.text.partition('Авторы')[2]
result = result.partition('Свидетельство о регистрации')[0]
soup = BeautifulSoup(result, 'html.parser')
for a in soup.find_all('a'):
    author_link = a.get('href')
    if author_link:  # skip anchors without an href
        linked_data.append([a.text, category, author_link])

# Step 2: visit each author page and collect links to individual books.
# 'дата модиф.' ("modification date") marks the start of the book table.
book_data = []
for author, category, author_link in linked_data:
    time.sleep(2)  # be polite to the server
    try:
        response = requests.get(BASE_URL + category_url + author_link, headers=HEADERS)
        result = response.text.partition('дата модиф.')[2]
        result = result.partition('Свидетельство о регистрации')[0]
        soup = BeautifulSoup(result, 'html.parser')
        for a in soup.find_all('a'):
            text_link = a.get('href')
            # Skip missing hrefs and table-of-contents pages.
            if text_link and not re.search(r'Contents', text_link):
                book_data.append([a.text, author, category, author_link, text_link])
    except Exception:
        continue

# Step 3: download the book texts. `count` implements a crude resume:
# downloading starts only after the title "Да будет свет!" has been seen,
# so books already processed in a previous run are skipped.
book_dataset = []
count = 0
for title, author, category, author_link, text_link in book_data:
    if count == 1:
        # Checkpoint: rewrite the CSV after every 10 collected books.
        if len(book_dataset) % 10 == 0:
            df = pd.DataFrame(book_dataset, columns=['title', 'author', 'category', 'text'])
            df.to_csv('Giant_dataset_10.csv')
        if re.search(r'\.txt', text_link):
            time.sleep(7)
            try:
                url = BASE_URL + category_url + author_link + text_link
                response = requests.get(url, headers=HEADERS)
                # The book text ends at the first horizontal rule.
                result = response.text.partition('<hr noshade>')[0]
                text = BeautifulSoup(result, 'html.parser').text
                if re.search(r'503 Service Temporarily Unavailable', text):
                    print("Time to take a break")  # the server is rate-limiting us
                    print(title, author)
                    break
                book_dataset.append([title, author, category, text])
            except Exception:
                print(author, title)
                continue
        else:
            print(text_link)  # not a .txt link; skipped
    if title == "Да будет свет!":  # "Let there be light!" -- resume marker
        count = 1

# Final save.
df = pd.DataFrame(book_dataset, columns=['title', 'author', 'category', 'text'])
df.to_csv('Giant_dataset_10.csv')
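
The original paste imports langdetect but never calls it. One plausible use, sketched below under that assumption, is filtering out rows whose scraped text does not look Russian (error pages, English stubs) before the final save. Only detect(), DetectorFactory.seed, and LangDetectException are real langdetect API; the sample length and the fallback behaviour are illustrative choices, not anything the paste specifies.

from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 0  # make langdetect's results deterministic

def is_russian(text, sample_len=1000):
    """Guess the language from a prefix; treat undetectable input as non-Russian."""
    try:
        return detect(text[:sample_len]) == 'ru'
    except LangDetectException:  # raised for empty or feature-less text
        return False

# Drop rows (title, author, category, text) whose text is not Russian.
book_dataset = [row for row in book_dataset if is_russian(row[3])]

Running this just before the final DataFrame is built would keep non-Russian pages out of Giant_dataset_10.csv.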
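
The script also gives up entirely the first time lib.ru answers 503. A gentler alternative, sketched here with only the requests and time calls the script already uses, retries the same URL a few times with growing pauses; the retry count, delays, and the helper's name are arbitrary, not part of the original.

def fetch_with_backoff(url, retries=3, base_delay=30):
    """Retry a GET a few times, sleeping longer after each 503."""
    for attempt in range(retries):
        response = requests.get(url, headers=HEADERS)
        if '503 Service Temporarily Unavailable' not in response.text:
            return response
        time.sleep(base_delay * (attempt + 1))
    return response  # give up; the caller can still inspect the last response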