umi_0193

Parser_2022Practic

Jun 27th, 2022 (edited)
import re
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup
# (the original also imported traceback and langdetect.detect but never
# used them; see the langdetect sketch after the script for the likely intent)

linked_data = []
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
category = 'Детективы'        # "Detective fiction"
category_url = 'DETEKTIWY/'
url = 'http://lib.ru/' + category_url

# Keep only the author list: the markup between the "Авторы" ("Authors")
# heading and the "Свидетельство о регистрации" (registration notice) footer.
page = requests.get(url, headers=HEADERS)
result = page.text.partition('Авторы')[2]
result = result.partition('Свидетельство о регистрации')[0]
soup = BeautifulSoup(result, 'html.parser')

for a in soup.find_all('a'):
    author_link = a.get('href')
    if author_link:  # skip anchors without an href
        linked_data.append([a.text, category, author_link])

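A caveat worth flagging before the next loop: lib.ru has historically served its pages in KOI8-R, and requests guesses the encoding from the HTTP headers, so the Cyrillic markers used by the partition() calls ('Авторы', 'дата модиф.', and so on) can turn to mojibake if that guess is wrong. Below is a minimal sketch of a fetch helper that pins the encoding; the fetch_page name and the 30-second timeout are illustrative choices, not part of the original paste:

def fetch_page(url, headers, encoding='koi8-r'):
    """Fetch a lib.ru page, decoding with an explicit encoding.

    Forcing KOI8-R keeps the Cyrillic section markers intact for the
    partition() calls, regardless of what requests guesses.
    """
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()    # surface HTTP errors early
    response.encoding = encoding   # override requests' header-based guess
    return response.text
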
book_data = []
for author, category, author_link in linked_data:
    time.sleep(2)  # be polite between author-page requests
    try:
        url = 'http://lib.ru/' + category_url + author_link
        page = requests.get(url, headers=HEADERS)
        # The book list sits between the "дата модиф." ("date modified")
        # column header and the registration-notice footer.
        result = page.text.partition('дата модиф.')[2]
        result = result.partition('Свидетельство о регистрации')[0]
        soup = BeautifulSoup(result, 'html.parser')
        for a in soup.find_all('a'):
            text_link = a.get('href')
            # Skip anchors without an href (the original crashed on those and
            # discarded them via a bare except) and table-of-contents pages.
            if text_link and not re.search(r'Contents', text_link):
                book_data.append([a.text, author, category, author_link, text_link])
    except Exception:
        continue

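The fixed sleeps and the hard break on a 503 page in the loop below are a blunt rate limiter. An alternative is exponential backoff; a minimal sketch follows, where the get_with_backoff name and the retry/delay parameters are my assumptions, not from the original:

def get_with_backoff(url, headers, retries=4, base_delay=7):
    """GET a URL, backing off exponentially when the server throttles us."""
    for attempt in range(retries):
        response = requests.get(url, headers=headers, timeout=30)
        # The original script detects throttling by searching the page body,
        # so check both the status code and the text.
        if (response.status_code != 503
                and '503 Service Temporarily Unavailable' not in response.text):
            return response
        time.sleep(base_delay * 2 ** attempt)  # 7 s, 14 s, 28 s, ...
    raise RuntimeError(f'{url}: still throttled after {retries} attempts')
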
book_dataset = []
resume = False  # flips to True once the last previously saved title is seen
for title, author, category, author_link, text_link in book_data:
    if resume:
        # Checkpoint: rewrite the CSV after every 10 collected books.
        if len(book_dataset) % 10 == 0:
            df = pd.DataFrame(book_dataset,
                              columns=['title', 'author', 'category', 'text'])
            df.to_csv('Giant_dataset_10.csv')
        if text_link.endswith('.txt'):  # only plain-text editions
            time.sleep(7)  # long pause: full texts are heavy requests
            try:
                url = 'http://lib.ru/' + category_url + author_link + text_link
                page = requests.get(url, headers=HEADERS)
                # The book text ends at the first <hr noshade> separator.
                result = page.text.partition('<hr noshade>')[0]
                text = BeautifulSoup(result, 'html.parser').text
                if re.search(r'503 Service Temporarily Unavailable', text):
                    print("Пора отдыхать")  # "Time to take a break"
                    print(title, author)
                    break
                book_dataset.append([title, author, category, text])
            except Exception:
                print(author, title)
                continue
        else:
            print(text_link)  # not a .txt link; skipped
    # Resume marker: collecting restarts with the book after
    # "Да будет свет!" ("Let There Be Light!").
    if title == "Да будет свет!":
        resume = True

df = pd.DataFrame(book_dataset, columns=['title', 'author', 'category', 'text'])
df.to_csv('Giant_dataset_10.csv')
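
The paste imports langdetect.detect but never calls it; a plausible intent was to drop books whose text is not actually Russian before saving. Here is a minimal sketch of such a filter, under that assumption (the is_russian helper and the 2000-character sample size are mine):

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def is_russian(text, sample_chars=2000):
    """Best-effort language check on the opening chunk of a book."""
    try:
        return detect(text[:sample_chars]) == 'ru'
    except LangDetectException:  # empty or feature-less input
        return False

book_dataset = [row for row in book_dataset if is_russian(row[3])]  # row[3] is the text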