Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import requests
- from bs4 import BeautifulSoup
- import pandas as pd
- import re
- import time
- import traceback
- from langdetect import detect
# --- Stage 1: scrape the list of author pages for one lib.ru category. ---
# NOTE(review): "categoty" is a typo for "category", but the name is read
# by the later stages of this script, so it is kept for compatibility.
linked_data = []
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
categoty = 'Детективы'
categoty_url = 'DETEKTIWY/'
url = 'http://lib.ru/' + categoty_url

url_temp = requests.get(url, headers=HEADERS)
# Keep only the chunk between the "Авторы" (authors) heading and the site
# footer, so the <a> scan below sees author links only.
result = url_temp.text.partition('Авторы')[2]
result = result.partition('Свидетельство о регистрации')[0]
soup = BeautifulSoup(result, 'html.parser')

for a in soup.find_all('a'):
    author = a.text
    author_link = a.get('href')
    # Anchors without an href are useless downstream; skip them instead of
    # appending a None link.  (The original bare try/except hid nothing
    # here: neither .text nor .get() raises for a Tag.)
    if author_link is not None:
        linked_data.append([author, categoty, author_link])
# --- Stage 2: for every author page, collect links to individual books. ---
book_data = []
for author, categoty, author_link in linked_data:
    time.sleep(2)  # be polite to the server between author pages
    try:
        url = 'http://lib.ru/' + categoty_url + author_link
        url_temp = requests.get(url, headers=HEADERS)
        # Trim to the span between the "дата модиф." table header and the
        # site footer so only the book listing is parsed.
        result = url_temp.text.partition('дата модиф.')[2]
        result = result.partition('Свидетельство о регистрации')[0]
        soup = BeautifulSoup(result, 'html.parser')
        for a in soup.find_all('a'):
            title = a.text
            text_link = a.get('href')
            # Skip anchors without an href and table-of-contents pages.
            # (The original called a.get('href') twice and relied on a bare
            # except to swallow the TypeError raised for missing hrefs.)
            if text_link is not None and not re.search(r'Contents', text_link):
                book_data.append([title, author, categoty, author_link, text_link])
    except Exception:
        # A failed author page should not abort the whole crawl: report it
        # (using the already-imported traceback module) and move on.
        traceback.print_exc()
        continue
# --- Stage 3: download the text of each book and build the dataset. ---
book_dataset = []
# `count` becomes 1 once the resume-marker title is seen; only then does the
# periodic CSV checkpointing below kick in.  Presumably this resumes a crawl
# that previously stopped at that book — TODO confirm.
count = 0
for title, author, categoty, author_link, text_link in book_data:
    if count == 1:
        # Checkpoint: rewrite the CSV every 10 collected books.
        if len(book_dataset) % 10 == 0:
            df = pd.DataFrame(book_dataset, columns=['title', 'author', 'category', 'text'])
            df.to_csv('Giant_dataset_10.csv')
    # Only plain-text files can be downloaded.  BUG FIX: the original
    # pattern r'.txt' left the dot unescaped, so it also matched links
    # containing e.g. "Xtxt"; r'\.txt' matches the literal extension.
    if re.search(r'\.txt', text_link):
        time.sleep(7)  # long pause between downloads to avoid rate limiting
        try:
            url = 'http://lib.ru/' + categoty_url + author_link + text_link
            url_temp = requests.get(url, headers=HEADERS)
            # The book text ends at the first horizontal rule in the page.
            result = url_temp.text.partition('<hr noshade>')[0]
            soup = BeautifulSoup(result, 'html.parser')
            text = soup.text
            if re.search(r'503 Service Temporarily Unavailable', text):
                # The server is throttling us: stop the crawl entirely.
                print("Пора отдыхать")
                print(title, author)
                break
            book_dataset.append([title, author, categoty, text])
        except Exception:
            # Skip a book that failed to download, but report which one.
            print(author, title)
            continue
    else:
        # Not a .txt link — report it so non-text entries can be inspected.
        print(text_link)
    if title == "Да будет свет!":
        count = 1

# Final save of everything collected.
df = pd.DataFrame(book_dataset, columns=['title', 'author', 'category', 'text'])
df.to_csv('Giant_dataset_10.csv')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement