umi_0193

Parser_2022Practic

Jun 27th, 2022 (edited)
import re
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup
# (the original also imported traceback and langdetect.detect but never
# used them; see the langdetect sketch after the script for the likely intent)

linked_data = []
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
category = 'Детективы'        # "Detective fiction"
category_url = 'DETEKTIWY/'
url = 'http://lib.ru/' + category_url

# Keep only the author list: the markup between the "Авторы" ("Authors")
# heading and the "Свидетельство о регистрации" (registration notice) footer.
page = requests.get(url, headers=HEADERS)
result = page.text.partition('Авторы')[2]
result = result.partition('Свидетельство о регистрации')[0]
soup = BeautifulSoup(result, 'html.parser')

for a in soup.find_all('a'):
    author_link = a.get('href')
    if author_link:  # skip anchors without an href
        linked_data.append([a.text, category, author_link])

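A caveat worth flagging before the next loop: lib.ru has historically served its pages in KOI8-R, and requests guesses the encoding from the HTTP headers, so the Cyrillic markers used by the partition() calls ('Авторы', 'дата модиф.', and so on) can turn to mojibake if that guess is wrong. Below is a minimal sketch of a fetch helper that pins the encoding; the fetch_page name and the 30-second timeout are illustrative choices, not part of the original paste:

def fetch_page(url, headers, encoding='koi8-r'):
    """Fetch a lib.ru page, decoding with an explicit encoding.

    Forcing KOI8-R keeps the Cyrillic section markers intact for the
    partition() calls, regardless of what requests guesses.
    """
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()    # surface HTTP errors early
    response.encoding = encoding   # override requests' header-based guess
    return response.text
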
book_data = []
for author, category, author_link in linked_data:
    time.sleep(2)  # be polite between author-page requests
    try:
        url = 'http://lib.ru/' + category_url + author_link
        page = requests.get(url, headers=HEADERS)
        # The book list sits between the "дата модиф." ("date modified")
        # column header and the registration-notice footer.
        result = page.text.partition('дата модиф.')[2]
        result = result.partition('Свидетельство о регистрации')[0]
        soup = BeautifulSoup(result, 'html.parser')
        for a in soup.find_all('a'):
            text_link = a.get('href')
            # Skip anchors without an href (the original crashed on those and
            # discarded them via a bare except) and table-of-contents pages.
            if text_link and not re.search(r'Contents', text_link):
                book_data.append([a.text, author, category, author_link, text_link])
    except Exception:
        continue

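The fixed sleeps and the hard break on a 503 page in the loop below are a blunt rate limiter. An alternative is exponential backoff; a minimal sketch follows, where the get_with_backoff name and the retry/delay parameters are my assumptions, not from the original:

def get_with_backoff(url, headers, retries=4, base_delay=7):
    """GET a URL, backing off exponentially when the server throttles us."""
    for attempt in range(retries):
        response = requests.get(url, headers=headers, timeout=30)
        # The original script detects throttling by searching the page body,
        # so check both the status code and the text.
        if (response.status_code != 503
                and '503 Service Temporarily Unavailable' not in response.text):
            return response
        time.sleep(base_delay * 2 ** attempt)  # 7 s, 14 s, 28 s, ...
    raise RuntimeError(f'{url}: still throttled after {retries} attempts')
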
book_dataset = []
resume = False  # flips to True once the last previously saved title is seen
for title, author, category, author_link, text_link in book_data:
    if resume:
        # Checkpoint: rewrite the CSV after every 10 collected books.
        if len(book_dataset) % 10 == 0:
            df = pd.DataFrame(book_dataset,
                              columns=['title', 'author', 'category', 'text'])
            df.to_csv('Giant_dataset_10.csv')
        if text_link.endswith('.txt'):  # only plain-text editions
            time.sleep(7)  # long pause: full texts are heavy requests
            try:
                url = 'http://lib.ru/' + category_url + author_link + text_link
                page = requests.get(url, headers=HEADERS)
                # The book text ends at the first <hr noshade> separator.
                result = page.text.partition('<hr noshade>')[0]
                text = BeautifulSoup(result, 'html.parser').text
                if re.search(r'503 Service Temporarily Unavailable', text):
                    print("Пора отдыхать")  # "Time to take a break"
                    print(title, author)
                    break
                book_dataset.append([title, author, category, text])
            except Exception:
                print(author, title)
                continue
        else:
            print(text_link)  # not a .txt link; skipped
    # Resume marker: collecting restarts with the book after
    # "Да будет свет!" ("Let There Be Light!").
    if title == "Да будет свет!":
        resume = True

df = pd.DataFrame(book_dataset, columns=['title', 'author', 'category', 'text'])
df.to_csv('Giant_dataset_10.csv')
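
The paste imports langdetect.detect but never calls it; a plausible intent was to drop books whose text is not actually Russian before saving. Here is a minimal sketch of such a filter, under that assumption (the is_russian helper and the 2000-character sample size are mine):

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def is_russian(text, sample_chars=2000):
    """Best-effort language check on the opening chunk of a book."""
    try:
        return detect(text[:sample_chars]) == 'ru'
    except LangDetectException:  # empty or feature-less input
        return False

book_dataset = [row for row in book_dataset if is_russian(row[3])]  # row[3] is the text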