Final_parse by Danila_lipatov, Oct 12th, 2022 (edited)
import pandas as pd
import numpy as np
import lxml                     # parser backend used by BeautifulSoup and pd.read_html
import requests
from bs4 import BeautifulSoup as bs

#### for the new task
### related CBR page: https://cbr.ru/banking_sector/credit/coinfo/a2020/?regnum=600
  13.  
  14. headers = []        # list for links
  15. df = pd.DataFrame()     #output df
  16. df_why = pd.DataFrame()     #df of links + text
  17. dict_lin = {}       #dict for create df in prev descrip
  18. for_iterate = []    # ликв. or отозв. mass
  19. k = 0   #value for iteration in for_iterate mass
  20.  
def Getlink(table, count, k):               # returns (list of links, iteration counter)
    for_iterate_temp = []                   # licence statuses found in this table
    headers_temp = []                       # press-release links found in this table
    for i, td in enumerate(table.find_all('td')):
        if i == count:                      # the status column repeats every 6 cells
            for_iterate_temp.append(td.text)
            count += 6

    for td in table.find_all('strong'):     # each <strong> wraps one bank's link
        if for_iterate_temp[k] == 'отозв.': # keep only banks with a revoked licence
            headers_temp.append(td.a['href'])
        k += 1
    return (headers_temp, k)


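# A minimal alternative sketch (not used below): walking the table row by row
# pairs each licence status with its link directly, instead of keeping two
# parallel lists aligned by index. It assumes the same layout Getlink relies on:
# the status sits in one <td> of the row and the bank link inside <strong><a>.
def GetlinkByRow(table):
    links = []
    for tr in table.find_all('tr'):
        strong = tr.find('strong')
        if strong is None or strong.a is None:          # header row or row without a link
            continue
        statuses = [td.text.strip() for td in tr.find_all('td')]
        if 'отозв.' in statuses:                        # keep only revoked licences
            links.append(strong.a['href'])
    return links

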
def GetDictoflinks(headers, dict_lin):      # maps each press-release URL to its text
    for link in headers:
        str_temp = ''                                   # accumulates the full release text
        url_banki = f"https://www.banki.ru{link}"       # page that holds the press release
        if url_banki in dict_lin:                       # the site lists some banks more than once
            continue
        r_ = requests.get(url_banki, timeout=15)
        soup_ = bs(r_.text, "lxml")
        dict_lin[url_banki] = []                        # register the link as a key
        for i in soup_.find_all("dd", {"class": "margin-bottom-zero"}):     # blocks with the release text
            ai = i.text
            ai = ai.replace(u'\n', u' ')                # drop line breaks
            ai = ai.replace(u'\xa0', u' ')              # drop non-breaking spaces
            str_temp += ai                              # concatenate the blocks
        dict_lin[url_banki].append(str_temp)            # store the text under its URL

    return dict_lin

def GetFullDF(df, dict_lin):                # build the final dataframe for export
    data = list(dict_lin.items())           # dict -> list of (url, texts) pairs
    an_array = np.array(data, dtype=object) # object array so pandas keeps the pairs as-is
    df_why = pd.DataFrame(an_array)         # dataframe of links + texts

    df = pd.concat([df, df_why], axis=1)    # attach links and texts to the scraped table

    return df


def DropReason(df):
    df = df.reset_index(drop=True)

    for throw in df[df['причина'] == 'ликв.'].index:        # drop rows where the reason is 'ликв.' (liquidation)
        df = df.drop(index=[throw])

    df = df.drop_duplicates(subset=['номер лицензии'])      # drop duplicate licence numbers
    df = df.reset_index(drop=True)
    return df

####### TODO: read the total number of pages from the site instead of hard-coding the range below (still open)

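# A minimal sketch for the TODO above, based on an assumption rather than a
# checked fact: the listing pages are addressed with a PAGEN_1 query parameter,
# so the largest number shown on a pager link whose href contains PAGEN_1=
# should be the total page count. Falls back to the hard-coded limit otherwise.
def GetTotalPages(url="https://www.banki.ru/banks/memory/", default_pages=2):
    resp = requests.get(url, timeout=15)
    soup_pages = bs(resp.text, "lxml")
    page_numbers = []
    for a in soup_pages.find_all("a", href=True):       # every link on the first page
        if "PAGEN_1=" in a["href"] and a.get_text(strip=True).isdigit():
            page_numbers.append(int(a.get_text(strip=True)))
    return max(page_numbers) if page_numbers else default_pages

# hypothetical usage: for g in range(1, GetTotalPages() + 1): ...
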
for g in range(1, 3):                       # pages of the "memory of banks" listing
    URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"        # page that holds the table

    r = requests.get(URL_TEMPLATE,
                     headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
                     timeout=15)
    soup = bs(r.text, "html.parser")

    df_temp = pd.read_html(URL_TEMPLATE, encoding='utf8')       # list of dataframes parsed from the page

    for i in df_temp:
        df = pd.concat([df, i], axis=0)     # stack every table from this page onto the output dataframe

    table = soup.find('table', class_="standard-table standard-table--row-highlight margin-bottom-default")
    count = 3                               # index of the first cell of the reason column
    headers += Getlink(table, count, k)[0]  # collect the press-release links

df = DropReason(df)                         # remove liquidated banks and duplicate licences

dict_lin = GetDictoflinks(headers, dict_lin)        # fetch the press-release text for every link

df = GetFullDF(df, dict_lin)                # attach the links and texts to the table

df = df.set_index('Unnamed: 0')             # drop the useless index column
df.rename(columns={0: 'link', 1: 'text'}, inplace=True)     # give the new columns readable names
df.to_csv("name.csv", index=False, header=True)     # saved as CSV because df.to_excel had some troubles

# TODO after the output CSV file is created:
# open (or create) any .xlsx file
# go to the Data tab
# choose "From Text/CSV"
# pick UTF-8 as the encoding
# load the table (a code alternative is sketched right below)

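# A minimal sketch of the code alternative mentioned above. It assumes the
# list-valued 'text' cells produced by GetFullDF are what made df.to_excel
# misbehave, so they are unwrapped first; "name.xlsx" is just an example file
# name, and openpyxl must be installed for the engine used below.
df["text"] = df["text"].apply(lambda cell: cell[0] if isinstance(cell, list) else cell)
df.to_excel("name.xlsx", index=False, engine="openpyxl")    # direct .xlsx export, no manual CSV import
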
# THAT'S ALL