Danila_lipatov

parser_names

import pandas as pd
import numpy as np
import lxml                      # backend parser used by pandas.read_html / BeautifulSoup
import openpyxl as pxl           # engine used by DataFrame.to_excel
from datetime import datetime as dt
import requests
from bs4 import BeautifulSoup as bs
from unicodedata import normalize

headers = []
df = pd.DataFrame()
df_why = pd.DataFrame()
dict_lin = {}
indexes = []
for_iterate = []
####### TODO: UNDERSTAND HOW TO PARSE WEBPAGE AND GET TOTAL COUNT OF PAGES
"""URL_TEMPLATE = f"https://www.banki.ru/banks/memory/"
r = requests.get(URL_TEMPLATE)

soup = bs(r.text, "html.parser")

vacancies_names = soup.find('div', class_= "layout-wrapper padding-top-default bg-white position-relative")
tax = ""
for i in vacancies_names.find('div'):
    print(i.text)
print(tax)
"""
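# Sketch for the TODO above: one possible way to discover the total page count at
# run time instead of hard-coding range(1, 21). The assumption that the pager renders
# page numbers as plain numeric <a> labels is mine and may not match banki.ru's
# actual markup, so treat this as a starting point only. (Defined here but not called.)
def guess_total_pages(first_page_html):
    pager_soup = bs(first_page_html, "html.parser")
    page_numbers = [int(a.get_text(strip=True))
                    for a in pager_soup.find_all("a")
                    if a.get_text(strip=True).isdigit()]
    return max(page_numbers) if page_numbers else 1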
k = 0  # running row index into for_iterate across all pages
for g in range(1, 21):
    URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"

    r = requests.get(URL_TEMPLATE,
                     headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
                     timeout=15)
    #print(r.content)  # noisy debug output, left disabled
    soup = bs(r.text, "html.parser")

    # read_html downloads the page a second time; the same table could be parsed from r.text
    df_temp = pd.read_html(URL_TEMPLATE, encoding='utf8')

    for i in df_temp:
        df = pd.concat([df, i], axis=0)
    #df = i.drop(index=[i[i['причина'] == 'ликв.'].index])

    vacancies_names = soup.find('table', class_="standard-table standard-table--row-highlight margin-bottom-default")

    # every 6th <td> starting at index 3 holds the closure reason
    # ('отозв.' = licence revoked, 'ликв.' = liquidated)
    count = 3
    for i, td in enumerate(vacancies_names.find_all('td')):
        if i == count:
            for_iterate.append(td.text)
            count += 6

    # each <strong> wraps the bank-name link; keep only banks whose licence was revoked
    for td in vacancies_names.find_all('strong'):
        if for_iterate[k] == 'отозв.':
            headers.append(td.a['href'])
        k += 1
stop = 0  # breakpoint anchor


df = df.reset_index(drop=True)

# keep only revoked licences: drop rows whose 'причина' (reason) is 'ликв.' (liquidated),
# then de-duplicate by licence number
df = df[df['причина'] != 'ликв.']

df = df.drop_duplicates(subset=['номер лицензии'])
df = df.reset_index(drop=True)

# visit each revoked bank's detail page and collect every <dd class='margin-bottom-zero'> text
for link in headers:
    last = []
    url_banki = f"https://www.banki.ru{link}"
    r_ = requests.get(url_banki)
    soup_ = bs(r_.text, "lxml")
    if url_banki not in dict_lin:
        dict_lin[url_banki] = []
    for i in soup_.find_all('dd', class_='margin-bottom-zero'):
        print(i.text, last)
        # i.text still contains '\n' and '\xa0'; see the clean_dd_text sketch below
        last.append(i.text)
    dict_lin[url_banki] = last
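
# Sketch of a cleanup helper for the <dd> texts collected above (not wired into the
# loop): it uses the `normalize` import from the top of the file; NFKC normalization
# folds non-breaking spaces ('\xa0') into ordinary spaces, and newlines are replaced
# by hand. Apply it to each element of `last` if cleaner text is needed.
def clean_dd_text(raw):
    return normalize('NFKC', raw).replace('\n', ' ').strip()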


# flatten the {detail-page URL: [dd texts]} dict into a two-column frame
data = list(dict_lin.items())
an_array = np.array(data, dtype=object)
print(an_array)
df_why = pd.DataFrame(an_array)
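# Hedged aside: the same two-column frame can also be built straight from the dict,
# without the numpy detour; the column names here are my own, not from the original.
df_why_named = pd.DataFrame(list(dict_lin.items()), columns=['link', 'details'])  # unused below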

## save to xlsx file

#filepath = 'my_excel_file.xlsx'

#df.to_excel(filepath, index=False)
#df_why.to_excel("somth_6.xlsx", sheet_name='Sheet1', index=False, header=True)

# join the memory-book table with the per-bank details and write the result to Excel
df = pd.concat([df, df_why], axis=1)
df.to_excel("somth_12.xlsx", sheet_name='Sheet1', index=False, header=True)
#print(df)


# earlier exploratory code, kept for reference (disabled)
"""
URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"
r = requests.get(URL_TEMPLATE)

soup = bs(r.text, "html.parser")

vacancies_names = soup.find('table', class_="standard-table standard-table--row-highlight margin-bottom-default")
#vacancies_info = soup.find('th', class_="th-sortable", title="дата отзыва")
#print(vacancies_names.text)
g = 1
for i in vacancies_names.find_all('td'):
    #title = i.text
    #print(title)
    #print(i.text)
    headers.append(i.text)
    g += 1
    #headers.append(title)

dates_count = 4
count = 0
for i in range(len(headers)):
    dates_count += 6
    if dt.strptime(headers[dates_count], '%d.%m.%Y') > dt.strptime("01.01.2005", '%d.%m.%Y'):
        print(headers[dates_count])
    count += 1
    print(count)
"""

"""vacancies_info = soup.find_all('tr', class_="standard-table standard-table--row-highlight margin-bottom-default")

for info in vacancies_info:
    print(info)"""