vmamontov

parse html-page

Jul 9th, 2021 (edited)
165
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  # pip install beautifulsoup4 requests
# Installation instructions for Beautiful Soup are described here:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc.ru/bs4ru.html#id8
#
# Usage: python parse_html_page.py <url>
#
# Downloads the page at <url> and, for every <h3> heading after the first
# two, appends one "flow;programm;rows" line to result.csv, where
#   flow     - text of the <h3> heading,
#   programm - text of the first <h4> found in that heading's section,
#   rows     - number of <tr> elements in that section.

import csv
import sys

import requests
from bs4 import BeautifulSoup

PARSER = "html.parser"
# The first two <h3> elements of the page are ignored.
# NOTE(review): presumably they are page chrome rather than flow headings
# -- confirm against the target page.
SKIPPED_HEADINGS = 2
OUTPUT_PATH = "result.csv"
REQUEST_TIMEOUT = 30  # seconds; keeps the script from hanging on a dead host


def _section_marker(heading):
    """Return the markup of the node two siblings after *heading*.

    That node marks where the heading's section starts in the serialised
    page.  Raises ValueError when the heading has no such sibling.
    """
    node = heading.next_sibling
    if node is not None:
        node = node.next_sibling
    if node is None:
        raise ValueError(
            f"no content follows heading {heading.get_text()!r}"
        )
    return str(node)


def collect_rows(soup):
    """Return a list of ``(flow, programm, rows)`` tuples for *soup*.

    Each section runs from its own marker (see ``_section_marker``) up to
    the marker of the next heading; the last section ends at the first
    ``<div class="clearfix">`` after the last heading, or at the end of
    the document when there is no such div.

    Raises ValueError when a marker cannot be located in the serialised
    page or when a section contains no ``<h4>``.
    """
    headings = soup.find_all("h3")[SKIPPED_HEADINGS:]
    if not headings:
        return []

    # Serialise once; every section is a slice of this string.
    html = str(soup)

    # Locate section starts left to right.  Searching from just past the
    # previous start keeps sections in document order even when two
    # markers have identical markup.
    starts = []
    cursor = 0
    for heading in headings:
        position = html.find(_section_marker(heading), cursor)
        if position == -1:
            raise ValueError(
                f"cannot locate section for heading {heading.get_text()!r}"
            )
        starts.append(position)
        cursor = position + 1

    closing_div = headings[-1].find_next("div", {"class": "clearfix"})
    end = -1
    if closing_div is not None:
        end = html.find(str(closing_div), starts[-1])
    if end == -1:
        end = len(html)

    bounds = starts + [end]
    rows = []
    for heading, begin, stop in zip(headings, bounds, bounds[1:]):
        # Parse each fragment once, with the same parser as the full page.
        fragment = BeautifulSoup(html[begin:stop], PARSER)
        title = fragment.find("h4")
        if title is None:
            raise ValueError(
                f"no <h4> in section of heading {heading.get_text()!r}"
            )
        rows.append(
            (heading.get_text(), title.get_text(), len(fragment.find_all("tr")))
        )
    return rows


def main(argv=None):
    """Fetch the page named on the command line and append its rows.

    *argv* defaults to ``sys.argv[1:]`` and must hold exactly one URL.
    Returns the process exit status: 0 on success, 2 on a usage error.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 1:
        print("usage: parse_html_page.py <url>", file=sys.stderr)
        return 2

    response = requests.get(args[0], timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, PARSER)

    rows = collect_rows(soup)

    # Append mode is kept so repeated runs accumulate.  The file is opened
    # once, and csv.writer quotes fields that contain ';', quotes or
    # newlines so such values cannot break the row structure.
    with open(OUTPUT_PATH, "a", encoding="utf-8", newline="") as file:
        writer = csv.writer(file, delimiter=";", lineterminator="\n")
        writer.writerows(rows)
    return 0


if __name__ == "__main__":
    sys.exit(main())
RAW Paste Data