Yerdneass

parserPoint

May 7th, 2020
105
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.15 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. import csv
  4.  
  5. def getHtml(url):
  6.     response = requests.get(url)
  7.     return response.text
  8.  
  9. def pageData(html):
  10.     soup = BeautifulSoup(html, 'lxml')
  11.  
  12.     arts = soup.find('div', class_='post-list-container').find_all('article', class_ = 'post-list-container-item')
  13.  
  14.     for art in arts:
  15.     title = art.find('div', class_='post-list-container-item-text').find('h2').text.strip()
  16.  
  17.     url = 'https://point.md/ru/' + art.find('div', class_='post-list-container-item-text').find('h2').find('a').get('href')
  18.  
  19.     time = art.find('div', class_='post-list-container-item-text').find('div', class_='post-list-container-item-text-info').find('span').find('time').text.strip()
  20.  
  21.     photo = art.find('figure').find('img').get('src')
  22.  
  23.  
  24. data = {'title' : title,
  25.         'url' : url,
  26.         'time' : time,
  27.         'photo' : photo}
  28.  
  29. writeCsv(data)
  30.  
  31.  
  32.  
  33.  
  34. def writeCsv(data):
  35.     with open('pointm.csv', 'a') as f:
  36.     writer = csv.writer(f)
  37.     writer.writerow((data['title'],
  38.                      data['time'],
  39.                      data['photo'],
  40.                      data['url']))
  41.  
  42. def main():
  43.     URL = 'https://point.md/ru/'
  44.     html = getHtml(URL)
  45.     pageData(html)
  46.  
  47.  
  48. if __name__ == '__main__':
  49. main()
Add Comment
Please, Sign In to add comment