Parser
by dim4942, Aug 16th, 2019

import requests
import urllib.request
from bs4 import BeautifulSoup as bs

# Leftover helper: download a URL and dump the raw response into a file.
# It is shadowed by the articleParse defined further down and never called.
def articleParse(url, newFileName):
    data = urllib.request.urlopen(url).read()
    f = open(newFileName, "wb")
    f.write(data)
    f.close()

headers = {'accept': '*/*',
           'user-agent': 'Mozilla/5.0 (X11; Linux x86_64 ...) Gecko/20100101 Firefox/60.0'}
base_url = 'https://habr.com/ru/all/top100/page'
out = 'Lib.txt'
# os.makedirs(newpath)
articleNum = 0


# Notes on where the data lives in the page markup:
# document.getElementsByClassName('post__title-text') - the article title
# document.getElementsByClassName('post__time') - publication date/time in the data-time_published attribute
# document.getElementsByClassName('inline-list__item inline-list__item_hub') - the hub tags (innerText)
# document.getElementsByClassName('post__text post__text-html js-mediator-article') - the body text of the article

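# A rough sketch (not in the original paste): the last two class names noted above are
# never used by the script, but they could be pulled out of an already parsed page like
# this. The helper name and the find_all() lookups are assumptions, not confirmed Habr markup.
def extractExtras(soup):
    hubs = [hub.get_text(strip=True)
            for hub in soup.find_all(attrs={'class': 'inline-list__item_hub'})]
    body = soup.find_all('div', attrs={'class': 'post__text'})[0].text
    return hubs, body
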
def articleParse(url, headers, session, library):
    global articleNum
    request = session.get(url, headers=headers)
    if request.status_code == 200:
        soup = bs(request.content, 'html.parser')
        title = soup.find_all('span', attrs={'class': 'post__title-text'})[0].text
        time = soup.find_all('span', attrs={'class': 'post__time'})[0]['data-time_published']
        # append a separator plus the extracted fields to the common library file
        library.write('---\n')
        library.write(title + '\n' + time + '\n')
        # each article also gets its own numbered file
        article_file = open(str(articleNum) + '.txt', 'w', encoding='utf-8')
        article_file.write(title + '\n' + time + '\n')
        article_file.close()
    articleNum = articleNum + 1


def habrPageParse(base_url, headers, session, pageNum, library):
    request = session.get(base_url + str(pageNum) + '/', headers=headers)
    if request.status_code == 200:
        soup = bs(request.content, 'html.parser')
        links = soup.find_all('a', attrs={'class': 'post__title_link'})
        for link in links:
            articleParse(link['href'], headers, session, library)
        return 0
    else:
        return 1


def habrParse(base_url, headers):
    library = open(out, 'w', encoding='utf-8')
    session = requests.session()
    page = 0
    while True:
        page = page + 1
        # stop as soon as a page fails to load (non-200 response)
        if habrPageParse(base_url, headers, session, page, library) != 0:
            break
    library.close()


habrParse(base_url, headers)
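
# A quick sketch, not part of the original paste: reading Lib.txt back by splitting on the
# '---' separator that articleParse writes, assuming exactly two lines (title, time) per record.
with open('Lib.txt', encoding='utf-8') as f:
    blocks = [b.strip() for b in f.read().split('---\n') if b.strip()]
for block in blocks:
    title, published = block.split('\n')[:2]
    print(published, title)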