web_scraper_stage5
MeowalsoMeow, Jun 21st, 2021

A stage 5 solution to a Web Scraper exercise: for a user-given number of nature.com listing pages, it downloads every article of a user-given type and saves each one as a .txt file inside a per-page Page_N directory.
import os
import string

import requests
from bs4 import BeautifulSoup


def save_article(the_article_type, cwd, url):
    """Save every article of the requested type from one listing page."""
    r = requests.get(url)
    if r.status_code == 200:
        soup = BeautifulSoup(r.content, 'html.parser')
        title_news = []  # file names saved from this page
        puncs = string.punctuation
        for article in soup.find_all('article'):
            # The type label, e.g. <span class="c-meta__type">Research Summary</span>
            article_type = article.find('span', class_='c-meta__type').text
            if article_type == the_article_type:
                title = article.find('a', {'data-track-action': 'view article'}).text
                # Build the file name: spaces become underscores, punctuation is dropped.
                name = title.strip().translate(str.maketrans(' ', '_', puncs)) + '.txt'
                print(name)
                title_news.append(name)
                article_url = f"https://www.nature.com{article.a.get('href')}"
                r2 = requests.get(article_url)
                soup2 = BeautifulSoup(r2.content, 'html.parser')
                text = soup2.find('div', class_='article-item__body').text.strip()
                with open(name, 'wb') as file:
                    file.write(text.encode('utf-8'))


def save_articles():
    pages = int(input())  # number of listing pages to scrape
    _type = input()       # article type to keep, e.g. "Research Summary"
    base_dir = os.getcwd()
    for page in range(1, pages + 1):
        folder = f'Page_{page}'
        url = (f'https://www.nature.com/nature/articles'
               f'?searchType=journalSearch&sort=PubDate&page={page}')
        os.mkdir(folder)
        os.chdir(folder)
        save_article(_type, folder, url)
        # Return to the starting directory (the original used a hard-coded
        # absolute Windows path here, which only works on the author's machine).
        os.chdir(base_dir)


save_articles()
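
The two input() calls read the number of listing pages first, then the article type exactly as it is labelled on nature.com (e.g. "Research Summary"); each saved file name is printed as the script runs. As a quick sanity check for the c-meta__type class the scraper depends on, a small standalone snippet (hypothetical, not part of the original paste) can list the type labels present on the first listing page:

import requests
from bs4 import BeautifulSoup

url = ('https://www.nature.com/nature/articles'
       '?searchType=journalSearch&sort=PubDate&page=1')
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
# Distinct values of the <span class="c-meta__type"> labels that
# save_article() compares against the requested type.
print({span.text.strip() for span in soup.find_all('span', class_='c-meta__type')})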