Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def save_article(the_article_type, cwd, url):
    """Download every article of a given type from a Nature listing page.

    For each ``<article>`` element on the listing page whose type label
    matches *the_article_type*, fetch the article page and write its body
    text to a ``.txt`` file (named from the article title) in the current
    working directory.

    Args:
        the_article_type: Exact text of the type span to match,
            e.g. ``"Research Summary"``.
        cwd: Name of the directory being filled (informational; the
            caller is expected to have already chdir'ed into it).
        url: Nature listing-page URL to scrape.

    Returns:
        List of filenames written. Empty if the listing request failed.
    """
    r = requests.get(url)
    if r.status_code != 200:
        return []
    soup = BeautifulSoup(r.content, 'html.parser')
    saved = []
    puncs = string.punctuation
    for article in soup.find_all('article'):
        type_span = article.find('span', class_='c-meta__type')
        # Skip entries with no type label instead of crashing on None.
        if type_span is None or type_span.text != the_article_type:
            continue
        title_link = article.find('a', {'data-track-action': "view article"})
        if title_link is None:
            continue
        # Build a filesystem-safe filename: punctuation dropped,
        # spaces turned into underscores.
        name = title_link.text.strip(' ').translate(
            str.maketrans(" ", "_", puncs)) + '.txt'
        article_url = f"https://www.nature.com{article.a.get('href')}"
        r2 = requests.get(article_url)
        if r2.status_code != 200:
            continue  # best effort: skip articles that fail to download
        soup2 = BeautifulSoup(r2.content, 'html.parser')
        body = soup2.find('div', class_='article-item__body')
        if body is None:
            continue  # page layout differs; nothing to save
        # Context manager guarantees the file is closed even on error
        # (the original used a bare open()/close() pair).
        with open(name, 'wb') as fh:
            fh.write(body.text.strip().encode('utf-8'))
        saved.append(name)
    return saved
def save_articles():
    """Interactively scrape multiple Nature listing pages.

    Reads the number of pages and the article type from stdin, then for
    each page creates a ``Page_N`` directory, changes into it, and saves
    all matching articles via ``save_article``. The original working
    directory is always restored, even if a page fails.
    """
    pages = int(input())
    article_type = input()
    # Remember the starting directory instead of relying on the original
    # hard-coded, machine-specific absolute path.
    root = os.getcwd()
    for page in range(1, pages + 1):
        page_dir = f'Page_{page}'
        url = ('https://www.nature.com/nature/articles'
               f'?searchType=journalSearch&sort=PubDate&page={page}')
        # exist_ok avoids the FileExistsError os.mkdir raised on re-runs.
        os.makedirs(page_dir, exist_ok=True)
        os.chdir(page_dir)
        try:
            save_article(article_type, page_dir, url)
        finally:
            # Restore the cwd even if the scrape raises, so subsequent
            # pages are created as siblings, not nested.
            os.chdir(root)
if __name__ == '__main__':
    # Only run the interactive scraper when executed as a script,
    # not when this module is imported.
    save_articles()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement