Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def article_extractor(newspaper_url, title_topic=None):
    """Download articles from a news site and return them as a DataFrame.

    Args:
        newspaper_url: URL of the news site, passed to ``newspaper.build``.
        title_topic: Optional substring filter. NOTE: despite its name, it
            is matched (case-sensitively) against each article's *URL*, not
            its title; the filter runs before any article is downloaded.
            When ``None``, every discovered article is kept.

    Returns:
        A ``pandas.DataFrame`` with one row per kept article and the columns
        ``"title"`` and ``"text"``. Both columns are present even when no
        article is kept, so callers always see the same schema.
    """
    source = newspaper.build(newspaper_url)
    urls = [article.url for article in source.articles]
    if title_topic is not None:
        urls = [url for url in urls if title_topic in url]

    titles = []
    texts = []
    for url in urls:
        article = newspaper.build_article(url)
        article.download()
        article.parse()
        titles.append(article.title)
        texts.append(article.text)

    # Explicit columns (rather than a defaultdict filled on the fly) keep
    # "title" and "text" in the frame when the loop body never runs.
    return pd.DataFrame({"title": titles, "text": texts})
def get_articles(*newspaper_url, **kwargs):
    """Collect articles from several news sites into a single DataFrame.

    Args:
        *newspaper_url: One or more news-site URLs; each is handed to
            ``article_extractor``.
        **kwargs: Only ``title_topic`` is read. It is forwarded to
            ``article_extractor`` and defaults to ``"Trump"``, which was the
            previously hard-coded value. Other keywords are ignored, as
            before.

    Returns:
        The concatenation of the per-site frames, each tagged with a
        ``"paper"`` column holding the site URL it came from. When called
        with no URLs, an empty frame with the columns ``"title"``,
        ``"text"`` and ``"paper"`` is returned.
    """
    title_topic = kwargs.get("title_topic", "Trump")

    results = []
    for url in newspaper_url:
        articles = article_extractor(url, title_topic=title_topic)
        articles["paper"] = url
        results.append(articles)

    # pd.concat raises ValueError on an empty list; return an empty frame
    # with the usual columns instead.
    if not results:
        return pd.DataFrame(columns=["title", "text", "paper"])
    return pd.concat(results)
def clean_text(string):
    """Strip boilerplate phrases and line breaks from an article's text.

    The substitutions are applied in the order listed: three fixed phrases
    are removed, every question mark becomes an apostrophe, and newlines
    are deleted.

    Args:
        string: The raw article text.

    Returns:
        The cleaned text.
    """
    # NOTE(review): the "?" -> "'" rule presumably repairs apostrophes that
    # were mangled by an encoding problem; it also rewrites genuine question
    # marks -- confirm this is intended.
    substitutions = (
        (r"SIGN UP FOR OUR NEWSLETTER", ""),
        (r"Read more here", ""),
        (r"REUTERS", ""),
        (r"\?", "'"),
        (r"\n", ""),
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
def preprocess_articles(articles):
    """Run each article through ``tcy.preprocess.preprocess_text``.

    Every text is processed with the same options: ``fix_unicode``,
    ``lowercase``, ``no_currency_symbols``, ``no_numbers`` and ``no_urls``
    are all switched on.

    Args:
        articles: An iterable of article texts.

    Returns:
        A list with the preprocessed texts, in input order.
    """
    options = {
        "fix_unicode": True,
        "lowercase": True,
        "no_currency_symbols": True,
        "no_numbers": True,
        "no_urls": True,
    }
    return [
        tcy.preprocess.preprocess_text(text, **options)
        for text in articles
    ]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement