Advertisement
Guest User

Untitled

a guest
Apr 23rd, 2017
59
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.53 KB | None | 0 0
  1. def article_extractor(newspaper_url, title_topic=None):
  2.  
  3. dd = defaultdict(list)
  4.  
  5. source = newspaper.build(newspaper_url)
  6. arts = [i.url for i in source.articles]
  7. if title_topic is None:
  8. relevant_arts = [i for i in arts]
  9. else:
  10. relevant_arts = [i for i in arts if title_topic in i]
  11.  
  12. for i in relevant_arts:
  13. art = newspaper.build_article(i)
  14. art.download()
  15. art.parse()
  16. dd["title"].append(art.title)
  17. dd["text"].append(art.text)
  18.  
  19. return pd.DataFrame.from_dict(dd)
  20.  
  21. def get_articles(*newspaper_url, **kwargs):
  22.  
  23. results = []
  24. for url in newspaper_url:
  25. articles = article_extractor(url, title_topic="Trump")
  26. articles["paper"] = url
  27. results.append(articles)
  28. return pd.concat(results)
  29.  
  30. def clean_text(string):
  31.  
  32. string = re.sub(r"SIGN UP FOR OUR NEWSLETTER", "", string)
  33. string = re.sub(r"Read more here", "", string)
  34. string = re.sub(r"REUTERS", "", string)
  35. string = re.sub(r"\?", "'", string)
  36. string = re.sub(r"\n", "", string)
  37.  
  38. return string
  39.  
  40. def preprocess_articles(articles):
  41.  
  42. clean_arts = []
  43. for art in articles:
  44. clean_art = tcy.preprocess.preprocess_text(art,
  45. fix_unicode=True,
  46. lowercase=True,
  47. no_currency_symbols=True,
  48. no_numbers=True,
  49. no_urls=True)
  50. clean_arts.append(clean_art)
  51. return clean_arts
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement