Advertisement
Guest User

Untitled

a guest
Jan 23rd, 2020
81
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.73 KB | None | 0 0
  1. def summary_of_article(article):
  2. sentence_scores = {}
  3.  
  4. scraped_data = urllib.request.urlopen(article)
  5.  
  6. article = scraped_data.read()
  7.  
  8. parsed_article = bs.BeautifulSoup(article,'lxml')
  9.  
  10. paragraphs = parsed_article.find_all('p')
  11.  
  12. article_text = ""
  13.  
  14. for p in paragraphs:
  15. article_text += p.text
  16.  
  17. article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
  18. article_text = re.sub(r'\s+', ' ', article_text)
  19.  
  20. formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )
  21. formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)
  22.  
  23. sentence_list = nltk.sent_tokenize(article_text)
  24.  
  25. stopwords = nltk.corpus.stopwords.words('french')
  26.  
  27. word_frequencies = {}
  28. for word in nltk.word_tokenize(formatted_article_text):
  29. if word not in stopwords:
  30. if word not in word_frequencies.keys():
  31. word_frequencies[word] = 1
  32. else:
  33. word_frequencies[word] += 1
  34. maximum_frequncy = max(word_frequencies.values())
  35.  
  36. for word in word_frequencies.keys():
  37. word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
  38. for sent in sentence_list:
  39. for word in nltk.word_tokenize(sent.lower()):
  40. if word in word_frequencies.keys():
  41. if len(sent.split(' ')) < 50:
  42. if sent not in sentence_scores.keys():
  43. sentence_scores[sent] = word_frequencies[word]
  44. else:
  45. sentence_scores[sent] += word_frequencies[word]
  46.  
  47. summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
  48.  
  49. summary = ' '.join(summary_sentences)
  50. return summary
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement