Guest User

Untitled

a guest
Mar 19th, 2018
260
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.13 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
"""Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm.

As described in the paper `Automatic keyword extraction from individual
documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.

Thai language support by Mr. Wannaphong Phatthiyaphaibun <wannaphong@kkumail.com>
"""
  9.  
  10. import string
  11. from collections import defaultdict, Counter
  12. from itertools import chain, groupby, product
  13.  
  14. import pythainlp
  15.  
  16.  
  17. class Rake(object):
  18.  
  19. def __init__(self, stopwords=None, punctuations=None, language='thai'):
  20. """Constructor.
  21.  
  22. :param stopwords: List of Words to be ignored for keyword extraction.
  23. :param punctuations: Punctuations to be ignored for keyword extraction.
  24. :param language: Language to be used for stopwords
  25. """
  26. # If stopwords not provided we use language stopwords by default.
  27. self.stopwords = stopwords
  28. if self.stopwords is None:
  29. self.stopwords = pythainlp.corpus.stopwords.words(language)
  30.  
  31. # If punctuations are not provided we ignore all punctuation symbols.
  32. self.punctuations = punctuations
  33. if self.punctuations is None:
  34. self.punctuations = string.punctuation
  35.  
  36. # All things which act as sentence breaks during keyword extraction.
  37. self.to_ignore = set(chain(self.stopwords, self.punctuations))
  38.  
  39. # Stuff to be extracted from the provided text.
  40. self.frequency_dist = None
  41. self.degree = None
  42. self.rank_list = None
  43. self.ranked_phrases = None
  44.  
  45. def extract_keywords_from_text(self, text):
  46. """Method to extract keywords from the text provided.
  47.  
  48. :param text: Text to extract keywords from, provided as a string.
  49. """
  50. sentences = pythainlp.tokenize.sent_tokenize(text)
  51. self.extract_keywords_from_sentences(sentences)
  52.  
  53. def extract_keywords_from_sentences(self, sentences):
  54. """Method to extract keywords from the list of sentences provided.
  55.  
  56. :param sentences: Text to extraxt keywords from, provided as a list
  57. of strings, where each string is a sentence.
  58. """
  59. phrase_list = self._generate_phrases(sentences)
  60. self._build_frequency_dist(phrase_list)
  61. self._build_word_co_occurance_graph(phrase_list)
  62. self._build_ranklist(phrase_list)
  63.  
  64. def get_ranked_phrases(self):
  65. """Method to fetch ranked keyword strings.
  66.  
  67. :return: List of strings where each string represents an extracted
  68. keyword string.
  69. """
  70. return self.ranked_phrases
  71.  
  72. def get_ranked_phrases_with_scores(self):
  73. """Method to fetch ranked keyword strings along with their scores.
  74.  
  75. :return: List of tuples where each tuple is formed of an extracted
  76. keyword string and its score. Ex: (5.68, 'Four Scoures')
  77. """
  78. return self.rank_list
  79.  
  80. def get_word_frequency_distribution(self):
  81. """Method to fetch the word frequency distribution in the given text.
  82.  
  83. :return: Dictionary (defaultdict) of the format `word -> frequency`.
  84. """
  85. return self.frequency_dist
  86.  
  87. def get_word_degrees(self):
  88. """Method to fetch the degree of words in the given text. Degree can be
  89. defined as sum of co-occurances of the word with other words in the
  90. given text.
  91.  
  92. :return: Dictionary (defaultdict) of the format `word -> degree`.
  93. """
  94. return self.degree
  95.  
  96. def _build_frequency_dist(self, phrase_list):
  97. """Builds frequency distribution of the words in the given body of text.
  98.  
  99. :param phrase_list: List of List of strings where each sublist is a
  100. collection of words which form a contender phrase.
  101. """
  102. self.frequency_dist = Counter(chain.from_iterable(phrase_list))
  103.  
  104. def _build_word_co_occurance_graph(self, phrase_list):
  105. """Builds the co-occurance graph of words in the given body of text to
  106. compute degree of each word.
  107.  
  108. :param phrase_list: List of List of strings where each sublist is a
  109. collection of words which form a contender phrase.
  110. """
  111. co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
  112. for phrase in phrase_list:
  113. # For each phrase in the phrase list, count co-occurances of the
  114. # word with other words in the phrase.
  115. #
  116. # Note: Keep the co-occurances graph as is, to help facilitate its
  117. # use in other creative ways if required later.
  118. for (word, coword) in product(phrase, phrase):
  119. co_occurance_graph[word][coword] += 1
  120. self.degree = defaultdict(lambda: 0)
  121. for key in co_occurance_graph:
  122. self.degree[key] = sum(co_occurance_graph[key].values())
  123.  
  124. def _build_ranklist(self, phrase_list):
  125. """Method to rank each contender phrase using the formula
  126.  
  127. phrase_score = sum of scores of words in the phrase.
  128. word_score = d(w)/f(w) where d is degree and f is frequency.
  129.  
  130. :param phrase_list: List of List of strings where each sublist is a
  131. collection of words which form a contender phrase.
  132. """
  133. self.rank_list = []
  134. for phrase in phrase_list:
  135. rank = 0.0
  136. for word in phrase:
  137. rank += 1.0 * self.degree[word] / self.frequency_dist[word]
  138. self.rank_list.append((rank, ' '.join(phrase)))
  139. self.rank_list.sort(reverse=True)
  140. self.ranked_phrases = [ph[1] for ph in self.rank_list]
  141.  
  142. def _generate_phrases(self, sentences):
  143. """Method to generate contender phrases given the sentences of the text
  144. document.
  145.  
  146. :param sentences: List of strings where each string represents a
  147. sentence which forms the text.
  148. :return: Set of string tuples where each tuple is a collection
  149. of words forming a contender phrase.
  150. """
  151. phrase_list = set()
  152. # Create contender phrases from sentences.
  153. for sentence in sentences:
  154. word_list = [word.lower() for word in pythainlp.word_tokenize(sentence)]
  155. phrase_list.update(self._get_phrase_list_from_words(word_list))
  156. return phrase_list
  157.  
  158. def _get_phrase_list_from_words(self, word_list):
  159. """Method to create contender phrases from the list of words that form
  160. a sentence by dropping stopwords and punctuations and grouping the left
  161. words into phrases. Ex:
  162.  
  163. Sentence: Red apples, are good in flavour.
  164. List of words: ['red', 'apples', ",", 'are', 'good', 'in', 'flavour']
  165. List after dropping punctuations and stopwords.
  166. List of words: ['red', 'apples', *, *, good, *, 'flavour']
  167. List of phrases: [('red', 'apples'), ('good',), ('flavour',)]
  168.  
  169. :param word_list: List of words which form a sentence when joined in
  170. the same order.
  171. :return: List of contender phrases that are formed after dropping
  172. stopwords and punctuations.
  173. """
  174. groups = groupby(word_list, lambda x: x not in self.to_ignore)
  175. return [tuple(group[1]) for group in groups if group[0]]
Add Comment
Please, Sign In to add comment