Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- """Implementation of Rapid Automatic Keyword Extraction algorithm.
- As described in the paper `Automatic keyword extraction from individual
- documents` by Stuart Rose, Dave Engel, Nick Cramer and Wendy Cowley.
- Thai language support by Mr. Wannaphong Phatthiyaphaibun <wannaphong@kkumail.com>
- """
- import string
- from collections import defaultdict, Counter
- from itertools import chain, groupby, product
- import pythainlp
class Rake(object):
    """Rapid Automatic Keyword Extraction (RAKE) keyword extractor.

    Text is split into sentences and words with PyThaiNLP, stopwords and
    punctuation act as phrase delimiters, and every remaining run of words
    (a "contender phrase") is scored as the sum of ``degree / frequency`` of
    its words.

    Typical usage: call :meth:`extract_keywords_from_text` (or
    :meth:`extract_keywords_from_sentences`) once, then read the results via
    the ``get_*`` methods. Until an extraction has run, the ``get_*`` methods
    return ``None``.
    """

    def __init__(self, stopwords=None, punctuations=None, language='thai'):
        """Constructor.

        :param stopwords: Iterable of words to be ignored for keyword
            extraction. When ``None``, the stopword list is fetched with
            ``pythainlp.corpus.stopwords.words(language)``.
        :param punctuations: Iterable of punctuation tokens to be ignored for
            keyword extraction. A plain string is iterated character by
            character. When ``None``, ``string.punctuation`` is used.
        :param language: Language name handed to the stopword corpus lookup;
            only used when ``stopwords`` is ``None``.
        """
        # If stopwords are not provided we use the language stopwords.
        self.stopwords = stopwords
        if self.stopwords is None:
            # NOTE(review): ``pythainlp.corpus.stopwords`` looks like the
            # legacy PyThaiNLP corpus API -- confirm that it exists in the
            # installed release.
            self.stopwords = pythainlp.corpus.stopwords.words(language)

        # If punctuations are not provided we ignore all punctuation symbols.
        self.punctuations = punctuations
        if self.punctuations is None:
            self.punctuations = string.punctuation

        # All things which act as phrase breaks during keyword extraction.
        # Tokens are lower-cased before they are compared against this set
        # (see ``_generate_phrases``), so the entries are lower-cased as well;
        # otherwise a stopword containing uppercase characters could never
        # match. ``self.stopwords`` itself is kept exactly as supplied.
        self.to_ignore = {
            token.lower()
            for token in chain(self.stopwords, self.punctuations)
        }

        # Results, populated by ``extract_keywords_from_sentences``.
        self.frequency_dist = None
        self.degree = None
        self.rank_list = None
        self.ranked_phrases = None

    def extract_keywords_from_text(self, text):
        """Method to extract keywords from the text provided.

        The text is split into sentences with
        ``pythainlp.tokenize.sent_tokenize`` and then handed to
        :meth:`extract_keywords_from_sentences`.

        :param text: Text to extract keywords from, provided as a string.
        """
        sentences = pythainlp.tokenize.sent_tokenize(text)
        self.extract_keywords_from_sentences(sentences)

    def extract_keywords_from_sentences(self, sentences):
        """Method to extract keywords from the list of sentences provided.

        Replaces the results of any previous extraction on this instance.

        :param sentences: Text to extract keywords from, provided as a list
            of strings, where each string is a sentence.
        """
        phrase_list = self._generate_phrases(sentences)
        # The rank list needs both the frequencies and the degrees, so it
        # has to be built last.
        self._build_frequency_dist(phrase_list)
        self._build_word_co_occurance_graph(phrase_list)
        self._build_ranklist(phrase_list)

    def get_ranked_phrases(self):
        """Method to fetch ranked keyword strings.

        :return: List of strings where each string represents an extracted
            keyword string, ordered from highest to lowest score.
        """
        return self.ranked_phrases

    def get_ranked_phrases_with_scores(self):
        """Method to fetch ranked keyword strings along with their scores.

        :return: List of tuples where each tuple is formed of a score and
            the extracted keyword string. Ex: (5.68, 'four scores')
        """
        return self.rank_list

    def get_word_frequency_distribution(self):
        """Method to fetch the word frequency distribution in the given text.

        :return: ``collections.Counter`` of the format `word -> frequency`.
        """
        return self.frequency_dist

    def get_word_degrees(self):
        """Method to fetch the degree of words in the given text.

        Degree can be defined as the sum of co-occurrences of the word with
        the words (itself included) of the phrases it appears in.

        :return: Dictionary (defaultdict) of the format `word -> degree`.
        """
        return self.degree

    def _build_frequency_dist(self, phrase_list):
        """Builds frequency distribution of the words in the given phrases.

        :param phrase_list: Iterable of word tuples where each tuple is a
            collection of words which form a contender phrase.
        """
        self.frequency_dist = Counter(chain.from_iterable(phrase_list))

    def _build_word_co_occurance_graph(self, phrase_list):
        """Builds the co-occurrence graph of words in the given phrases to
        compute the degree of each word.

        :param phrase_list: Iterable of word tuples where each tuple is a
            collection of words which form a contender phrase.
        """
        co_occurance_graph = defaultdict(lambda: defaultdict(lambda: 0))
        for phrase in phrase_list:
            # For each phrase in the phrase list, count co-occurrences of the
            # word with the other words in the phrase. The product includes
            # the pair (word, word), so a word also counts towards its own
            # degree.
            #
            # Note: Keep the co-occurrence graph as is, to help facilitate
            # its use in other creative ways if required later.
            for (word, coword) in product(phrase, phrase):
                co_occurance_graph[word][coword] += 1

        self.degree = defaultdict(lambda: 0)
        for key in co_occurance_graph:
            self.degree[key] = sum(co_occurance_graph[key].values())

    def _build_ranklist(self, phrase_list):
        """Method to rank each contender phrase using the formula

            phrase_score = sum of scores of words in the phrase.
            word_score = d(w)/f(w) where d is degree and f is frequency.

        Requires ``self.degree`` and ``self.frequency_dist`` to have been
        built from the same phrases beforehand.

        :param phrase_list: Iterable of word tuples where each tuple is a
            collection of words which form a contender phrase.
        """
        self.rank_list = []
        for phrase in phrase_list:
            rank = 0.0
            for word in phrase:
                rank += 1.0 * self.degree[word] / self.frequency_dist[word]
            self.rank_list.append((rank, ' '.join(phrase)))
        # Highest score first; equal scores fall back to reverse
        # lexicographic order of the phrase text (plain tuple comparison).
        self.rank_list.sort(reverse=True)
        self.ranked_phrases = [ph[1] for ph in self.rank_list]

    def _generate_phrases(self, sentences):
        """Method to generate contender phrases given the sentences of the
        text document.

        Every sentence is tokenized with ``pythainlp.word_tokenize`` and the
        tokens are lower-cased before phrases are formed.

        :param sentences: List of strings where each string represents a
            sentence which forms the text.
        :return: Set of string tuples where each tuple is a collection
            of words forming a contender phrase. Being a set, a phrase that
            occurs several times in the text is kept only once.
        """
        phrase_list = set()
        # Create contender phrases from sentences.
        for sentence in sentences:
            word_list = [
                word.lower() for word in pythainlp.word_tokenize(sentence)
            ]
            phrase_list.update(self._get_phrase_list_from_words(word_list))
        return phrase_list

    def _get_phrase_list_from_words(self, word_list):
        """Method to create contender phrases from the list of words that
        form a sentence by dropping stopwords and punctuations and grouping
        the remaining words into phrases. Ex:

        Sentence: Red apples, are good in flavour.
        List of words: ['red', 'apples', ',', 'are', 'good', 'in', 'flavour']
        List after dropping punctuations and stopwords.
        List of words: ['red', 'apples', *, *, 'good', *, 'flavour']
        List of phrases: [('red', 'apples'), ('good',), ('flavour',)]

        :param word_list: List of words which form a sentence when joined in
            the same order.
        :return: List of contender phrases (tuples of words) that are formed
            after dropping stopwords and punctuations.
        """
        # groupby splits the sentence into alternating runs of kept and
        # ignored words; only the kept runs become phrases.
        groups = groupby(word_list, lambda x: x not in self.to_ignore)
        return [tuple(group[1]) for group in groups if group[0]]
Add Comment
Please, Sign In to add comment