  1. """
  2. Further messing around with NLP and extracting topics.
  3.  
  4. Built a class for analyzing text to determine topics. It allows for part of speech analysis and has a pipeline for
  5. cleaning the data set. It then uses the Gensim LDA model to determine which words are the topics.
  6.  
  7. Using the Gensim part of speech analysis to filter out everything but nouns, in conjunction with my previous
  8. lemmatization and stemming I was able to get results that seem much better even with still using only the SKLearn news
  9. group data set.
  10. """
from __future__ import annotations
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from typing import AnyStr, Callable, List, Set, Tuple, Type, Union
import gensim
import nltk
import numpy as np
import os.path
import sklearn.datasets
import sys


class TopicAnalyzer:
    def __init__(
        self,
        dictionary_fname: AnyStr,
        model_fname: AnyStr,
        regen: bool = False,
        seed: int = 400,
        stopwords: Union[Set, frozenset] = STOPWORDS,
        model_class: Type[gensim.models.LdaModel] = gensim.models.LdaMulticore,
        # Must take a list of tokens and return an iterable of the plain word
        # tokens to keep. The default keeps everything; see __main__ for a
        # filter that keeps only nouns.
        part_of_speech: Callable = lambda tokens: tokens,
        verbose: bool = False,
    ):
        self._dictionary = None
        self._dictionary_fname = dictionary_fname
        self._documents = []
        self._model = None
        self._model_fname = model_fname
        self._preparation_pipeline = []
        self._prepared_documents = []
        self._regen = regen
        self._stopwords = set(stopwords)
        self._model_class = model_class

        self._dictionary_config = {}
        self._dictionary_extremes_config = {}
        self._model_config = {}

        self._part_of_speech = part_of_speech
        self._verbose = verbose

        np.random.seed(seed)

    @property
    def dictionary(self) -> gensim.corpora.Dictionary:
        if not self._dictionary:
            if self.can_use_existing_model():
                self._load_dictionary()
            else:
                self._generate()
        return self._dictionary

    @property
    def documents(self) -> List[AnyStr]:
        return self._documents

    @property
    def model(self) -> gensim.models.ldamodel.LdaModel:
        if not self._model:
            if self.can_use_existing_model():
                self._load_model()
            else:
                self._generate()
        return self._model

    @property
    def regen(self) -> bool:
        return self._regen

    @property
    def stopwords(self) -> Set:
        return self._stopwords

    def add_document(self, *documents: AnyStr) -> TopicAnalyzer:
        """Adds documents to the corpus that will be used to generate the
        dictionary and model. Documents are only stored if the dictionary and
        model might be generated."""
        if not self.can_use_existing_model():
            self._progress(f"Adding {len(documents)} Documents")
            for index, document in enumerate(documents):
                excerpt = document[:100].replace("\n", " ")
                self._progress(f"{index + 1:6}/{len(documents):6}: {excerpt:100} \r", "")
                self._documents.append(document)
                self._prepared_documents.append(self._prepare_document(document))
            self._progress("")
        return self

    def add_prep_step(self, step: Callable) -> TopicAnalyzer:
        """Adds a step to the document preparation pipeline that is used for
        cleaning documents before feeding them into the dictionary and model."""
        self._preparation_pipeline.append(step)
        return self

    def add_stop_words(self, *words: AnyStr) -> TopicAnalyzer:
        self._stopwords.update(words)
        return self

    def analyze_topics(self, document: AnyStr) -> List[Tuple[int, float]]:
        """Returns (topic id, probability) pairs for the document, sorted from
        most to least probable."""
        self._progress(f"Asked to analyze '{document[:25]}...'")
        bow_vector = self.dictionary.doc2bow(self._prepare_document(document))
        return sorted(self.model[bow_vector], key=lambda tup: -tup[1])

    def configure_dictionary(self, **kwargs) -> TopicAnalyzer:
        self._dictionary_config = kwargs
        return self

    def configure_dictionary_extremes(self, **kwargs) -> TopicAnalyzer:
        self._dictionary_extremes_config = kwargs
        return self

    def configure_model(self, **kwargs) -> TopicAnalyzer:
        self._model_config = kwargs
        return self

    def can_use_existing_model(self) -> bool:
        """If the user hasn't requested the model to be regenerated and both
        the dictionary and model files exist, we use them."""
        return (
            not self.regen
            and os.path.exists(self._dictionary_fname)
            and os.path.exists(self._model_fname)
        )

    def prepared_documents(self) -> List[List[AnyStr]]:
        return self._prepared_documents

    def _generate(self):
        """Generates and saves both the dictionary and the LDA model."""
        self._progress("Generating Dictionary")
        self._dictionary = self._generate_dictionary()
        self._progress(f"Saving Dictionary ({self._dictionary_fname})")
        self._dictionary.save(self._dictionary_fname)

        self._progress("Generating Model")
        self._model = self._generate_model()
        self._progress(f"Saving Model ({self._model_fname})")
        self._model.save(self._model_fname)

        self._regen = False

    def _generate_dictionary(self) -> gensim.corpora.Dictionary:
        dictionary = gensim.corpora.Dictionary(
            self.prepared_documents(), **self._dictionary_config
        )
        dictionary.filter_extremes(**self._dictionary_extremes_config)
        return dictionary

    def _generate_model(self) -> gensim.models.ldamodel.LdaModel:
        bow_corpus = [
            self._dictionary.doc2bow(doc) for doc in self.prepared_documents()
        ]
        return self._model_class(
            bow_corpus, id2word=self._dictionary, **self._model_config
        )

    def _load_dictionary(self):
        self._dictionary = gensim.corpora.Dictionary.load(self._dictionary_fname)

    def _load_model(self):
        self._model = self._model_class.load(self._model_fname)

    def _prepare_document(self, document: AnyStr) -> List[AnyStr]:
        """Tokenizes a document, filters it by part of speech and stop words,
        then runs each surviving word through the preparation pipeline."""
        prepared_words = []
        for word in self._part_of_speech(simple_preprocess(document)):
            if len(word) > 3 and word not in self.stopwords:
                for step in self._preparation_pipeline:
                    word = step(word)
                    if not word:
                        break
                if word:
                    prepared_words.append(word)
        return prepared_words

    def _progress(self, message: AnyStr, line_end="\n"):
        if self._verbose:
            sys.stdout.write(f"{message}{line_end}")
            sys.stdout.flush()


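# A minimal illustrative sketch (not wired into the script): a named equivalent
# of the part_of_speech lambda passed to TopicAnalyzer in __main__ below.
# nltk.pos_tag returns (word, tag) pairs, and this keeps only the words whose
# Penn Treebank tag marks them as a noun.
def keep_nouns(tokens: List[AnyStr]) -> List[AnyStr]:
    return [
        word
        for word, pos in nltk.pos_tag(tokens)
        if pos in {"NN", "NNS", "NNP", "NNPS"}
    ]

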
if __name__ == "__main__":
    nltk.download("wordnet")
    nltk.download("averaged_perceptron_tagger")
    stemmer = SnowballStemmer("english")
    lemmatizer = WordNetLemmatizer()
    newsgroups_train = sklearn.datasets.fetch_20newsgroups(subset="train", shuffle=True)

    analyzer = TopicAnalyzer(
        "dictionary.txt",
        "lda-model.txt",
        # regen=True,
        part_of_speech=lambda tokens: (
            word
            for word, pos in nltk.pos_tag(tokens)
            if pos in {"NN", "NNS", "NNP", "NNPS"}
        ),
        verbose=True,
    )

    # The preparation pipeline has to be in place before documents are added,
    # because add_document prepares each document as it is stored.
    analyzer.add_prep_step(
        lambda word: lemmatizer.lemmatize(word)
    ).add_prep_step(lambda word: stemmer.stem(word))

    analyzer.add_document(*newsgroups_train.data)

    analyzer.configure_dictionary_extremes(no_below=15, no_above=0.1, keep_n=100000)
    analyzer.configure_model(num_topics=8, passes=10, workers=2)

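    # A hedged addition for inspection, not part of the original flow: touching
    # analyzer.model triggers the lazy load/generate, and gensim's standard
    # print_topics method summarizes the top words per topic. num_words=5 is an
    # arbitrary choice for readability.
    for topic_id, words in analyzer.model.print_topics(num_words=5):
        print(f"Topic {topic_id}: {words}")
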
    documents = [
        (
            "The knockout stage of the 2019 Cricket World Cup will see "
            "two semi-finals, with the winners of each progressing to "
            "the final at Lord's. The first semi-final will be held at "
            "Old Trafford in Manchester and the second semi-final will "
            "be held at Edgbaston in Birmingham just as they did back "
            "in 1999, with all of the knockout games having a reserve "
            "day. It will be the third time Edgbaston has hosted a "
            "World Cup semi-final and the fourth semi-final to be held "
            "at Old Trafford - a record for a World Cup venue. The "
            "final will be held at Lord's in London for a record fifth "
            "time.\n\nOn 25 June 2019, Australia became the first team "
            "to qualify for the semi-finals, after beating England at "
            "Lord's. India became the second team to qualify for the "
            "semi-finals, after they defeated Bangladesh at Edgbaston "
            "on 2 July 2019. The following day saw tournament hosts "
            "England become the third team to qualify for the "
            "semi-finals, after they beat New Zealand at the Riverside "
            "Ground. New Zealand were the fourth and final team to "
            "qualify for the semi-finals, after Pakistan were unable to "
            "increase their net run rate sufficiently enough in their "
            "match against Bangladesh at Lord's."
        ),
        "Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and dedicated, can long endure.",
        "The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the political bands which have connected them with another, and to assume among the powers of the earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle them, a decent respect to the opinions of mankind requires that they should declare the causes which impel them to the separation. We hold these truths to be self-evident, that all men are created equal, that they are endowed by their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit of Happiness.--That to secure these rights, Governments are instituted among Men, deriving their just powers from the consent of the governed, --That whenever any Form of Government becomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to institute new Government, laying its foundation on such principles and organizing its powers in such form, as to them shall seem most likely to effect their Safety and Happiness. Prudence, indeed, will dictate that Governments long established should not be changed for light and transient causes; and accordingly all experience hath shewn, that mankind are more disposed to suffer, while evils are sufferable, than to right themselves by abolishing the forms to which they are accustomed. But when a long train of abuses and usurpations, pursuing invariably the same Object evinces a design to reduce them under absolute Despotism, it is their right, it is their duty, to throw off such Government, and to provide new Guards for their future security.--Such has been the patient sufferance of these Colonies; and such is now the necessity which constrains them to alter their former Systems of Government. The history of the present King of Great Britain is a history of repeated injuries and usurpations, all having in direct object the establishment of an absolute Tyranny over these States. To prove this, let Facts be submitted to a candid world.",
        "Popular part of speech taggers (NLTK POS tagger, Stanford POS tagger) often make mistakes in the CV’s phrases tagging task. The reason is that often a CV text neglects grammar in order to highlight experience and to give it some structure (people start sentences with a predicate, not with a subject, sometimes phrases miss appropriate grammatical structure), a lot of words are specific terms or names. We had to write our own POS tagger solving the aforementioned problems. The classification is performed with a Keras neural network with three input layers each designed to take special class of data. The first input layer takes a variable length vector comprised of the described above features of the candidate phrases which could have arbitrary number of words. This feature vector is processed with an LSTM layer.",
        "One day you decided to create a navigation app for casual travelers. The app was centered around a beautiful map which helped users quickly orient themselves in any city. One of the most requested features for the app was automatic route planning. A user should be able to enter an address and see the fastest route to that destination displayed on the map. The first version of the app could only build the routes over roads. People who traveled by car were bursting with joy. But apparently, not everybody likes to drive on their vacation. So with the next update, you added an option to build walking routes. Right after that, you added another option to let people use public transport in their routes. However, that was only the beginning. Later you planned to add route building for cyclists. And even later, another option for building routes through all of a city’s tourist attractions.",
        '"Hi- yi ! You\'re up a stump, ain\'t you!" No answer. Tom surveyed his last touch with the eye of an artist, then he gave his brush another gentle sweep and surveyed the result, as before. Ben ranged up alongside of him. Tom\'s mouth watered for the apple, but he stuck to his work. Ben said: "Hello, old chap, you got to work, hey?" Tom wheeled suddenly and said: "Why, it\'s you, Ben! I warn\'t noticing." "Say -- I\'m going in a-swimming, I am. Don\'t you wish you could? But of course you\'d druther work -- wouldn\'t you? Course you would!" Tom contemplated the boy a bit, and said: "What do you call work?" "Why, ain\'t that work?"',
        "Our job is to love others without stopping to inquire whether or not they are worthy.",
        "You never know which thing you do is going to turn out to be important.",
    ]

    for document in documents:
        topics = analyzer.analyze_topics(document)

        print(f"\n--- --- Analyze --- ---\n\n{document}\n\n--- --- Analysis --- ---\n")
        for index, score in topics:
            print(analyzer.model.show_topic(index, 5))
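        # Hedged addition: topics is sorted by descending probability, so the
        # first entry is the document's dominant topic.
        if topics:
            dominant_topic, dominant_score = topics[0]
            print(f"Dominant topic: {dominant_topic} (p={dominant_score:.3f})")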