Guest User

Untitled

a guest
Oct 18th, 2018
96
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.64 KB | None | 0 0
  1. import os
  2.  
  3. from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
  4. from sklearn.decomposition import LatentDirichletAllocation
  5. import pandas as pd
  6. import numpy as np
  7.  
  8. class TopicModelling:
  9. def topic_modelling(self, clean_docs):
  10. vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
  11. dtm = vect.fit_transform(clean_docs)
  12. dtf_df = pd.DataFrame(dtm.toarray(), columns=vect.get_feature_names())
  13. lda = LatentDirichletAllocation(n_topics= 20, n_components=5)
  14. doc_topic = lda.fit_transform(dtm)
  15. return (vect, lda, doc_topic)
  16.  
  17. def display_topics(self, lda, feature_names, no_top_words):
  18. for topic_idx, topic in enumerate(lda.components_):
  19. print("Topic %d:" % (topic_idx))
  20. print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
  21.  
  22. def display_do_topic(self, doc_topic):
  23. for i in range(doc_topic.shape[0]):
  24. topic_most_pr = doc_topic[i].argmax()
  25. print("doc: {} topic: {}\n".format(i,topic_most_pr))
  26.  
  27.  
  28. if (__name__ == '__main__'):
  29.  
  30. input_folder_path = "<data-folder-path>"
  31.  
  32. files_path = [os.path.join(input_folder_path, x) for x in os.listdir(input_folder_path)]
  33.  
  34. cleaned_docs = []
  35. for file in files_path:
  36. f = open(file)
  37. content = f.read()
  38. cleaned_docs.append(content)
  39.  
  40. topic_modelling_obj = TopicModelling()
  41.  
  42. (vect, lda, doc_topic) = topic_modelling_obj.topic_modelling(cleaned_raw_docs)
  43.  
  44. topic_modelling_obj.display_doc_topic(doc_topic)
  45.  
  46. no_top_words = 10
  47. feature_names = vect.get_feature_names()
  48. topic_modelling_obj.display_topics(lda, feature_names, no_top_words)
Add Comment
Please, Sign In to add comment