Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
class TopicModelling:
    """Fit an LDA topic model over a pre-cleaned corpus and print summaries."""

    def topic_modelling(self, clean_docs):
        """Vectorize *clean_docs* (iterable of str) and fit a 5-topic LDA.

        Returns a tuple ``(vectorizer, lda_model, doc_topic)`` where
        ``doc_topic`` is the documents-by-topics probability matrix.
        """
        vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
        dtm = vect.fit_transform(clean_docs)
        # BUG FIX: the original passed both `n_topics=20` and `n_components=5`.
        # `n_topics` was renamed to `n_components` and later removed from
        # sklearn, so that call raises TypeError on any modern version; the
        # effective topic count was `n_components=5`, which is kept here.
        # (Also dropped an unused DataFrame built via the removed
        # `get_feature_names()` API — dead code.)
        lda = LatentDirichletAllocation(n_components=5)
        doc_topic = lda.fit_transform(dtm)
        return (vect, lda, doc_topic)

    def display_topics(self, lda, feature_names, no_top_words):
        """Print the *no_top_words* highest-weight terms of each topic.

        ``lda`` must expose a ``components_`` array (topics x terms);
        ``feature_names`` maps term indices back to words.
        """
        for topic_idx, topic in enumerate(lda.components_):
            print("Topic %d:" % (topic_idx))
            # argsort is ascending; the reversed slice takes the top terms.
            print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

    def display_doc_topic(self, doc_topic):
        """Print the most probable topic index for every document row."""
        for i in range(doc_topic.shape[0]):
            topic_most_pr = doc_topic[i].argmax()
            print("doc: {} topic: {}\n".format(i, topic_most_pr))

    # BUG FIX: the original defined this method as `display_do_topic` (typo)
    # while the script called `display_doc_topic`. The corrected name is
    # primary; the old name is kept as an alias so existing callers work.
    display_do_topic = display_doc_topic
if __name__ == '__main__':
    # Folder holding the pre-cleaned text documents, one document per file.
    input_folder_path = "<data-folder-path>"
    files_path = [os.path.join(input_folder_path, x) for x in os.listdir(input_folder_path)]

    cleaned_docs = []
    for file_path in files_path:
        # BUG FIX: the original opened each file without ever closing it
        # (handle leak) and with no explicit encoding; `with` guarantees
        # closure even if read() raises.
        with open(file_path, encoding='utf-8') as f:
            cleaned_docs.append(f.read())

    topic_modelling_obj = TopicModelling()
    # BUG FIX: the original passed the undefined name `cleaned_raw_docs`
    # (NameError); the list built above is `cleaned_docs`.
    (vect, lda, doc_topic) = topic_modelling_obj.topic_modelling(cleaned_docs)
    # BUG FIX: the original called `display_doc_topic`, but the class defines
    # `display_do_topic`; call the name the class actually provides.
    topic_modelling_obj.display_do_topic(doc_topic)

    no_top_words = 10
    # `get_feature_names()` was removed in scikit-learn 1.2; prefer the
    # replacement API and fall back for older installations.
    if hasattr(vect, "get_feature_names_out"):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()
    topic_modelling_obj.display_topics(lda, feature_names, no_top_words)
Add Comment
Please, Sign In to add comment