Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
class TopicModelling:
    """Fit an LDA topic model over a pre-cleaned corpus and print summaries."""

    def topic_modelling(self, clean_docs):
        """Vectorize *clean_docs* (iterable of str) and fit a 5-topic LDA.

        Returns a tuple ``(vectorizer, lda_model, doc_topic)`` where
        ``doc_topic`` is the documents-by-topics probability matrix.
        """
        vect = CountVectorizer(ngram_range=(1, 1), stop_words='english')
        dtm = vect.fit_transform(clean_docs)
        # BUG FIX: the original passed both `n_topics=20` and `n_components=5`.
        # `n_topics` was renamed to `n_components` and later removed from
        # sklearn, so that call raises TypeError on any modern version; the
        # effective topic count was `n_components=5`, which is kept here.
        # (Also dropped an unused DataFrame built via the removed
        # `get_feature_names()` API — dead code.)
        lda = LatentDirichletAllocation(n_components=5)
        doc_topic = lda.fit_transform(dtm)
        return (vect, lda, doc_topic)

    def display_topics(self, lda, feature_names, no_top_words):
        """Print the *no_top_words* highest-weight terms of each topic.

        ``lda`` must expose a ``components_`` array (topics x terms);
        ``feature_names`` maps term indices back to words.
        """
        for topic_idx, topic in enumerate(lda.components_):
            print("Topic %d:" % (topic_idx))
            # argsort is ascending; the reversed slice takes the top terms.
            print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

    def display_doc_topic(self, doc_topic):
        """Print the most probable topic index for every document row."""
        for i in range(doc_topic.shape[0]):
            topic_most_pr = doc_topic[i].argmax()
            print("doc: {} topic: {}\n".format(i, topic_most_pr))

    # BUG FIX: the original defined this method as `display_do_topic` (typo)
    # while the script called `display_doc_topic`. The corrected name is
    # primary; the old name is kept as an alias so existing callers work.
    display_do_topic = display_doc_topic
if __name__ == '__main__':
    # Folder holding the pre-cleaned text documents, one document per file.
    input_folder_path = "<data-folder-path>"
    files_path = [os.path.join(input_folder_path, x) for x in os.listdir(input_folder_path)]

    cleaned_docs = []
    for file_path in files_path:
        # BUG FIX: the original opened each file without ever closing it
        # (handle leak) and with no explicit encoding; `with` guarantees
        # closure even if read() raises.
        with open(file_path, encoding='utf-8') as f:
            cleaned_docs.append(f.read())

    topic_modelling_obj = TopicModelling()
    # BUG FIX: the original passed the undefined name `cleaned_raw_docs`
    # (NameError); the list built above is `cleaned_docs`.
    (vect, lda, doc_topic) = topic_modelling_obj.topic_modelling(cleaned_docs)
    # BUG FIX: the original called `display_doc_topic`, but the class defines
    # `display_do_topic`; call the name the class actually provides.
    topic_modelling_obj.display_do_topic(doc_topic)

    no_top_words = 10
    # `get_feature_names()` was removed in scikit-learn 1.2; prefer the
    # replacement API and fall back for older installations.
    if hasattr(vect, "get_feature_names_out"):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()
    topic_modelling_obj.display_topics(lda, feature_names, no_top_words)
Add Comment
Please, Sign In to add comment