# Quick test drive of LDA topic modelling
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: a single comma-separated document.
corpus_verbitam = ["computer, tea, early grey, hot"]

# Turn the corpus into a document-term count matrix, dropping English stop words.
cnt_vec = CountVectorizer(stop_words='english')
cnt_data = cnt_vec.fit_transform(corpus_verbitam)

# Vocabulary, indexed by the column position used in cnt_data.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cnt_vec, "get_feature_names_out"):
    words = cnt_vec.get_feature_names_out()
else:
    words = cnt_vec.get_feature_names()

# NOTE: consider further removal of stop words and punctuation before fitting.
lda = LDA(n_components=3)
lda.fit(cnt_data)

# Print up to ten of the highest-weighted words for each fitted topic.
for topic_idx, topic in enumerate(lda.components_):
    # Parenthesise the addition: '%' binds before '+', so the unparenthesised
    # form would try to add 1 to the already-formatted string (TypeError).
    print('\nTopic #%d:' % (topic_idx + 1))
    # argsort() is ascending; the [:-11:-1] slice walks it backwards from the
    # end, yielding the indices of the (at most) 10 largest weights, largest first.
    print(' '.join([words[i] for i in topic.argsort()[:-11:-1]]))












