# Instructor: Kang-Pyo Lee
# ! pip install --user --upgrade scikit-learn pyldavis
# ! pip install --user wordcloud
import pandas as pd
import nltk

# Load the scraped HTML metadata (tab-separated), drop exact duplicate rows,
# and force the title column to string so the text pipeline below is safe
# against NaN / non-string values.
df2 = pd.read_csv("html_metadata.csv", sep="\t").drop_duplicates(keep="first")
df2.article_title = df2.article_title.astype(str)
df2  # notebook-style display of the cleaned frame
import nltk
import string
from nltk.corpus import stopwords

nltk.download('stopwords')

# NLTK's standard English stopword list.
global_stopwords = stopwords.words("english")

# Corpus-specific noise: ASCII punctuation plus tokenizer artifacts,
# mojibake fragments, and high-frequency filler words seen in this dataset.
local_stopwords = list(string.punctuation) + [
    '’', '``', '…', '...', "''", '‘', '“', '”',
    "'m", "'re", "'s", "'ve", 'amp', 'https', "n't", 'rt',
    'a…', 'co', 'i…', 'it’s', 'â€\x9d', 'one', 'could', 'would',
    'also', 'â€', 'said', '—', 'itâ',
]
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the article titles. Terms appearing in more than
# 70% of documents are treated as corpus-wide noise and dropped via max_df.
vectorizer = TfidfVectorizer(
    use_idf=True,
    norm="l2",
    stop_words=global_stopwords + local_stopwords,
    max_df=0.7,
)
X = vectorizer.fit_transform(df2.article_title)
X.shape  # (n_documents, n_terms)
# ---- K-means clustering of the TF-IDF title vectors ----
k = 5  # number of clusters (chosen manually)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=k, random_state=0)  # fixed seed for reproducible assignments
kmeans
%time kmeans.fit(X)  # IPython magic: time the fit; leaves the fitted model in `kmeans`
kmeans.cluster_centers_  # (k, n_features) centroid matrix
kmeans.cluster_centers_.shape
kmeans.labels_  # cluster id per document, aligned with df2 rows
df2["cluster"] = kmeans.labels_
df2[["article_title", "cluster"]]
df2.cluster.value_counts()
counts = df2.cluster.value_counts()
# Peek at 10 random titles from the largest and the smallest cluster.
df2[df2.cluster == counts.idxmax()].sample(10, replace=False, random_state=0)[["article_title", "cluster"]]
df2[df2.cluster == counts.idxmin()].sample(10, replace=False, random_state=0)[["article_title", "cluster"]]
import nltk

# Tokenize each title and attach NLTK part-of-speech tags;
# tagged_words holds lists of (word, pos_tag) tuples.
df2["words"] = df2.article_title.apply(nltk.word_tokenize)
df2["tagged_words"] = df2.words.apply(nltk.pos_tag)
from collections import Counter
def get_counter(dataframe, stopwords=None):
    """Count, for each word, the number of rows whose tagged_words contain it.

    Words are lowercased and counted at most once per row, so this is a
    document-frequency count, not a raw term-frequency count.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have a ``tagged_words`` column of ``[(word, pos_tag), ...]`` lists.
    stopwords : iterable of str, optional
        Lowercased words to exclude from the counts. Defaults to no exclusions.

    Returns
    -------
    collections.Counter
        Maps word -> number of rows containing it.
    """
    # `None` default avoids the mutable-default-argument pitfall of the
    # original `stopwords=[]`; a set makes membership tests O(1) instead
    # of scanning a list for every token.
    stopword_set = set() if stopwords is None else set(stopwords)
    counter = Counter()
    for tagged in dataframe.tagged_words:
        # Build a set so each distinct word counts once per row.
        row_words = {word.lower() for word, _tag in tagged}
        counter.update(row_words - stopword_set)
    return counter
# Document-frequency counts for the largest vs. smallest cluster,
# with both stopword lists applied.
combined_stopwords = global_stopwords + local_stopwords

counter_max = get_counter(df2[df2.cluster == counts.idxmax()], combined_stopwords)
counter_max.most_common(30)

counter_min = get_counter(df2[df2.cluster == counts.idxmin()], combined_stopwords)
counter_min.most_common(30)
# ! pip install --user wordcloud
from wordcloud import WordCloud
from IPython.display import Image

# Render the largest cluster's word frequencies as a word-cloud image
# and display it inline from the saved file.
cloud = WordCloud(background_color="white", max_words=100, width=800, height=500)
cloud.generate_from_frequencies(counter_max)
cloud.to_file("wordcloud.png")
Image(filename="wordcloud.png")
### LDA Topic Modeling
num_topics = 5  # number of latent topics to extract
from sklearn.decomposition import LatentDirichletAllocation as LDA
lda = LDA(n_components=num_topics, random_state=0) # LDA uses randomness to get a probability distribution
lda
%time lda.fit(X)  # IPython magic: time the fit on the TF-IDF matrix
lda.components_  # (num_topics, n_features) topic-word weight matrix
lda.components_.shape
def show_topics(model, feature_names, num_top_words):
    """Print the num_top_words highest-weighted terms for each topic.

    `model` must expose a `components_` array of per-topic term scores;
    `feature_names` maps column indices back to terms.
    """
    for idx, scores in enumerate(model.components_):
        print("***Topic {}:".format(idx))
        # Indices of the top-scoring terms, highest first.
        top = scores.argsort()[::-1][:num_top_words]
        terms = ["{:.2f} * {}".format(scores[i], feature_names[i]) for i in top]
        print(" + ".join(terms))
        print()
# get_feature_names() was removed in scikit-learn 1.2 (this notebook
# pip-upgrades scikit-learn above); get_feature_names_out(), available
# since 1.0, is the drop-in replacement.
show_topics(lda, vectorizer.get_feature_names_out(), 10)
import pyLDAvis

# pyLDAvis 3.4 renamed its sklearn integration module to pyLDAvis.lda_model
# (this notebook pip-upgrades pyldavis above); fall back to the old module
# name for older installs.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

pyLDAvis.enable_notebook()
# Interactive topic visualization of the fitted LDA model over the TF-IDF matrix.
pyldavis_sklearn.prepare(lda, X, vectorizer)
### End of Part 3