File size: 3,422 Bytes
87e9481
 
 
 
 
 
 
 
ab5e72f
87e9481
 
ab5e72f
 
 
87e9481
 
 
 
ab5e72f
 
87e9481
 
 
 
 
 
 
 
 
b409097
87e9481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from bertopic import BERTopic
import streamlit as st
import streamlit.components.v1 as components
#from datasets import load_dataset
import pandas as pd
from datasets import load_dataset
import json

##Load Dataset from HF Hub
#dataset = load_dataset("rshah/million-headlines")
#news = pd.DataFrame.from_dict(dataset["train"])

# Load dataset locally - faster for the demo than pulling from the HF Hub.
news = pd.read_parquet("topic_10000.par")
# publish_date is stored as YYYYMMDD integers; parse into real datetimes
# so the topics-over-time step can bin by date.
news['date'] = pd.to_datetime(news['publish_date'], format='%Y%m%d')
timestamps = news.date.to_list()
tweets = news.headline_text.to_list()

# Load the precomputed per-headline topic assignments saved as JSON by the
# offline model-building step. Explicit encoding avoids depending on the
# platform's default locale encoding (PEP 597).
with open("topics", "r", encoding="utf-8") as fp:
    topics = json.load(fp)

# Page chrome and intro text. (A hard-coded `option_n = 5` default used to
# live here, but it was dead code: the sidebar number_input below assigns
# option_n unconditionally before it is ever read.)
st.set_page_config(page_title="News Topic Clustering")
st.title("News Topic Clustering")
st.caption("By Rajiv Shah")
st.caption("")
st.caption("This is a simple example of using identifying topics in the [one million ABC news headline dataset](https://huggingface.co/datasets/rshah/million-headlines).  \
    If you look at the code for this app, you will see how it uses just a few lines of [BERTopic](https://maartengr.github.io/BERTopic/index.html) to \
         build the topics and create the visualizations")
st.caption("The preloaded existing model provides the more interesting results.  However, this app can be run live by building a new model, but \
    is limited to a small number of rows. I also limited topics over time to the existing model.")


# Sidebar form: collects the model choice and topic index; values are
# applied when the user presses the submit button.
settings_form = st.sidebar.form("Main Settings")
settings_form.header("Main Settings")

# Which model to use: the pretrained one shipped with the app, or a fresh fit.
option = settings_form.selectbox(
    'What model would you like to run',
    ('Load existing model', 'Build new model'),
    index=0,
)

# Topic cluster (by frequency rank) whose top terms are shown later.
option_n = settings_form.number_input(
    'What topic would you like to get terms for?',
    min_value=0,
    max_value=10,
    value=5,
)

submitted = settings_form.form_submit_button(label='Select Model')

if option == 'Load existing model':
    # Reuse the pretrained model; `topics` already holds its per-headline
    # assignments (loaded from the "topics" JSON file earlier).
    topic_model = BERTopic.load("topic_10000.model")
else:
    # Fit a fresh model on a small, reproducible sample of 2017-2018
    # headlines — kept tiny so the live demo stays responsive.
    in_window = (news['date'] > '2017-01-01') & (news['date'] < '2019-01-01')
    news_sample = news[in_window].sample(200, random_state=123)
    tweets = news_sample.headline_text.to_list()  # rebinds the global doc list
    topic_model = BERTopic(min_topic_size=5, verbose=True)
    topics, _ = topic_model.fit_transform(tweets)


# Show the topic clusters ranked by frequency. Row 0 of get_topic_info()
# is BERTopic's outlier bucket (topic -1), so it is dropped before display.
# (A stray `freq.head(10)` no-op — notebook leftover — was removed here.)
freq = topic_model.get_topic_info()
freq = freq.iloc[1:, :]   ##drop -1 row
st.header("The Main Topic Clusters")
st.write(freq)


# The sidebar allows ranks up to 10, but a freshly built model (only 200
# docs) may yield fewer topics; clamp so freq.iloc[] cannot raise IndexError.
# NOTE(review): assumes freq is non-empty — a model with zero real topics
# would still fail; confirm whether that case can occur in practice.
topic_rank = min(option_n, len(freq) - 1)
topic_nr = freq.iloc[topic_rank]["Topic"]  # We select a frequent topic
st.caption("")
st.write('Top words in topic cluster: ', topic_rank)
# get_topic() returns (word, c-TF-IDF score) pairs; show just the words.
for word, _score in topic_model.get_topic(topic_nr):
    st.write(str(word))

# Inter-topic structure: dendrogram of hierarchically clustered topics.
st.header("Relationships between clusters ")
hierarchy_fig = topic_model.visualize_hierarchy()
st.plotly_chart(hierarchy_fig)


if option == 'Load existing model':
    # Dynamic topic modelling is only offered for the pretrained model,
    # where `topics`, `tweets`, and `timestamps` all cover the full dataset.
    st.header("Topics over time for Existing Model")
    over_time = topic_model.topics_over_time(
        docs=tweets,
        topics=topics,
        timestamps=timestamps,
        global_tuning=True,
        evolution_tuning=True,
        nr_bins=20,  # bucket timestamps into 20 bins for a readable chart
    )
    over_time_fig = topic_model.visualize_topics_over_time(over_time, top_n_topics=20)
    st.plotly_chart(over_time_fig)