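# Non-Python header text (appears to be repository page metadata), kept as
# comments so the file parses:
# rajistics's picture
# fixed BERTopic name
# b409097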
from bertopic import BERTopic
import streamlit as st
import streamlit.components.v1 as components
#from datasets import load_dataset
import pandas as pd
from datasets import load_dataset
import json
# --- Data loading -----------------------------------------------------------
# The headlines can also be pulled from the HF Hub, but reading the local
# parquet file is faster for the demo. Hub variant, kept for reference:
#   dataset = load_dataset("rshah/million-headlines")
#   news = pd.DataFrame.from_dict(dataset["train"])
news = pd.read_parquet("topic_10000.par")

# publish_date is parsed with the compact YYYYMMDD format into a new column.
news["date"] = pd.to_datetime(news["publish_date"], format="%Y%m%d")

# Plain Python lists of the headline texts and their dates.
tweets = news["headline_text"].to_list()
timestamps = news["date"].to_list()

# Load the topics list from the JSON file "topics"; it is passed to
# topics_over_time further down when the existing model is selected.
with open("topics", "r") as fp:
    topics = json.load(fp)

# Initial value only: the sidebar number input assigns option_n again later.
option_n = 5
# --- Page header ------------------------------------------------------------
# The long caption texts are assembled from adjacent string literals so each
# source line stays short; the resulting strings are unchanged.
_APP_TITLE = "News Topic Clustering"
_INTRO_TEXT = (
    "This is a simple example of using identifying topics in the "
    "[one million ABC news headline dataset]"
    "(https://huggingface.co/datasets/rshah/million-headlines). "
    "If you look at the code for this app, you will see how it uses just "
    "a few lines of "
    "[BERTopic](https://maartengr.github.io/BERTopic/index.html) to "
    "build the topics and create the visualizations"
)
_MODEL_NOTE_TEXT = (
    "The preloaded existing model provides the more interesting results. "
    "However, this app can be run live by building a new model, but "
    "is limited to a small number of rows. "
    "I also limited topics over time to the existing model."
)

# set_page_config is the first Streamlit call made by this script.
st.set_page_config(page_title=_APP_TITLE)
st.title(_APP_TITLE)
st.caption("By Rajiv Shah")
st.caption("")  # empty caption (presumably a visual spacer)
st.caption(_INTRO_TEXT)
st.caption(_MODEL_NOTE_TEXT)
# --- Sidebar form -----------------------------------------------------------
# Both inputs live in one sidebar form with a single submit button.
form = st.sidebar.form("Main Settings")
form.header("Main Settings")

# Which model to use; the first entry (index 0) is preselected.
_model_choices = ("Load existing model", "Build new model")
option = form.selectbox(
    label="What model would you like to run",
    options=_model_choices,
    index=0,
)

# Integer in 0..10 (default 5); used later as a positional row index into
# the topic table.
option_n = form.number_input(
    label="What topic would you like to get terms for?",
    min_value=0,
    max_value=10,
    value=5,
)

submitted = form.form_submit_button(label="Select Model")
# --- Model selection --------------------------------------------------------
if option != "Load existing model":
    # Build a new topic model live on a small, reproducible sample of
    # headlines dated strictly between 2017-01-01 and 2019-01-01.
    # (A previous variant filtered on news['date'] > '2015-06-01' instead.)
    in_window = (news["date"] > "2017-01-01") & (news["date"] < "2019-01-01")
    news_sample = news[in_window].sample(200, random_state=123)

    # From here on, tweets refers to the sampled headlines only.
    tweets = news_sample["headline_text"].to_list()
    topic_model = BERTopic(min_topic_size=5, verbose=True)
    topics, _ = topic_model.fit_transform(tweets)
else:
    # Load the saved model from disk. Recomputing the topic assignments is
    # left disabled:
    #   topics, _ = topic_model.transform(tweets)
    topic_model = BERTopic.load("topic_10000.model")
# --- Main topic clusters ----------------------------------------------------
# Topic summary table for the active model.
freq = topic_model.get_topic_info()
# Remove the outlier row by its topic id (-1) rather than by position, so a
# real topic is not dropped when the -1 row is absent or is not the first row.
freq = freq[freq["Topic"] != -1]

st.header("The Main Topic Clusters")
st.write(freq)

st.caption("")
st.write('Top words in topic cluster: ', option_n)
#st.caption(option_n)

# option_n is a row position in the table above. A small live-built model can
# have fewer rows than the sidebar input allows (up to 10), so check the
# bound instead of letting iloc raise IndexError.
if option_n < len(freq):
    topic_nr = freq.iloc[option_n]["Topic"]
    # get_topic yields (term, score) pairs; fall back to an empty list if it
    # returns a falsy value for an unknown topic id.
    topic_terms = topic_model.get_topic(topic_nr) or []
    for entry in topic_terms:
        # Only the term itself is displayed, not its score.
        st.write(str(entry[0]))
else:
    st.warning(
        f"Topic position {option_n} is not available; "
        f"this model has {len(freq)} topic clusters."
    )
# --- Visualizations ---------------------------------------------------------
# Hierarchy plot of the topics in the active model.
st.header("Relationships between clusters ")
hierarchy_fig = topic_model.visualize_hierarchy()
st.plotly_chart(hierarchy_fig)

# Topics over time is only rendered when the existing model was selected.
if option == "Load existing model":
    st.header("Topics over time for Existing Model")
    topics_over_time = topic_model.topics_over_time(
        docs=tweets,
        topics=topics,
        timestamps=timestamps,
        global_tuning=True,
        evolution_tuning=True,
        nr_bins=20,
    )
    over_time_fig = topic_model.visualize_topics_over_time(
        topics_over_time,
        top_n_topics=20,
    )
    st.plotly_chart(over_time_fig)