Spaces:
Runtime error
Runtime error
from bertopic import BERTopic | |
import streamlit as st | |
import streamlit.components.v1 as components | |
#from datasets import load_dataset | |
import pandas as pd | |
from datasets import load_dataset | |
import json | |
# --- Data loading -----------------------------------------------------------
# Option A (disabled): pull the full dataset from the Hugging Face Hub.
# dataset = load_dataset("rshah/million-headlines")
# news = pd.DataFrame.from_dict(dataset["train"])

# Option B: read a local parquet extract instead -- faster for the demo.
news = pd.read_parquet("topic_10000.par")
# publish_date is parsed with a YYYYMMDD format into a datetime column.
news["date"] = pd.to_datetime(news["publish_date"], format="%Y%m%d")
timestamps = news["date"].to_list()
tweets = news["headline_text"].to_list()

# Topic assignments stored as JSON in the local "topics" file.
# NOTE(review): presumably one topic id per headline, in the same order as
# `tweets` -- confirm against whatever produced the file.
# The encoding is explicit so the read does not depend on the platform default.
with open("topics", "r", encoding="utf-8") as fp:
    topics = json.load(fp)

# Initial value only; the sidebar number input rebinds option_n later on.
option_n = 5
# --- Page header ------------------------------------------------------------
# Page title, heading, byline and two introductory captions.
st.set_page_config(page_title="News Topic Clustering")
st.title("News Topic Clustering")
st.caption("By Rajiv Shah")
st.caption("")  # empty caption, presumably used as a vertical spacer

intro_text = (
    "This is a simple example of using identifying topics in the "
    "[one million ABC news headline dataset](https://huggingface.co/datasets/rshah/million-headlines). "
    "If you look at the code for this app, you will see how it uses just a few lines of "
    "[BERTopic](https://maartengr.github.io/BERTopic/index.html) to "
    "build the topics and create the visualizations"
)
st.caption(intro_text)

model_note = (
    "The preloaded existing model provides the more interesting results. "
    "However, this app can be run live by building a new model, but "
    "is limited to a small number of rows. "
    "I also limited topics over time to the existing model."
)
st.caption(model_note)
# --- Sidebar settings -------------------------------------------------------
# A sidebar form with two inputs: which model to use and which row of the
# topic table to list terms for.
form = st.sidebar.form("Main Settings")
form.header("Main Settings")

model_choices = ("Load existing model", "Build new model")
option = form.selectbox(
    label="What model would you like to run",
    options=model_choices,
    index=0,
)

# Zero-based position in the topic table; limited to 0..10, default 5.
option_n = form.number_input(
    label="What topic would you like to get terms for?",
    min_value=0,
    max_value=10,
    value=5,
)

submitted = form.form_submit_button(label="Select Model")
# --- Model selection --------------------------------------------------------
if option == "Load existing model":
    # Use the pre-trained model saved next to the app.
    topic_model = BERTopic.load("topic_10000.model")
    # topics, _ = topic_model.transform(tweets)
else:
    # Build a fresh model on a small sample of 2017-2018 headlines.
    # news_sample = news[(news['date'] > '2015-06-01')]
    in_window = (news["date"] > "2017-01-01") & (news["date"] < "2019-01-01")
    news_sample = news[in_window]
    # Cap the sample at the number of rows available: DataFrame.sample raises
    # ValueError when asked for more rows than exist (without replacement).
    sample_size = min(200, len(news_sample))
    news_sample = news_sample.sample(sample_size, random_state=123)
    # `tweets` and `topics` are rebound to the sampled data here; the
    # topics-over-time section only runs for the existing model.
    tweets = news_sample.headline_text.to_list()
    topic_model = BERTopic(min_topic_size=5, verbose=True)
    topics, _ = topic_model.fit_transform(tweets)
# --- Topic table and top words ----------------------------------------------
freq = topic_model.get_topic_info()
# Drop BERTopic's outlier bucket (topic id -1) by id rather than by row
# position, so a real topic is not discarded when the first row is not the
# outlier row (e.g. when the model produced no outliers).
freq = freq[freq["Topic"] != -1]

st.header("The Main Topic Clusters")
st.write(freq)

st.caption("")
st.write('Top words in topic cluster: ', option_n)
# st.caption(option_n)

# option_n is a zero-based row position in the table above. A small live
# model can have fewer rows than the sidebar allows, so guard the lookup
# instead of letting iloc raise IndexError and crash the app.
if option_n < len(freq):
    topic_nr = freq.iloc[option_n]["Topic"]
    mytuple = topic_model.get_topic(topic_nr)
    # Each entry's first element is the term; only the term is shown.
    for item in mytuple:
        st.write(str(item[0]))
else:
    st.warning(
        f"Topic index {option_n} is out of range: "
        f"this model only has {len(freq)} topic clusters."
    )

st.header("Relationships between clusters ")
st.plotly_chart(topic_model.visualize_hierarchy())
# --- Topics over time (existing model only) ---------------------------------
if option == "Load existing model":
    st.header("Topics over time for Existing Model")
    # All arguments are passed by keyword, so the call does not depend on
    # the positional order of topics_over_time's parameters.
    topics_over_time = topic_model.topics_over_time(
        docs=tweets,
        topics=topics,
        timestamps=timestamps,
        global_tuning=True,
        evolution_tuning=True,
        nr_bins=20,
    )
    timeline_fig = topic_model.visualize_topics_over_time(
        topics_over_time,
        top_n_topics=20,
    )
    st.plotly_chart(timeline_fig)