JamesBentley committed on
Commit
7c64532
·
verified ·
1 Parent(s): 102e55b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +324 -0
app.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import torch
5
+ from chronos import ChronosPipeline
6
+ import plotly.graph_objects as go
7
+ import plotly.express as px
8
+ import base64
9
+
10
@st.cache_resource
def load_pipeline():
    """Load the pretrained Chronos T5 Small pipeline.

    The function is decorated with ``st.cache_resource``, so Streamlit caches
    the returned pipeline object rather than rebuilding it on every call.

    Returns:
        The object returned by ``ChronosPipeline.from_pretrained`` for the
        ``amazon/chronos-t5-small`` checkpoint, loaded with automatic device
        placement and bfloat16 weights.
    """
    model_name = "amazon/chronos-t5-small"
    load_options = {
        "device_map": "auto",
        "torch_dtype": torch.bfloat16,
    }
    pipeline = ChronosPipeline.from_pretrained(model_name, **load_options)
    return pipeline
17
+
18
@st.cache_data
def preprocess_data(data, date_column, metric_column, date_format):
    """Turn an uploaded table into a date-indexed float series.

    Args:
        data: DataFrame holding at least ``date_column`` and ``metric_column``.
        date_column: Name of the column containing the dates.
        metric_column: Name of the column containing the values to forecast.
        date_format: ``"day-month-year"`` or ``"month-day-year"``; selects the
            ``dayfirst`` flag passed to ``pd.to_datetime``. Any other value
            leaves the date column unparsed.

    Returns:
        A Series of ``metric_column`` cast to float and indexed by
        ``date_column``.

    Raises:
        KeyError: If either column is missing from ``data``.
        ValueError: If the metric values cannot be converted to float or the
            dates cannot be parsed.
    """
    # Work on a copy so the caller's DataFrame is never modified. The original
    # wrote the parsed dates back into ``data``; under ``st.cache_data`` that
    # side effect would only happen on a cache miss, making it unpredictable.
    prepared = data.copy()

    if date_format == "day-month-year":
        prepared[date_column] = pd.to_datetime(prepared[date_column], dayfirst=True)
    elif date_format == "month-day-year":
        prepared[date_column] = pd.to_datetime(prepared[date_column], dayfirst=False)

    time_series_data = prepared.set_index(date_column)[metric_column].astype(float)
    return time_series_data
27
+
28
def make_forecast(time_series_data, prediction_length, interval):
    """Forecast future values of a series with the Chronos pipeline.

    Args:
        time_series_data: Series of historical values. Its last index label
            is the anchor from which forecast dates are generated, so it is
            presumably expected to be in chronological order (verify against
            the caller).
        prediction_length: Number of future points to predict.
        interval: Spacing between consecutive forecast dates, in days.

    Returns:
        DataFrame with columns ``"Date"``, ``"Low"``, ``"Median"`` and
        ``"High"``. The three value columns are the 0.1 / 0.5 / 0.9 quantiles
        of the first forecast returned by the pipeline, taken along axis 0
        (assumed to be the sample axis -- TODO confirm against the Chronos API).
    """
    model = load_pipeline()
    history = torch.tensor(time_series_data.values)
    samples = model.predict(history, prediction_length)[0].numpy()

    quantile_levels = [0.1, 0.5, 0.9]
    low, median, high = np.quantile(samples, quantile_levels, axis=0)

    # Forecast dates start one interval after the last observed date.
    step = pd.Timedelta(days=interval)
    first_forecast_date = time_series_data.index[-1] + step
    forecast_dates = pd.date_range(
        start=first_forecast_date,
        periods=prediction_length,
        freq=f'{interval}D',
    )

    columns = {
        "Date": forecast_dates,
        "Low": low,
        "Median": median,
        "High": high,
    }
    forecast_df = pd.DataFrame(columns)

    # Keep 'Date' as an ordinary column with a plain positional index.
    forecast_df.reset_index(drop=True, inplace=True)
    return forecast_df
47
+
48
def get_csv_download_link(df, filename):
    """Return an HTML anchor that downloads ``df`` as a CSV file.

    The CSV text (including the index) is base64-encoded into a ``data:`` URI,
    so no file is written to disk.

    Args:
        df: DataFrame (or any object with a compatible ``to_csv``) to export.
        filename: Name offered to the browser for the download; also used as
            the visible link text.

    Returns:
        An ``<a>`` element as a string, intended to be rendered with
        ``st.markdown(..., unsafe_allow_html=True)``.
    """
    # Local import: ``html`` is not among the file-level imports.
    import html

    csv_text = df.to_csv(index=True)
    b64 = base64.b64encode(csv_text.encode()).decode()

    # The result is rendered as raw HTML, so the filename must be escaped
    # before it is placed in an attribute value and in the link text.
    safe_name = html.escape(str(filename), quote=True)

    # ``text/csv`` is the registered media type for CSV; ``file/csv`` is not.
    href = f'<a href="data:text/csv;base64,{b64}" download="{safe_name}">Download {safe_name}</a>'
    return href
53
+
54
def visualize_initial_forecast(forecast_df, time_series_data):
    """Build the history-plus-forecast chart and emit CSV download links.

    Args:
        forecast_df: DataFrame with ``'Date'``, ``'Low'``, ``'Median'`` and
            ``'High'`` columns.
        time_series_data: Series of historical values, plotted against its
            index.

    Returns:
        The Plotly figure. It is not rendered here; the caller decides where
        to display it. As a side effect, download links for the historical
        and forecast data are written to the Streamlit page.
    """
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=time_series_data.index,
        y=time_series_data,
        mode='lines',
        name='Historical Data',
        line=dict(color='blue'),
    ))

    # One trace per forecast band, added in Low / Median / High order.
    forecast_styles = (('Low', 'red'), ('Median', 'green'), ('High', 'orange'))
    for band, colour in forecast_styles:
        fig.add_trace(go.Scatter(
            x=forecast_df['Date'],
            y=forecast_df[band],
            mode='lines+markers',
            name=f'{band} Forecast',
            line=dict(color=colour),
        ))

    layout_options = dict(
        title="Chronos Forecast - click and drag the crosshairs to select an area to zoom in on",
        xaxis_title="Date",
        yaxis_title="Metric Value",
        legend_title="Legend",
        font=dict(size=12),
        xaxis=dict(rangeslider=dict(visible=False), type="date"),
    )
    fig.update_layout(**layout_options)

    # Download links for the raw data behind the chart.
    downloads = (
        (time_series_data.reset_index(), "historical_data.csv"),
        (forecast_df, "forecast_data.csv"),
    )
    for table, filename in downloads:
        st.markdown(get_csv_download_link(table, filename), unsafe_allow_html=True)

    return fig
86
+
87
def visualize_forecast(forecast_df, time_series_data, ground_truth_df=None, ground_truth_date_format=None, ground_truth_date_col=None, ground_truth_metric_col=None):
    """Render the forecast, optionally compared against actual data.

    Draws a line chart of the Low / Median / High forecasts (plus the actual
    values when provided), offers the merged table as a CSV download and,
    when actuals are available, shows a bar chart and text summary of the
    total percentage variance of each forecast band.

    Args:
        forecast_df: DataFrame with ``'Low'``, ``'Median'`` and ``'High'``
            columns and either a ``'Date'`` column or a date-like index.
        time_series_data: Historical series. Not used by this function; kept
            so existing callers keep working.
        ground_truth_df: Optional DataFrame of actual values.
        ground_truth_date_format: ``"day-month-year"`` or
            ``"month-day-year"``; selects ``dayfirst`` when parsing the
            ground-truth dates. Any other value leaves them unparsed.
        ground_truth_date_col: Name of the date column in ``ground_truth_df``.
        ground_truth_metric_col: Name of the value column in
            ``ground_truth_df``; it is renamed to ``'Actual'``.

    Returns:
        None. All output is written to the Streamlit page. Any exception is
        caught and reported on the page instead of propagating.
    """
    try:
        # Work on copies: the caller passes the DataFrame stored in
        # st.session_state, and re-indexing it in place would silently change
        # that stored object for the rest of the session.
        forecast_df = forecast_df.copy()

        # Ensure the forecast has a datetime index.
        if 'Date' in forecast_df.columns:
            forecast_df['Date'] = pd.to_datetime(forecast_df['Date'])
            forecast_df = forecast_df.set_index('Date')
        elif not isinstance(forecast_df.index, pd.DatetimeIndex):
            forecast_df.index = pd.to_datetime(forecast_df.index)

        # Prepare ground truth data if available and merge it with the forecast.
        if ground_truth_df is not None:
            ground_truth_df = ground_truth_df.copy()
            if ground_truth_date_format == "day-month-year":
                ground_truth_df[ground_truth_date_col] = pd.to_datetime(ground_truth_df[ground_truth_date_col], dayfirst=True)
            elif ground_truth_date_format == "month-day-year":
                ground_truth_df[ground_truth_date_col] = pd.to_datetime(ground_truth_df[ground_truth_date_col], dayfirst=False)
            ground_truth_df = ground_truth_df.set_index(ground_truth_date_col)
            ground_truth_df = ground_truth_df.rename(columns={ground_truth_metric_col: 'Actual'})

            # Outer join keeps dates that appear in only one of the two tables.
            merged_df = forecast_df.join(ground_truth_df['Actual'], how='outer')
        else:
            merged_df = forecast_df

        # Sort the index to ensure correct plotting.
        merged_df = merged_df.sort_index()
        has_actual = 'Actual' in merged_df.columns
        bands = ('Low', 'Median', 'High')

        # Per-date absolute and percentage variances against the actual values.
        if has_actual:
            for band in bands:
                merged_df[f'{band} Variance'] = merged_df[band] - merged_df['Actual']
            for band in bands:
                merged_df[f'{band} % Variance'] = (merged_df[band] - merged_df['Actual']) / merged_df['Actual'] * 100

        # Determine the maximum value for the y-axis scale.
        value_columns = list(bands) + (['Actual'] if has_actual else [])
        max_value = merged_df[value_columns].max().max()

        # Plot the trendlines using Plotly.
        fig = go.Figure()

        if has_actual:
            fig.add_trace(go.Scatter(x=merged_df.index, y=merged_df['Actual'],
                                     mode='lines+markers', name='Actual',
                                     line=dict(color='black', dash='dot')))

        for band, colour in zip(bands, ('red', 'green', 'blue')):
            fig.add_trace(go.Scatter(x=merged_df.index, y=merged_df[band],
                                     mode='lines+markers', name=f'{band} Forecast',
                                     line=dict(color=colour)))

        fig.update_layout(
            title="Actual vs Forecast - click and drag the crosshairs to select an area to zoom in on",
            xaxis_title="Date",
            yaxis_title="Metric Value",
            legend_title="Legend",
            font=dict(size=12),
            xaxis=dict(
                rangeslider=dict(visible=False),
                type="date"
            ),
            yaxis=dict(range=[0, max_value * 1.1])  # Upper bound padded by 10%
        )

        st.plotly_chart(fig)

        # Prepare CSV for download: round floats, then blank out inf / NaN
        # (inf arises from the percentage variance when an actual value is 0).
        csv_df = merged_df.round(2)
        csv_df = csv_df.replace([np.inf, -np.inf], np.nan).fillna('')

        st.markdown(get_csv_download_link(csv_df, "forecast_vs_actual.csv"), unsafe_allow_html=True)

        # Calculate and display total variances if ground truth is available.
        if has_actual:
            # Only rows where both the forecast and the actual value exist.
            forecast_period = merged_df.dropna(subset=['Low', 'Median', 'High', 'Actual'])

            totals = forecast_period[["Low", "Median", "High", "Actual"]].sum()
            actual_total = totals["Actual"]

            # Fractions (0.05 == 5%); zero when there is nothing to divide by.
            total_low_variance = (totals["Low"] - actual_total) / actual_total if actual_total != 0 else 0
            total_median_variance = (totals["Median"] - actual_total) / actual_total if actual_total != 0 else 0
            total_high_variance = (totals["High"] - actual_total) / actual_total if actual_total != 0 else 0

            bar_df = pd.DataFrame({
                'Metric': ['Low Variance', 'Median Variance', 'High Variance'],
                'Value': [total_low_variance * 100, total_median_variance * 100, total_high_variance * 100]
            })

            bar_fig = px.bar(bar_df, x='Metric', y='Value', title='Percentage Variances', labels={'Value': 'Percentage (%)'})
            st.plotly_chart(bar_fig)

            st.markdown(get_csv_download_link(bar_df, "variance_data.csv"), unsafe_allow_html=True)

            # The totals are fractions, so use the percent format type, which
            # scales by 100 itself. Formatting them with ".2f" and a literal
            # "%" under-reported every figure by a factor of 100 and
            # disagreed with the bar chart above.
            st.write(f"Total Low Variance: {total_low_variance:.2%}")
            st.write(f"Total Median Variance: {total_median_variance:.2%}")
            st.write(f"Total High Variance: {total_high_variance:.2%}")

    except Exception as e:
        # UI boundary: report the failure on the page rather than crashing the app.
        st.error(f"An error occurred during visualization: {str(e)}")
        st.write("Debug: Exception details")
        st.write(e)
199
+
200
# Markdown shown on the "User Guide" tab.
_USER_GUIDE_MARKDOWN = """
This is a demo HuggingFace app which gives you everything you need to test Amazon Chronos T5 Small using a demo ecommerce sales dataset.

As per the Hugging Face description:

'Chronos is a family of pretrained time series forecasting models based on language model architectures. Chronos models have been trained on a large corpus of publicly available time series data, as well as synthetic data generated using Gaussian processes.
For more info see:
- [Hugging Face Chronos T5 Small](https://huggingface.co/amazon/chronos-t5-small)
- [GitHub: Chronos Forecasting](https://github.com/amazon-science/chronos-forecasting)

Please Share, Cite and Connect with Me:

If you liked or found this notebook at all helpful please share it, and simply cite me as the original source... feel free to connect with me on LinkedIn here:
- [LinkedIn: James Bentley](https://www.linkedin.com/in/james-bentley-1b329214/)

Youtube Video Walkthrough of a Google Colab Notebook I built previously - which I based this app on:
- [Watch here](https://www.youtube.com/watch?v=jyrOmIiI2Bc&t=103s)

Disclaimer: This is purely for educational purposes.

**Upload Your CSV File From Your Computer:**
It should contain two columns, the first column should contain your dates, and the second should contain the metric you would like to predict, as pictured below.

You can download a copies of the csv files I use for this test here (be sure to save them as csv):
- [Sales.csv](https://docs.google.com/spreadsheets/d/1_tyquxKwYRWFyp0r8tMvpWoAIqJmS8fEG0wsxFT58B0/edit?usp=sharing)
- [Actual.csv](https://docs.google.com/spreadsheets/d/1yjebWmbmY-rAyB_TDXAye8i-yoiqKA2dW_SHmtL2ihM/edit?usp=sharing)

**Confirm Your Column Names:**
Now we just need to confirm which column contains your dates and which contains your metric that you want to forecast, this is just so we can properly handle it based on whatever you have named them.

**Generate Forecast and CSV File:**
To run your forecast you will need to confirm two settings,

- The forecast length, so this is the number of timepoints you want to run the forecast for, so for example if you wanted to run a 31 day forecast for a month, then you would select 31, if you wanted to run only 7 days next week, then you would select seven, or if you wanted to run 12 months, with one forecast for each month, you would select 12. The current default is set at 12 (to work with the demo). If you plan to assess forecast accuracy against some test data, then you should make sure that this number matches the number of date ranges you want to test against where you have data.

- The Interval Period, so this means how many days should be between each forecasted period, so if you wanted to run the forecast for consecutive days then you would select 1, if you wanted to run the forecast for each week, then you would select 7.

**Check the Accuracy of Your Forecast Against Actual Data:**

If you want to check the accuracy of the forecaster against some real data, which you didn't include in the original csv, then you can do that by uploading an actual.csv file (or whatever you choose to name it).

This file should contain the actual data for the dates you ran the forecast for.

This should be a two column file with a date range in the first column, and the metric in the second column, and by comparing this to the forecast you'll be able to see what kind of accuracy it outputs.

Below is the file I use in my demo

- [Actual.csv](https://docs.google.com/spreadsheets/d/1yjebWmbmY-rAyB_TDXAye8i-yoiqKA2dW_SHmtL2ihM/edit?usp=sharing)


**Select the Actual.csv File and Confirm The Column Names:**
Now you just need to confirm the column names that need to be used.

**Generate Actual vs Forecast Trendline Chart and CSV:**
Now that you have setup your actual file you can generate a trendline chart to show how the forecasts tracked vs your actual data for the forecasted date range.

A csv file is also available to download which shows the combined original data, forecasts and actuals with % variances.
"""


def _default_metric_index(columns):
    """Return the option index to preselect in a metric-column selectbox.

    The user guide asks for dates in the first column and the metric in the
    second, so the second column is preselected whenever one exists.
    """
    return 1 if len(columns) > 1 else 0


def _render_forecast_tab():
    """Render the upload / configure / forecast workflow of the first tab."""
    uploaded_file = st.file_uploader("Upload CSV file with historical data", type=["csv"])
    if uploaded_file is None:
        return

    data = pd.read_csv(uploaded_file)
    st.write("File uploaded successfully")
    st.subheader("Uploaded Data")
    st.write(data)

    date_column = st.selectbox("Select the Date column", data.columns)
    # Without an explicit index both selectboxes default to the first column,
    # so the untouched defaults tried to forecast the date column itself.
    metric_column = st.selectbox("Select the Metric column", data.columns,
                                 index=_default_metric_index(data.columns))
    date_format = st.radio("Select the date format of the Date column", ("day-month-year", "month-day-year"))

    prediction_length = st.number_input("Enter the prediction length", min_value=1, value=12)
    interval = st.number_input("Enter the interval in days", min_value=1, value=7)

    if not st.button("Make Forecast"):
        return

    time_series_data = preprocess_data(data, date_column, metric_column, date_format)
    forecast_df = make_forecast(time_series_data, prediction_length, interval)

    # Stored in session state so the comparison tab can reuse the results.
    st.session_state.forecast_df = forecast_df
    st.session_state.time_series_data = time_series_data

    st.subheader("Forecast Visualization")
    st.write("Forecasted Values:")
    st.write(forecast_df)

    initial_forecast_fig = visualize_initial_forecast(forecast_df, time_series_data)
    st.session_state.initial_forecast_fig = initial_forecast_fig
    st.plotly_chart(initial_forecast_fig)


def _render_comparison_tab():
    """Render the forecast-versus-actual workflow of the second tab."""
    st.subheader("Compare Forecast to Actual Data")

    if 'forecast_df' not in st.session_state or 'time_series_data' not in st.session_state:
        # The message names the tab exactly as it is labelled in st.tabs.
        st.warning("Please make a forecast in the 'Run a Forecast' tab first.")
        return

    ground_truth_file = st.file_uploader("Upload CSV file with your actual 'ground truth' data to see how accurate the forecast is", type=["csv"], key="ground_truth_file")
    if ground_truth_file is None:
        return

    ground_truth_df = pd.read_csv(ground_truth_file)
    st.write("Actual data file uploaded successfully")
    st.subheader("Actual Data")
    st.write(ground_truth_df)

    ground_truth_date_col = st.selectbox("Select the Date column for actual data", ground_truth_df.columns, key="gt_date_col")
    ground_truth_metric_col = st.selectbox("Select the Metric column for actual data", ground_truth_df.columns,
                                           index=_default_metric_index(ground_truth_df.columns),
                                           key="gt_metric_col")
    ground_truth_date_format = st.radio("Select the date format for actual data", ("day-month-year", "month-day-year"), key="gt_date_format")

    if not st.button("Compare Forecast to Actual Data"):
        return

    st.subheader("Comparison with Actual Data")
    if 'initial_forecast_fig' in st.session_state:
        st.subheader("Chronos Forecast")
        st.plotly_chart(st.session_state.initial_forecast_fig)

    st.subheader("Forecast vs Actual Data")
    visualize_forecast(st.session_state.forecast_df, st.session_state.time_series_data,
                       ground_truth_df, ground_truth_date_format, ground_truth_date_col, ground_truth_metric_col)


def main():
    """Build the Streamlit page: forecast tab, comparison tab and user guide."""
    st.title("Amazon Chronos Forecasting App")

    tab1, tab2, tab3 = st.tabs(["Run a Forecast", "Compare to Actual", "User Guide"])

    with tab1:
        _render_forecast_tab()

    with tab2:
        _render_comparison_tab()

    with tab3:
        st.subheader("User Guide")
        st.write(_USER_GUIDE_MARKDOWN)
322
+
323
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()