Sanshruth committed on
Commit
83d8f3b
·
verified ·
1 Parent(s): 6f86a30

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +364 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,364 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import h2o
3
+ from h2o.automl import H2OAutoML
4
+ import pandas as pd
5
+ import os
6
+ import numpy as np
7
+ from sklearn.metrics import mean_squared_error
8
+ import matplotlib.pyplot as plt
9
+ import shutil
10
+ import zipfile
11
+ import io
12
+ import tempfile
13
+ import zipfile
14
+
15
# Configure the page (title and wide layout) before any other Streamlit call
# in this script is made.
st.set_page_config(page_title="AquaLearn", layout="wide")

# Initialize the H2O server. This executes at module level, i.e. every time
# the script is run.
h2o.init()
20
def rename_columns_alphabetically(df):
    """Return a copy of *df* whose columns are relabelled 'A', 'B', 'C', ...

    Labels are assigned by column position using consecutive code points
    starting at 'A'. The input frame is left untouched.
    """
    letter_for = {
        original: chr(ord('A') + position)
        for position, original in enumerate(df.columns)
    }
    return df.rename(columns=letter_for)
23
+
24
def sanitize_column_name(name):
    """Turn an arbitrary column label into an identifier-like name.

    Every character that is not alphanumeric is replaced with an underscore.
    If the result does not begin with a letter or an underscore (including
    the case where *name* is empty), the prefix ``'f_'`` is added.

    Args:
        name: The original column label.

    Returns:
        The sanitized name; never an empty string.
    """
    sanitized = ''.join(c if c.isalnum() else '_' for c in name)
    # Guard the empty case first: indexing sanitized[0] on '' would raise.
    if not sanitized or not (sanitized[0].isalpha() or sanitized[0] == '_'):
        sanitized = 'f_' + sanitized
    return sanitized
31
+
32
# Create a directory for saving models. exist_ok avoids the check-then-create
# race and makes repeated script runs a no-op.
os.makedirs("saved_models", exist_ok=True)
35
+
36
def load_data():
    """Show the app title and a CSV uploader; return the upload as an H2OFrame.

    Returns:
        An ``h2o.H2OFrame`` built from the uploaded CSV, or ``None`` while no
        file has been uploaded. A preview of the first rows is written to the
        page when a file is present.
    """
    st.title("Aqua Learn")
    uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
    if uploaded_file is None:
        return None

    csv_frame = pd.read_csv(uploaded_file)
    st.write(csv_frame.head())
    return h2o.H2OFrame(csv_frame)
44
+
45
def select_problem_type():
    """Render the problem-type dropdown and return the selected label."""
    problem_types = ['Classification', 'Regression']
    return st.selectbox("Select Problem Type:", problem_types)
47
+
48
def select_target_column(train_h2o):
    """Render a dropdown of the frame's columns and return the chosen one."""
    candidate_columns = train_h2o.columns
    return st.selectbox("Select Target Column:", candidate_columns)
50
+
51
def prepare_features(train_h2o, y, problem_type):
    """Prepare the frame for training and return coded feature/target names.

    The columns are renamed to 'A', 'B', 'C', ... with the features first (in
    their original relative order) and the target last. Previously the last
    column was assumed to be the target regardless of the user's selection;
    now the selected target is moved to the end before renaming, so the
    returned target name always refers to the chosen column.

    Args:
        train_h2o: Frame holding the training data.
        y: Name of the target column as selected by the user.
        problem_type: ``'Classification'`` converts the target to a factor;
            any other value leaves it unchanged.

    Returns:
        A tuple ``(x, y, frame)``: the coded feature names, the coded target
        name, and the renamed frame. When the target was already the last
        column the input frame is renamed in place and returned; otherwise a
        reordered frame is returned.

    Raises:
        ValueError: If *y* is not one of the frame's columns.
    """
    feature_columns = list(train_h2o.columns)
    feature_columns.remove(y)  # ValueError when y is not a column

    if problem_type == 'Classification':
        train_h2o[y] = train_h2o[y].asfactor()

    # Put the target last so that position-based coding below (and any code
    # that maps user-supplied feature names to A, B, C, ... in order) agrees
    # with the user's actual selection.
    ordered_columns = feature_columns + [y]
    if list(train_h2o.columns) != ordered_columns:
        train_h2o = train_h2o[ordered_columns]

    # Rename columns
    new_columns = [chr(65 + i) for i in range(len(ordered_columns))]
    train_h2o.columns = new_columns

    return new_columns[:-1], new_columns[-1], train_h2o
64
+
65
def select_algorithms():
    """Render the algorithm multi-select and return the chosen algorithm names."""
    return st.multiselect(
        "Select Algorithms:",
        ['DeepLearning', 'GLM', 'GBM', 'DRF', 'XGBoost'],
    )
68
+
69
def set_automl_parameters():
    """Render the AutoML limit inputs.

    Returns:
        A ``(max_models, max_runtime)`` tuple read from two numeric inputs
        (defaults 20 and 600, each with a minimum of 1).
    """
    model_limit = st.number_input("Max Models:", value=20, min_value=1)
    runtime_limit = st.number_input("Max Runtime (seconds):", value=600, min_value=1)
    return model_limit, runtime_limit
73
+
74
def run_automl(x, y, train, problem_type, selected_algos, max_models, max_runtime):
    """Train an H2O AutoML run and return the fitted ``H2OAutoML`` object.

    Args:
        x: Feature column names.
        y: Target column name.
        train: Training frame.
        problem_type: ``'Classification'`` or ``'Regression'``.
        selected_algos: Algorithm names to include in the run.
        max_models: Upper bound on the number of models to build.
        max_runtime: Upper bound on the run time, in seconds.

    Returns:
        The trained ``H2OAutoML`` instance.
    """
    # AUC is a binary-classification metric, so forcing it breaks leaderboard
    # ranking for multiclass targets. "AUTO" lets H2O choose the metric for
    # classification (per the H2O docs: AUC for binary targets,
    # mean_per_class_error for multinomial ones).
    sort_metric = "AUTO" if problem_type == 'Classification' else "RMSE"
    aml = H2OAutoML(
        max_models=max_models,
        seed=1,
        max_runtime_secs=max_runtime,
        sort_metric=sort_metric,
        include_algos=selected_algos,
    )
    aml.train(x=x, y=y, training_frame=train)
    return aml
82
+
83
def display_results(aml, test):
    """Write the AutoML leaderboard and the leader's performance on *test*."""
    st.subheader("AutoML Leaderboard")
    leaderboard_table = aml.leaderboard.as_data_frame()
    st.write(leaderboard_table)

    st.subheader("Best Model Performance")
    leader_performance = aml.leader.model_performance(test)
    st.write(leader_performance)
91
+
92
def save_and_evaluate_models(aml, test, y, problem_type):
    """Score every leaderboard model on *test* and chart the results.

    Nothing happens until the user presses the button. For classification the
    score is accuracy (share of matching predictions); otherwise it is RMSE.
    The scores are shown as a table and as a horizontal bar chart.

    NOTE: despite the button label, models are not written to disk here;
    ``st.session_state.saved_models`` is not populated by this function.

    Args:
        aml: Trained ``H2OAutoML`` object whose leaderboard is evaluated.
        test: Frame used for scoring.
        y: Name of the target column in *test*.
        problem_type: ``'Classification'`` selects accuracy; anything else
            selects RMSE.
    """
    if not st.button("Save Models and Calculate Performance"):
        return

    # The metric depends only on the problem type. Fixing it before the loop
    # also keeps it defined when the leaderboard has no rows.
    is_classification = problem_type == 'Classification'
    metric_name = 'accuracy' if is_classification else 'rmse'

    # Ground truth is the same for every model, so fetch it once.
    actual = test[y].as_data_frame().values.flatten()

    model_performances = []
    for model_id in aml.leaderboard['model_id'].as_data_frame().values.flatten():
        model = h2o.get_model(model_id)
        predicted = model.predict(test).as_data_frame()['predict'].values.flatten()

        if is_classification:
            performance = (actual == predicted).mean()
        else:
            performance = np.sqrt(mean_squared_error(actual, predicted))

        model_performances.append({'model_id': model_id, metric_name: performance})

    if not model_performances:
        st.warning("No models available to evaluate.")
        return

    performance_df = pd.DataFrame(model_performances)
    st.write(performance_df)

    # Create and display the bar plot
    st.subheader("Model Performance Visualization")
    fig, ax = plt.subplots(figsize=(10, 6))
    performance_df = performance_df.sort_values(by=metric_name, ascending=False)
    ax.barh(performance_df['model_id'], performance_df[metric_name], color='skyblue')
    ax.set_xlabel(metric_name.capitalize())
    ax.set_ylabel('Model ID')
    ax.set_title(f'Model {metric_name.capitalize()} from H2O AutoML')
    ax.grid(axis='x')
    st.pyplot(fig)
    # Figures created through pyplot stay registered until closed; release
    # this one so repeated runs do not accumulate open figures.
    plt.close(fig)
128
+
129
def download_model():
    """Offer one of the models listed in ``st.session_state.saved_models``.

    Each entry is read as ``(name, path)`` by index. A directory path is
    zipped in memory before being offered; a file path is offered as-is.
    When the list is missing or empty, an informational message is shown.
    """
    st.subheader("Download Model")

    has_saved_models = (
        'saved_models' in st.session_state and st.session_state.saved_models
    )
    if not has_saved_models:
        st.write("No models available for download. Please train and save models first.")
        return

    available_names = [entry[0] for entry in st.session_state.saved_models]
    model_to_download = st.selectbox("Select Model to Download:", available_names)
    if not st.button("Download Selected Model"):
        return

    model_path = next(
        entry[1]
        for entry in st.session_state.saved_models
        if entry[0] == model_to_download
    )

    if not os.path.isdir(model_path):
        # Already a single file: stream it directly.
        with open(model_path, "rb") as model_file:
            st.download_button(
                label="Click to Download",
                data=model_file,
                file_name=f"{model_to_download}.zip",
                mime="application/zip"
            )
        return

    # A directory: bundle its files into an in-memory zip archive, storing
    # each file under its path relative to the model directory.
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, 'w', zipfile.ZIP_DEFLATED) as bundle:
        for folder, _, filenames in os.walk(model_path):
            for filename in filenames:
                absolute = os.path.join(folder, filename)
                bundle.write(absolute, os.path.relpath(absolute, model_path))

    archive.seek(0)
    st.download_button(
        label="Click to Download",
        data=archive,
        file_name=f"{model_to_download}.zip",
        mime="application/zip"
    )
164
+
165
def further_training(aml, x, y, train, problem_type):
    """Retrain a leaderboard model for a chosen time and offer it as a zip.

    The user picks a model and a time budget. A stacked ensemble is not
    retrained directly; instead a fresh AutoML run with the same time budget
    is executed and its leader is used. Any other model is retrained via its
    own ``train`` method. The resulting model's performance on *train* is
    shown, then the model is saved, zipped in memory and offered for download.

    Args:
        aml: Trained ``H2OAutoML`` object whose leaderboard lists the models.
        x: Feature column names.
        y: Target column name.
        train: Frame used for retraining and for the reported performance.
        problem_type: ``'Classification'`` or ``'Regression'``.
    """
    st.subheader("Further Training")
    leaderboard_df = aml.leaderboard.as_data_frame()
    model_to_train = st.selectbox("Select Model for Training:", leaderboard_df['model_id'].tolist())
    training_time = st.number_input("Training Time (seconds):", value=60, min_value=1)

    if not st.button("Train Model"):
        return

    model = h2o.get_model(model_to_train)

    with st.spinner(f"Training model: {model_to_train} for {training_time} seconds..."):
        if isinstance(model, h2o.estimators.stackedensemble.H2OStackedEnsembleEstimator):
            # AUC is a binary-classification metric; "AUTO" lets H2O pick a
            # metric that fits the target (binary or multiclass).
            sort_metric = "AUTO" if problem_type == 'Classification' else "RMSE"
            retrain_aml = H2OAutoML(max_runtime_secs=training_time, seed=1, sort_metric=sort_metric)
            retrain_aml.train(x=x, y=y, training_frame=train)
            model = retrain_aml.leader
        else:
            model.train(x=x, y=y, training_frame=train, max_runtime_secs=training_time)

    perf = model.model_performance(train)
    st.write("Model performance after training:")
    st.write(perf)

    # Save the model into a scratch directory, zip it into memory, and always
    # remove the scratch directory, even if saving or zipping fails.
    temp_dir = os.path.join("saved_models", "temp")
    os.makedirs(temp_dir, exist_ok=True)
    zip_buffer = io.BytesIO()
    try:
        model_path = os.path.join(temp_dir, f"{model.model_id}")
        h2o.save_model(model=model, path=model_path, force=True)

        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for root, _, files in os.walk(model_path):
                for file in files:
                    full_path = os.path.join(root, file)
                    zip_file.write(full_path, os.path.relpath(full_path, model_path))
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)

    zip_buffer.seek(0)
    st.download_button(
        label="Download Retrained Model",
        data=zip_buffer,
        file_name=f"{model.model_id}.zip",
        mime="application/zip"
    )

    st.success(f"Retrained model ready for download: {model.model_id}")
212
+
213
def _load_uploaded_model(uploaded_zip, workdir):
    """Extract *uploaded_zip* into *workdir* and load the model file it holds.

    Returns the loaded model, or ``None`` after reporting the problem on the
    page (invalid archive, empty archive, no file inside, or load failure).
    """
    # Read the archive straight from the upload buffer instead of first
    # writing it into workdir, so workdir only ever contains the extracted
    # contents and the emptiness check below is meaningful.
    try:
        with zipfile.ZipFile(io.BytesIO(uploaded_zip.getbuffer())) as zip_ref:
            zip_ref.extractall(workdir)
    except zipfile.BadZipFile as e:
        st.error(f"Error loading the model: {str(e)}")
        st.error("Please ensure you're uploading a valid H2O model file.")
        return None

    if not os.listdir(workdir):
        st.error("The uploaded zip file is empty.")
        return None

    # Look for regular files at any depth; sorting makes the choice
    # deterministic when the archive holds more than one file.
    candidates = sorted(
        os.path.join(root, file_name)
        for root, _, file_names in os.walk(workdir)
        for file_name in file_names
    )
    if not candidates:
        st.error("No model file found in the uploaded zip.")
        return None

    # NOTE(review): this loads a model file supplied by the user. Confirm
    # that loading untrusted H2O model files is acceptable for this deployment.
    try:
        return h2o.load_model(candidates[0])
    except Exception as e:
        st.error(f"Error loading the model: {str(e)}")
        st.error("Please ensure you're uploading a valid H2O model file.")
        return None


def _predict_from_csv(model_for_prediction, feature_mapping, reverse_mapping):
    """Predict for every row of an uploaded CSV and offer the result as CSV."""
    uploaded_csv = st.file_uploader("Upload a CSV file for prediction", type="csv")
    if uploaded_csv is None:
        return

    prediction_data = pd.read_csv(uploaded_csv)

    # Rename columns to A, B, C, etc.
    prediction_data = prediction_data.rename(columns=feature_mapping)

    prediction_h2o = h2o.H2OFrame(prediction_data)
    try:
        predictions = model_for_prediction.predict(prediction_h2o)
        predictions_df = predictions.as_data_frame()

        # Combine original data with predictions
        result_df = pd.concat([prediction_data, predictions_df], axis=1)

        # Rename columns back to original names for display
        result_df = result_df.rename(columns=reverse_mapping)

        st.write("Predictions (showing first 10 rows):")
        st.write(result_df.head(10))

        # Option to download the full results
        csv = result_df.to_csv(index=False)
        st.download_button(
            label="Download full results as CSV",
            data=csv,
            file_name="predictions_results.csv",
            mime="text/csv"
        )
    except Exception as e:
        st.error(f"Error making predictions: {str(e)}")
        st.error("Please ensure your CSV file matches the model's expected input format.")


def _predict_single_entry(model_for_prediction, feature_mapping):
    """Collect one value per feature from text inputs and predict for that row."""
    sample_input = {}
    for original_name, coded_name in feature_mapping.items():
        value = st.text_input(f"Enter {original_name} ({coded_name}):")
        # Numeric-looking input is sent as a float; anything else as text.
        try:
            sample_input[coded_name] = [float(value)]
        except ValueError:
            sample_input[coded_name] = [value]

    if not st.button("Predict"):
        return

    sample_h2o = h2o.H2OFrame(sample_input)
    try:
        predictions = model_for_prediction.predict(sample_h2o)
        prediction_value = predictions['predict'][0,0]
        st.write(f"Predicted value: {prediction_value}")
    except Exception as e:
        st.error(f"Error making prediction: {str(e)}")
        st.error("Please ensure you've entered valid input values.")


def make_prediction():
    """Predict with a model uploaded as a zip archive.

    The user uploads a zip containing a saved model, lists the original
    feature names in order (they are mapped to 'A', 'B', 'C', ... by
    position), and then either uploads a CSV for batch prediction or enters
    a single row by hand. Problems are reported on the page rather than
    raised.
    """
    st.subheader("Make Prediction")

    uploaded_zip = st.file_uploader("Upload a zip file containing the model", type="zip")
    if uploaded_zip is None:
        st.write("Please upload a zip file containing the model to make predictions.")
        return

    with tempfile.TemporaryDirectory() as tmpdirname:
        model_for_prediction = _load_uploaded_model(uploaded_zip, tmpdirname)
        if model_for_prediction is None:
            return

        # Ask user to input feature names
        feature_names_input = st.text_input("Enter feature names, separated by commas:")
        original_feature_names = [name.strip() for name in feature_names_input.split(',') if name.strip()]

        if not original_feature_names:
            st.error("Please enter at least one feature name.")
            return

        # Create a mapping from original names to A, B, C, etc.
        feature_mapping = {name: chr(65 + i) for i, name in enumerate(original_feature_names)}
        reverse_mapping = {v: k for k, v in feature_mapping.items()}

        prediction_type = st.radio("Choose prediction type:", ["Upload CSV", "Single Entry"])

        if prediction_type == "Upload CSV":
            _predict_from_csv(model_for_prediction, feature_mapping, reverse_mapping)
        else:  # Single Entry
            _predict_single_entry(model_for_prediction, feature_mapping)
313
def main():
    """Drive the page: load data, prepare features, run AutoML, show results.

    State that must survive Streamlit reruns (prepared frame, feature/target
    names, the train/test split and the AutoML result) is kept in
    ``st.session_state``. The prediction section is rendered regardless of
    whether training data has been uploaded.
    """
    train_h2o = load_data()
    if train_h2o is not None:
        problem_type = select_problem_type()
        target_column = select_target_column(train_h2o)

        if st.button("Set Target and Continue"):
            x, target_column, train_h2o = prepare_features(train_h2o, target_column, problem_type)
            # Split once, with a fixed seed, and remember both halves. Doing
            # an unseeded split on every rerun gave further training a
            # different 'train' than the one the stored 'test' came from
            # and created new frames on each interaction.
            train, test = train_h2o.split_frame(ratios=[0.8], seed=1)
            st.session_state.features_prepared = True
            st.session_state.x = x
            st.session_state.target_column = target_column
            st.session_state.train_h2o = train_h2o
            st.session_state.problem_type = problem_type
            st.session_state.train = train
            st.session_state.test = test
            # Results from an earlier target/split no longer apply.
            if 'aml' in st.session_state:
                del st.session_state['aml']

        if 'features_prepared' in st.session_state and st.session_state.features_prepared:
            st.write(f"Target Column: {st.session_state.target_column}")
            st.write(f"Feature Columns: {st.session_state.x}")

            train = st.session_state.train
            test = st.session_state.test

            selected_algos = select_algorithms()
            max_models, max_runtime = set_automl_parameters()

            if st.button("Start AutoML"):
                if not selected_algos:
                    st.error("Please select at least one algorithm.")
                else:
                    with st.spinner("Running AutoML..."):
                        aml = run_automl(st.session_state.x, st.session_state.target_column, train,
                                         st.session_state.problem_type, selected_algos, max_models, max_runtime)

                    st.success("AutoML training completed.")
                    st.session_state.aml = aml
                    st.session_state.test = test

            if 'aml' in st.session_state:
                display_results(st.session_state.aml, st.session_state.test)
                save_and_evaluate_models(st.session_state.aml, st.session_state.test, st.session_state.target_column, st.session_state.problem_type)
                download_model()
                further_training(st.session_state.aml, st.session_state.x, st.session_state.target_column, train, st.session_state.problem_type)

    make_prediction()  # Call make_prediction without arguments
355
+
356
if __name__ == "__main__":
    # Seed the session-state keys the app reads before the first interaction.
    for state_key, initial_value in (('features_prepared', False), ('saved_models', [])):
        if state_key not in st.session_state:
            st.session_state[state_key] = initial_value
    main()

# Remove the saved-models directory once the script body has finished.
# NOTE(review): this statement runs on every execution of the script, not
# only at shutdown. Confirm that is intended.
shutil.rmtree("saved_models", ignore_errors=True)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
streamlit==1.39.0
h2o==3.46.0.5
pandas==2.2.2
matplotlib==3.7.1
numpy==1.26.4
# app.py imports sklearn.metrics.mean_squared_error
scikit-learn==1.5.2