# NOTE: "Spaces: Sleeping" banner text from the Hugging Face Spaces page was
# captured here during extraction; kept only as a comment so the file parses.
import os
import sys

import numpy as np
import pandas as pd
import streamlit as st

from company_bankruptcy.components.data_transformation import DataTransformation
from company_bankruptcy.components.model_trainer import ModelTrainer
from company_bankruptcy.exception.exception import CustomException
from company_bankruptcy.logger.logger import logging
from company_bankruptcy.utils.utils import load_object
def get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict,
             best_model_name=None):
    """Return the positive-class (default) probability for each row of input_df.

    Args:
        input_df: DataFrame containing at least the feature columns selected
            for the model(s) being used.
        trained_models_dict: mapping of model name -> fitted estimator; also
            holds the special key 'best_model_name' naming the model to use.
        feature_selection_dict: mapping of model name -> sequence whose
            element [1] is a dict with key 'selected_shap_feats' listing that
            model's selected feature columns.
        opt_dict: mapping of fold index -> dict with fitted base models
            ('rfm', 'xgbm', 'lrm', 'svcm') and a stacking optimizer ('opt');
            used only by the 'Optimized Ensemble' branch.
        best_model_name: optional override of the model to use.  Defaults to
            trained_models_dict['best_model_name'].  (The original read an
            undeclared global here; the default keeps behavior identical.)

    Returns:
        1-D numpy array of default probabilities, one per input row.
    """
    if best_model_name is None:
        best_model_name = trained_models_dict['best_model_name']
    if best_model_name == 'Average Ensemble':
        # Plain mean of every base model's positive-class probability.
        default_prob = 0
        for model_name in trained_models_dict:
            if model_name == 'best_model_name':
                # Skip the metadata entry; it is not an estimator.
                continue
            temp_features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
            temp_prob = trained_models_dict[model_name].predict_proba(input_df[temp_features_list])[:, 1]
            default_prob += temp_prob
        default_prob /= (len(trained_models_dict) - 1)
    elif best_model_name == 'Optimized Ensemble':
        # Per fold: stack the four base-model probabilities and let the fold's
        # fitted optimizer combine them; average the fold predictions.
        rfm_features_list = feature_selection_dict['RandomForestClassifier'][1]['selected_shap_feats']
        xgbm_features_list = feature_selection_dict['XGBClassifier'][1]['selected_shap_feats']
        lrm_features_list = feature_selection_dict['LogisticRegression'][1]['selected_shap_feats']
        svcm_features_list = feature_selection_dict['SVC'][1]['selected_shap_feats']
        preds_list = []
        for idx in opt_dict:
            opt = opt_dict[idx]['opt']
            rfm = opt_dict[idx]['rfm']
            xgbm = opt_dict[idx]['xgbm']
            lrm = opt_dict[idx]['lrm']
            svcm = opt_dict[idx]['svcm']
            rfm_probs = rfm.predict_proba(input_df[rfm_features_list])[:, 1]
            xgbm_probs = xgbm.predict_proba(input_df[xgbm_features_list])[:, 1]
            lrm_probs = lrm.predict_proba(input_df[lrm_features_list])[:, 1]
            svcm_probs = svcm.predict_proba(input_df[svcm_features_list])[:, 1]
            model_preds = np.column_stack([
                rfm_probs,
                xgbm_probs,
                lrm_probs,
                svcm_probs
            ])
            preds_list.append(opt.predict(model_preds))
        default_prob = np.mean(np.column_stack(preds_list), axis=1)
    elif best_model_name == 'Rank Ensemble':
        # Weight each model's probabilities by its rank (1 = worst CV score),
        # then normalize by the sum of ranks n*(n+1)/2.
        rank_ensemble_list = []
        prob_list = []
        model_names_list = []
        for model_name in trained_models_dict:
            if model_name == 'best_model_name':
                continue
            temp_features_list = feature_selection_dict[model_name][1]['selected_shap_feats']
            model_names_list.append(model_name)
            rank_ensemble_list.append((model_name, trained_models_dict[model_name].best_score_))
            prob_list.append(trained_models_dict[model_name].predict_proba(input_df[temp_features_list])[:, 1])
        rank_ensemble_list = sorted(rank_ensemble_list, key=lambda x: x[1])
        default_prob = 0
        for i in range(len(rank_ensemble_list)):
            default_prob += (i+1) * prob_list[model_names_list.index(rank_ensemble_list[i][0])]
        default_prob /= (len(rank_ensemble_list) * (1 + len(rank_ensemble_list)) / 2)
    else:
        # Single best model: predict with its own selected features only.
        model = trained_models_dict[best_model_name]
        temp_features_list = feature_selection_dict[best_model_name][1]['selected_shap_feats']
        default_prob = model.predict_proba(input_df[temp_features_list])[:, 1]
    return default_prob
# Configure the browser-tab title and a centered page layout.
# NOTE: Streamlit requires set_page_config to be the first st.* call.
st.set_page_config(
    page_title='Default Predictor',
    layout='centered'
)
try:
    st.title('Company Bankruptcy Predictor')

    # Load heavyweight artifacts (fitted models, feature-selection metadata,
    # example data) once per session and cache them in st.session_state so
    # widget-triggered reruns are fast.
    logging.info('Initiating dictionaries')
    if 'trained_models_dict' not in st.session_state:
        model_trainer_obj = ModelTrainer()
        trained_models_dict = load_object(
            os.path.join(
                model_trainer_obj.model_trainer_config.trained_models_path,
                'trained_models_dict.pkl'
            )
        )
        opt_dict = load_object(
            os.path.join(
                model_trainer_obj.model_trainer_config.trained_models_path,
                'opt_dict.pkl'
            )
        )
        data_transformation_obj = DataTransformation()
        feature_selection_dict = load_object(
            data_transformation_obj.data_transformation_config.feature_selection_dict_file_path
        )
        example_data = pd.read_excel('app_input_example.xlsx')
        st.session_state['trained_models_dict'] = trained_models_dict
        st.session_state['opt_dict'] = opt_dict
        st.session_state['feature_selection_dict'] = feature_selection_dict
        st.session_state['example_data'] = example_data
    else:
        trained_models_dict = st.session_state['trained_models_dict']
        opt_dict = st.session_state['opt_dict']
        feature_selection_dict = st.session_state['feature_selection_dict']
        example_data = st.session_state['example_data']
    logging.info('Dictionaries initiated')

    # 'clicked' tracks whether Submit was pressed on the previous rerun.
    logging.info('Checking button clicked')
    if 'clicked' not in st.session_state:
        st.session_state.clicked = False
    logging.info(f'Button check passed with value {st.session_state.clicked}')

    st.subheader('Please, fill in the input boxes or provide an csv/excel file and click on submit button to get the default probability(ies).')

    best_model_name = trained_models_dict['best_model_name']

    # Ensembles need the union of every base model's selected features;
    # a single model needs only its own.
    logging.info("Getting features' list")
    if best_model_name in ['Average Ensemble', 'Optimized Ensemble', 'Rank Ensemble']:
        features_list = []
        for model_name in feature_selection_dict:
            features_list.extend(
                feature_selection_dict[model_name][1]['selected_shap_feats']
            )
        features_list = list(set(features_list))
    else:
        features_list = feature_selection_dict[best_model_name][1]['selected_shap_feats']
    logging.info("Features' list found")

    # File-upload widget plus a downloadable example file with the expected columns.
    upload_container = st.container()
    with upload_container:
        upload_col1, upload_col2 = st.columns([0.6, 0.4])
        uploaded_file = upload_col1.file_uploader(
            'Upload a csv/excel file with data',
            type=["csv", "xlsx"]
        )
        csv_data = example_data[features_list].to_csv(index=False).encode("utf-8")
        upload_col2.write('An example of the data file')
        upload_col2.download_button(
            'Download',
            data=csv_data,
            file_name='input_example.csv',
            mime="text/csv"
        )

    def get_number_format(feature_name):
        # Features whose name ends in 'Flag' are binary indicators -> no decimals.
        return '%.0f' if feature_name.split(' ')[-1] == 'Flag' else '%.6f'

    # Manual-entry grid: n_cols widgets per row, ceiling division for row count.
    n_cols = 2
    n_rows = (len(features_list) + n_cols - 1) // n_cols

    logging.info('Constructing the app input structure')
    input_dict = {}
    feature_idx = 0
    for _ in range(n_rows):
        temp_input_container = st.container()
        with temp_input_container:
            col1, col2 = st.columns(n_cols)
            input_dict[features_list[feature_idx]] = [
                col1.number_input(
                    features_list[feature_idx],
                    format=get_number_format(features_list[feature_idx])
                )
            ]
            # FIX: the original rendered the second column only when the total
            # feature count was even (its `i <= n_rows - 1` test was always
            # true), so with an odd count every odd-indexed feature was
            # silently skipped while feature_idx still advanced by 2.
            # Guarding on the remaining-feature index renders every feature.
            if feature_idx + 1 < len(features_list):
                input_dict[features_list[feature_idx + 1]] = [
                    col2.number_input(
                        features_list[feature_idx + 1],
                        format=get_number_format(features_list[feature_idx + 1])
                    )
                ]
            feature_idx += n_cols
    logging.info('Input structure constructed')

    def set_button_click():
        # on_click callback: records the press so the next rerun computes.
        st.session_state.clicked = True

    st.button('Submit', on_click=set_button_click)

    if st.session_state.clicked and uploaded_file is None:
        # Manual-entry path: score the single row assembled from the inputs.
        st.session_state.clicked = False
        logging.info(f'Calculating prob for {best_model_name}')
        input_df = pd.DataFrame(input_dict)
        default_prob = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)
        st.write(f"Default probability: {default_prob[0]:.4f}")
        logging.info(f'Default prob: {default_prob[0]:.4f}')
    elif st.session_state.clicked and uploaded_file is not None:
        # Batch path: score every row of the uploaded csv/excel file.
        st.session_state.clicked = False
        logging.info('Loading uploaded data')
        file_extension = uploaded_file.name.split('.')[-1]
        if file_extension == 'csv':
            input_df = pd.read_csv(uploaded_file)
        else:
            input_df = pd.read_excel(uploaded_file)
        logging.info('Uploaded data loaded')
        with st.spinner('Please wait...'):
            logging.info(f'Calculating probabilies for {best_model_name}')
            default_prob = get_prob(input_df, trained_models_dict, feature_selection_dict, opt_dict)
            logging.info('Probabilities calculated')
            result_df = pd.DataFrame()
            result_df['default_probability'] = default_prob
            result_data = result_df.to_csv(index=False).encode("utf-8")
            st.success('Done!')
            st.download_button(
                'Download the predicted probabilities',
                data=result_data,
                file_name='default_probabilities.csv',
                mime='text/csv'
            )
except Exception as e:
    logging.info('Error occured while creating streamlit app')
    raise CustomException(e, sys)