Corey Morris
committed on
Commit
·
d506f10
1
Parent(s):
5b83d0b
WIP commit. Currently have nlargest error
Browse files
- app.py +32 -1
- result_data_processor.py +2 -0
app.py
CHANGED
@@ -107,7 +107,8 @@ def create_line_chart(df, model_names, metrics):
|
|
107 |
|
108 |
def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters']):
|
109 |
# Calculate the absolute differences for each task between the target model and the closest models
|
110 |
-
|
|
|
111 |
# Unstack the differences and sort by the largest absolute difference
|
112 |
top_differences = differences.unstack().nlargest(num_differences)
|
113 |
# Convert the top differences to a DataFrame for display
|
@@ -120,6 +121,36 @@ def find_top_differences_table(df, target_model, closest_models, num_differences
|
|
120 |
return top_differences_table, unique_top_differences_tasks
|
121 |
|
122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
data_provider = ResultDataProcessor()
|
124 |
|
125 |
# st.title('Model Evaluation Results including MMLU by task')
|
|
|
107 |
|
108 |
def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters']):
|
109 |
# Calculate the absolute differences for each task between the target model and the closest models
|
110 |
+
new_df = df.drop(columns=exclude_columns)
|
111 |
+
differences = new_df.loc[closest_models].sub(new_df.loc[target_model]).abs()
|
112 |
# Unstack the differences and sort by the largest absolute difference
|
113 |
top_differences = differences.unstack().nlargest(num_differences)
|
114 |
# Convert the top differences to a DataFrame for display
|
|
|
121 |
return top_differences_table, unique_top_differences_tasks
|
122 |
|
123 |
|
124 |
+
# def find_top_differences_table(df, target_model, closest_models, num_differences=10, exclude_columns=['Parameters', 'organization']):
|
125 |
+
# # Drop specified columns and create a new DataFrame
|
126 |
+
# new_df = df.drop(columns=exclude_columns)
|
127 |
+
|
128 |
+
# # Compute differences between target model and closest models, taking absolute values
|
129 |
+
# differences = new_df.loc[closest_models].sub(new_df.loc[target_model]).abs()
|
130 |
+
|
131 |
+
# # Unstack the differences
|
132 |
+
# unstacked_differences = differences.unstack()
|
133 |
+
|
134 |
+
# # Convert object types to numeric, ignoring errors to leave non-convertible elements as NaN
|
135 |
+
# unstacked_differences = pd.to_numeric(unstacked_differences, errors='coerce')
|
136 |
+
|
137 |
+
# # Find the top num_differences
|
138 |
+
# top_differences = unstacked_differences.nlargest(num_differences)
|
139 |
+
|
140 |
+
# # Convert the top differences to a DataFrame for display
|
141 |
+
# top_differences_table = pd.DataFrame({
|
142 |
+
# 'Task': [idx[0] for idx in top_differences.index],
|
143 |
+
# 'Difference': top_differences.values
|
144 |
+
# })
|
145 |
+
|
146 |
+
# # Ensure that only unique tasks are returned
|
147 |
+
# unique_top_differences_tasks = list(set(top_differences_table['Task'].tolist()))
|
148 |
+
|
149 |
+
# return top_differences_table, unique_top_differences_tasks
|
150 |
+
|
151 |
+
|
152 |
+
|
153 |
+
|
154 |
data_provider = ResultDataProcessor()
|
155 |
|
156 |
# st.title('Model Evaluation Results including MMLU by task')
|
result_data_processor.py
CHANGED
@@ -137,6 +137,8 @@ class ResultDataProcessor:
|
|
137 |
# remove extreme outliers from column harness|truthfulqa:mc1
|
138 |
data = self._remove_mc1_outliers(data)
|
139 |
|
|
|
|
|
140 |
return data
|
141 |
|
142 |
def rank_data(self):
|
|
|
137 |
# remove extreme outliers from column harness|truthfulqa:mc1
|
138 |
data = self._remove_mc1_outliers(data)
|
139 |
|
140 |
+
data = data.drop(columns=['organization'])
|
141 |
+
|
142 |
return data
|
143 |
|
144 |
def rank_data(self):
|