Spaces:
Running
Running
import glob | |
import json | |
import os | |
import shutil | |
import sys | |
import urllib | |
from collections import defaultdict | |
from datetime import datetime | |
from statistics import mean | |
import pandas as pd | |
import requests | |
from constants import BASE_WHISPERKIT_BENCHMARK_URL | |
from text_normalizer import text_normalizer | |
from utils import compute_average_wer, download_dataset | |
def fetch_evaluation_data(url, timeout=30):
    """
    Fetches evaluation data from the given URL.

    :param url: The URL to fetch the evaluation data from.
    :param timeout: Seconds to wait for the server before giving up.
    :returns: The decoded JSON payload of the response.
    :raises: sys.exit if the request fails (connection error, timeout or
        any status code other than 200)
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Keep the documented contract: every failure ends in sys.exit.
        sys.exit(f"Failed to fetch WhisperKit evals: {error}")

    if response.status_code != 200:
        sys.exit(f"Failed to fetch WhisperKit evals: {response.text}")
    return json.loads(response.text)
def generate_device_map(base_dir):
    """
    Builds and persists a lookup from device identifier to device model.

    Every JSON file under ``base_dir`` (searched recursively) whose name
    contains "summary" is read. Files that carry both a "deviceIdentifier"
    and a "deviceModel" entry contribute one mapping; unreadable files are
    reported on stdout and skipped. The resulting mapping is written to
    "dashboard_data/device_map.json".

    :param base_dir: The base directory to search for summary files.
    :returns: A dictionary mapping device identifiers to device models.
    """
    search_pattern = f"{base_dir}/**/*summary*.json"
    identifier_to_model = {}

    for summary_path in glob.glob(search_pattern, recursive=True):
        try:
            with open(summary_path, "r") as handle:
                summary = json.load(handle)
            has_identifier = "deviceIdentifier" in summary
            if "deviceModel" in summary and has_identifier:
                identifier_to_model[summary["deviceIdentifier"]] = summary["deviceModel"]
        except json.JSONDecodeError:
            print(f"Error reading {summary_path}")
        except Exception as error:
            print(f"Error processing {summary_path}: {error}")

    # Persist the mapping so get_device_name can read it back later.
    with open("dashboard_data/device_map.json", "w") as handle:
        json.dump(identifier_to_model, handle, indent=4, sort_keys=True)

    return identifier_to_model
def get_device_name(device):
    """
    Resolves a device identifier through "dashboard_data/device_map.json".

    :param device: String identifying the device (a key of the device map).
    :returns: The mapped device name if the identifier is known, otherwise the
        input string itself; in both cases spaces are replaced by underscores.
    """
    with open("dashboard_data/device_map.json", "r") as handle:
        known_devices = json.load(handle)

    resolved = known_devices[device] if device in known_devices else device
    return resolved.replace(" ", "_")
def process_benchmark_file(file_path, dataset_dfs, results, releases):
    """
    Processes a single benchmark file and updates the results dictionary.

    This function reads a benchmark JSON file, extracts relevant information,
    and updates the results dictionary with various metrics including WER,
    speed, tokens per second, and quality of inference (QoI).

    :param file_path: Path to the benchmark JSON file. The name of its parent
        directory must have the form ``<commit timestamp>_<commit hash>``.
    :param dataset_dfs: Dictionary of DataFrames containing dataset information,
        keyed by dataset name. The "file" and "wer" columns are read.
    :param results: Dictionary to store the processed results, keyed by
        (model, device, os_info, commit_timestamp). Missing keys must be created
        on access (e.g. a defaultdict), since entries are only ever indexed.
    :param releases: Collection of release commit hashes; files belonging to
        any other commit are ignored.
    """
    with open(file_path, "r") as file:
        test_results = json.load(file)
    if not test_results:
        return

    # os.path keeps this working whatever separator the caller's paths use.
    run_directory = os.path.basename(os.path.dirname(file_path))
    commit_timestamp, commit_hash = run_directory.split("_")
    if commit_hash not in releases:
        return

    # Model, device, dataset and OS are taken from the first test only.
    first_test_result = test_results[0]
    first_info = first_test_result["testInfo"]
    model = first_info["model"]
    device = first_info["device"]
    dataset_name = first_info["datasetDir"]
    os_version = first_test_result["staticAttributes"]["osVersion"]

    if "iPhone" in device or "iPad" in device:
        version_numbers = os_version.split(".")
        # Drop a trailing ".0" patch component (e.g. 18.1.0 -> 18.1).
        if len(version_numbers) == 3 and version_numbers[-1] == "0":
            version_numbers.pop()
        platform = "iOS" if "iPhone" in device else "iPadOS"
        os_info = f"{platform}_{'.'.join(version_numbers)}"
    else:
        os_info = f"macOS_{os_version}"

    timestamp = first_info["date"]
    key = (model, device, os_info, commit_timestamp)

    # Everything below is identical for every test in the file, so resolve it once.
    dataset_df = dataset_dfs[dataset_name]
    entry = results[key]
    entry["timestamp"] = timestamp
    entry["commit_hash"] = commit_hash
    entry["commit_timestamp"] = commit_timestamp
    overall_speed = entry["speed"]
    overall_tps = entry["tokens_per_second"]
    dataset_speed = entry["dataset_speed"][dataset_name]
    dataset_tps = entry["dataset_tokens_per_second"][dataset_name]

    for test_result in test_results:
        test_info = test_result["testInfo"]
        timings = test_info["timings"]

        entry["average_wer"].append(
            {
                "prediction": text_normalizer(test_info["prediction"]),
                "reference": text_normalizer(test_info["reference"]),
            }
        )

        input_audio_seconds = timings["inputAudioSeconds"]
        full_pipeline = timings["fullPipeline"]
        total_decoding_loops = timings["totalDecodingLoops"]

        dataset_speed["inputAudioSeconds"] += input_audio_seconds
        dataset_speed["fullPipeline"] += full_pipeline
        overall_speed["inputAudioSeconds"] += input_audio_seconds
        overall_speed["fullPipeline"] += full_pipeline

        dataset_tps["totalDecodingLoops"] += total_decoding_loops
        dataset_tps["fullPipeline"] += full_pipeline
        overall_tps["totalDecodingLoops"] += total_decoding_loops
        overall_tps["fullPipeline"] += full_pipeline

        audio = test_info["audioFile"].split(".")[0]
        if dataset_name == "earnings22-10mins":
            # Only the part before the first "-" is used to find the reference row.
            audio = audio.split("-")[0]

        # regex=False: the file stem is a literal substring, not a pattern.
        matching_rows = dataset_df.loc[
            dataset_df["file"].str.contains(audio, regex=False)
        ]
        reference_wer = matching_rows.iloc[0]["wer"]
        prediction_wer = test_info["wer"]
        # QoI counts a test as good when it is no worse than the reference WER.
        entry["qoi"].append(1 if prediction_wer <= reference_wer else 0)
def process_summary_file(file_path, results, releases):
    """
    Processes a summary file and updates the results dictionary with device support information.

    This function reads a summary JSON file, extracts information about supported
    and failed models for a specific device and OS combination, and updates the
    results dictionary accordingly. It creates separate entries for each release.
    When an entry for the same (device, OS, commit) already exists, it is only
    replaced if the new file is not older than the stored one.

    :param file_path: Path to the summary JSON file. The test timestamp is taken
        from the part of the file name after the last "_".
    :param results: Dictionary to store the processed results. Must already
        contain the set-valued keys "modelsTested" and "devices".
    :param releases: Set of release commit hashes to process.
    """
    timestamp_format = "%Y-%m-%dT%H%M%S"

    with open(file_path, "r") as file:
        summary_data = json.load(file)
    if summary_data["commitHash"] not in releases:
        return

    device = summary_data["deviceIdentifier"]
    # Named os_label (not "os") so the imported os module stays usable here.
    os_type = "iPadOS" if "iPad" in device else summary_data["osType"]
    os_label = f"{os_type} {summary_data['osVersion']}"
    commit_hash = summary_data["commitHash"]
    commit_timestamp = summary_data["commitTimestamp"]
    test_file_name = os.path.basename(file_path)
    test_timestamp = test_file_name.split("_")[-1].replace(".json", "")

    key = (device, os_label, commit_hash)
    if key in results:
        existing = results[key]
        existing_commit_dt = datetime.strptime(existing["commitTimestamp"], timestamp_format)
        existing_test_dt = datetime.strptime(existing["testTimestamp"], timestamp_format)
        new_commit_dt = datetime.strptime(commit_timestamp, timestamp_format)
        new_test_dt = datetime.strptime(test_timestamp, timestamp_format)
        # Keep the stored entry when this file is older in either respect.
        if new_test_dt < existing_test_dt or new_commit_dt < existing_commit_dt:
            return
    else:
        results[key] = {}

    test_results = summary_data["testResults"]
    failure_info = summary_data["failureInfo"]
    supported_models = set(summary_data["modelsTested"])
    failed_models = set()

    # The number of results of the first model without failures is taken as the
    # expected per-model count; 2 is the fallback when every model failed.
    dataset_count = 2
    for model, value in test_results.items():
        if model not in failure_info:
            dataset_count = len(value)
            break

    for failed_model in failure_info:
        # A model listed as failed that still produced the expected number of
        # results stays in the supported set.
        if (
            failed_model in test_results
            and len(test_results[failed_model]) == dataset_count
        ):
            continue
        supported_models.discard(failed_model)
        failed_models.add(failed_model)

    results[key]["supportedModels"] = supported_models
    results[key]["commitHash"] = commit_hash
    results[key]["commitTimestamp"] = commit_timestamp
    results[key]["testTimestamp"] = test_timestamp
    # The file path is stored alongside so a link to the summary can be built later.
    results[key]["failedModels"] = (failed_models, file_path)
    results["modelsTested"] |= supported_models
    results["devices"].add(device)
def calculate_and_save_performance_results(
    performance_results, performance_output_path
):
    """
    Turns the accumulated raw measurements into final metrics and writes them out.

    One JSON object per line is written to ``performance_output_path``, each
    describing one (model, device, OS, commit timestamp) combination. Entries
    whose overall speed (input audio seconds divided by full pipeline time,
    rounded to two decimals) is below 1.0 are not written; they are collected
    and returned instead.

    :param performance_results: Dictionary containing raw performance data.
    :param performance_output_path: Path to save the processed performance results.
    :returns: List of (model, device, os_info) tuples that were skipped because
        their speed was below 1.0.
    """

    def ratio(numerator, denominator):
        # All reported rates are rounded to two decimals.
        return round(numerator / denominator, 2)

    too_slow = []
    with open(performance_output_path, "w") as output:
        for (model, device, os_info, _), data in performance_results.items():
            totals = data["speed"]
            speed = ratio(totals["inputAudioSeconds"], totals["fullPipeline"])
            if speed < 1.0:
                too_slow.append((model, device, os_info))
                continue

            # Keys are added in the order they appear in the output line.
            record = {}
            record["model"] = model.replace("_", "/")
            record["device"] = get_device_name(device).replace("_", " ")
            record["os"] = os_info.replace("_", " ")
            record["timestamp"] = data["timestamp"]
            record["speed"] = speed
            record["tokens_per_second"] = ratio(
                data["tokens_per_second"]["totalDecodingLoops"],
                data["tokens_per_second"]["fullPipeline"],
            )
            record["dataset_speed"] = {
                name: ratio(info["inputAudioSeconds"], info["fullPipeline"])
                for name, info in data["dataset_speed"].items()
            }
            record["dataset_tokens_per_second"] = {
                name: ratio(info["totalDecodingLoops"], info["fullPipeline"])
                for name, info in data["dataset_tokens_per_second"].items()
            }
            record["average_wer"] = compute_average_wer(data["average_wer"])
            record["qoi"] = round(mean(data["qoi"]), 2)
            record["commit_hash"] = data["commit_hash"]
            record["commit_timestamp"] = data["commit_timestamp"]

            json.dump(record, output)
            output.write("\n")

    return too_slow
def calculate_and_save_support_results(
    support_results, not_supported, support_output_path
):
    """
    Calculates device support results and saves them to separate CSV files for each release.

    This function processes the device support data and creates separate CSV files
    showing which models are supported on different devices and OS versions,
    using checkmarks, warning signs, question marks or Not supported to
    indicate support status.

    :param support_results: Dictionary containing device support information:
        the set-valued keys "modelsTested" and "devices" plus one entry per
        (device, OS, commit hash) tuple.
    :param not_supported: List of (model, device, os) tuples that are not supported.
    :param support_output_path: Base path to save the processed support results;
        ".csv" is replaced by "_<first 7 characters of the commit hash>.csv".
    """
    # The file only does `import urllib`, which does not guarantee that the
    # urllib.parse submodule is loaded; import what is needed explicitly.
    from urllib.parse import quote

    all_models = sorted(support_results["modelsTested"])
    known_models = set(all_models)

    # Group results by commit hash
    results_by_commit = {}
    for key, data in support_results.items():
        if key in ("modelsTested", "devices"):
            continue
        device, _, commit_hash = key
        if commit_hash not in results_by_commit:
            results_by_commit[commit_hash] = {
                "data": {},
                "devices": set(),
                "timestamp": data["commitTimestamp"],
            }
        results_by_commit[commit_hash]["data"][key] = data
        results_by_commit[commit_hash]["devices"].add(device)

    # Generate separate CSV for each commit
    for commit_hash, commit_data in results_by_commit.items():
        commit_devices = sorted(commit_data["devices"])
        df = pd.DataFrame(index=all_models, columns=["Model"] + commit_devices)

        # Nothing about a summary depends on the model, so resolve the link to
        # each summary file once instead of once per model.
        summaries = []
        for (device, os_label, _), data in commit_data["data"].items():
            failed_models, file_path = data["failedModels"]
            commit_file = os.path.basename(os.path.dirname(file_path))
            summary_file = os.path.basename(file_path)
            url = f"{BASE_WHISPERKIT_BENCHMARK_URL}/{commit_file}/{quote(summary_file)}"
            summaries.append(
                (device, os_label, data["supportedModels"], failed_models, url)
            )

        for model in all_models:
            row = {"Model": model}
            row.update(dict.fromkeys(commit_devices, ""))
            for device, os_label, supported_models, failed_models, url in summaries:
                if model in supported_models:
                    status = f"✅ {os_label}"
                elif model in failed_models:
                    status = f"""⚠️ <a style='color: #3B82F6; text-decoration: underline; text-decoration-style: dotted;' href={url}>{os_label}</a>"""
                else:
                    status = f"? {os_label}"
                # Several OS versions of one device share a cell; later ones
                # are appended as paragraphs.
                current_value = row[device]
                row[device] = (
                    status
                    if current_value == ""
                    else f"{current_value}<p>{status}</p>"
                )
            df.loc[model] = row

        # Mark unsupported combinations for this commit. The previous filter
        # compared the model name with key[0], which is the device identifier,
        # so it never matched and nothing was ever marked. A combination belongs
        # to this commit when its device was tested for it; requiring the model
        # to be a known row keeps the table from growing unexpected rows.
        commit_not_supported = [
            (model, device, os_label)
            for model, device, os_label in not_supported
            if device in commit_data["devices"] and model in known_models
        ]
        remove_unsupported_cells(df, commit_not_supported)

        # Format column headers
        headers = ["Model"]
        for col in df.columns.tolist():
            if col == "Model":
                continue
            device_label = get_device_name(col).replace("_", " ")
            headers.append(f"{device_label} ({col})")
        df.columns = headers

        # Save to commit-specific file
        output_path = support_output_path.replace(
            ".csv",
            f"_{commit_hash[:7]}.csv",
        )
        df.to_csv(output_path, index=True)
def remove_unsupported_cells(df, not_supported):
    """
    Overwrites cells of unsupported model-device combinations with "Not Supported".

    Two sources are applied, both in place:

    1. "dashboard_data/config.json": for each entry of its "device_support"
       list, every column whose label contains one of the entry's identifiers
       is marked for every model missing from the entry's supported models.
    2. ``not_supported``: each listed (model, device) cell is marked directly.

    :param df: A Pandas DataFrame where the index represents models and columns represent devices.
    :param not_supported: Iterable of (model, device, os) tuples; the third
        element is not used.
    """
    with open("dashboard_data/config.json", "r") as handle:
        device_support = json.load(handle)["device_support"]

    for entry in device_support:
        identifiers = set(entry["identifiers"])
        supported = set(entry["models"]["supported"])

        for model in df.index:
            if model in supported:
                continue
            for device in df.columns:
                # Substring match: a column is covered by any identifier it contains.
                if any(identifier in device for identifier in identifiers):
                    df.at[model, device] = "Not Supported"

    for model, device, _ in not_supported:
        df.at[model, device] = "Not Supported"
def main():
    """
    Main function to orchestrate the performance data generation process.

    This function performs the following steps:
    1. Downloads benchmark data if requested (first CLI argument "download").
    2. Fetches evaluation data for various datasets.
    3. Processes benchmark files and summary files.
    4. Calculates and saves performance and support results.
    """
    source_xcresult_repo = "argmaxinc/whisperkit-evals-dataset"
    source_xcresult_subfolder = "benchmark_data/"
    source_xcresult_directory = f"{source_xcresult_repo}/{source_xcresult_subfolder}"

    if len(sys.argv) > 1 and sys.argv[1] == "download":
        # Removal stays best-effort, but no longer swallows KeyboardInterrupt
        # or programming errors the way a bare `except:` did.
        try:
            shutil.rmtree(source_xcresult_repo)
        except FileNotFoundError:
            print("Nothing to remove.")
        except OSError as error:
            print(f"Could not remove {source_xcresult_repo}: {error}")
        download_dataset(
            source_xcresult_repo, source_xcresult_repo, source_xcresult_subfolder
        )

    # Every dataset name resolves to one of two reference evaluation files.
    earnings_url = "https://huggingface.co/datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/earnings22/2024-03-04_13%3A39%3A42_GMT-0800.json"
    librispeech_url = "https://huggingface.co/datasets/argmaxinc/whisperkit-evals/resolve/main/WhisperOpenAIAPI/openai_whisper-large-v2/librispeech/2024-02-28_18%3A45%3A02_GMT-0800.json?download=true"
    datasets = {
        "Earnings-22": earnings_url,
        "LibriSpeech": librispeech_url,
        "earnings22-10mins": earnings_url,
        "librispeech-10mins": librispeech_url,
        "earnings22-12hours": earnings_url,
        "librispeech": librispeech_url,
    }

    # Download each distinct URL once; the frames are only read afterwards,
    # so dataset names that share a URL can share the same DataFrame.
    frames_by_url = {}
    dataset_dfs = {}
    for dataset_name, url in datasets.items():
        if url not in frames_by_url:
            evals = fetch_evaluation_data(url)
            frames_by_url[url] = pd.json_normalize(evals["results"])
        dataset_dfs[dataset_name] = frames_by_url[url]

    performance_results = defaultdict(
        lambda: {
            "average_wer": [],
            "qoi": [],
            "speed": {"inputAudioSeconds": 0, "fullPipeline": 0},
            "tokens_per_second": {"totalDecodingLoops": 0, "fullPipeline": 0},
            "dataset_speed": defaultdict(
                lambda: {"inputAudioSeconds": 0, "fullPipeline": 0}
            ),
            "dataset_tokens_per_second": defaultdict(
                lambda: {"totalDecodingLoops": 0, "fullPipeline": 0}
            ),
            "timestamp": None,
            "commit_hash": None,
            "commit_timestamp": None,
            "test_timestamp": None,
        }
    )
    support_results = {"modelsTested": set(), "devices": set()}

    # Writes dashboard_data/device_map.json, which get_device_name reads later.
    generate_device_map(source_xcresult_directory)

    with open("dashboard_data/version.json", "r") as f:
        version = json.load(f)
    releases = set(version["releases"])

    for subdir, _, files in os.walk(source_xcresult_directory):
        for filename in files:
            if not filename.endswith(".json"):
                continue
            file_path = os.path.join(subdir, filename)
            # Summary files feed the support table; everything else is a benchmark run.
            if "summary" in filename:
                process_summary_file(file_path, support_results, releases)
            else:
                process_benchmark_file(
                    file_path, dataset_dfs, performance_results, releases
                )

    not_supported = calculate_and_save_performance_results(
        performance_results, "dashboard_data/performance_data.json"
    )
    calculate_and_save_support_results(
        support_results, not_supported, "dashboard_data/support_data.csv"
    )
# Run the data-generation pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()