regisss committed on
Commit
ca2da1c
·
1 Parent(s): 3f77013

Add try/catch in bash entrypoint to manage failed experiments

Browse files
Files changed (2) hide show
  1. entrypoint.sh +15 -2
  2. failed_run.py +46 -0
entrypoint.sh CHANGED
@@ -8,12 +8,25 @@ echo "Attempting to run."
8
  # Old (pre-commit) side of the entrypoint.sh diff; lines prefixed with "-" were removed by this commit.
  # Each line printed by parse_requests.py is split on "," into backend_model and experiment_name,
  # and optimum-benchmark is run once per line.
  python /parse_requests.py | while read line; do
9
  IFS="," read backend_model experiment_name <<< $(echo ${line})
10
  echo "Benchmarking Model: ${backend_model}, Task: ${experiment_name}"
  # NOTE(review): inside double quotes bash itself expands ${now:...} (substring expansion of $now),
  # so the literal Hydra "now" interpolation is not what reaches optimum-benchmark — confirm intent.
11
- optimum-benchmark --config-name ${experiment_name} --config-dir /optimum-benchmark/examples/energy_star/ backend.model=${backend_model} backend.processor=${backend_model} hydra.run.dir="./runs/${experiment_name}/${backend_model}/${now:%Y-%m-%d-%H-%M-%S}"
 
 
 
 
 
 
 
 
 
 
 
12
  done
 
 
  # After the loop: ./runs is handed to create_results.py, then pause_space.py is run.
13
  echo "Finished; uploading dataset results"
14
  python /create_results.py ./runs
 
15
  # Pausing space
16
  echo "Pausing space."
17
  python /pause_space.py
18
  echo "Done."
19
- #fi
 
8
  python /parse_requests.py | while read line; do
9
  IFS="," read backend_model experiment_name <<< $(echo ${line})
10
  echo "Benchmarking Model: ${backend_model}, Task: ${experiment_name}"
11
+ export run_dir= "./runs/${experiment_name}/${backend_model}/${now:%Y-%m-%d-%H-%M-%S}"
12
+
13
+ optimum-benchmark --config-name ${experiment_name} --config-dir /optimum-benchmark/examples/energy_star/ backend.model=${backend_model} backend.processor=${backend_model} hydra.run.dir=${run_dir} 2> $run_dir/error.log
14
+
15
+ if [ -s $run_dir/error.log ]; then
16
+ # error.log is not-empty, an error was raised
17
+ echo "An error was raised while benchmarking the model..."
18
+ python /failed_run.py --run_dir $run_dir --model_name $backend_model
19
+
20
+ # Delete the current run directory so that it is not pushed by create_results.py later
21
+ rm -rf $run_dir
22
+ fi
23
  done
24
+
25
# The request loop has finished: hand the ./runs directory to create_results.py.
printf '%s\n' "Finished; uploading dataset results"
python /create_results.py ./runs

# Pausing space
printf '%s\n' "Pausing space."
python /pause_space.py
printf '%s\n' "Done."
 
failed_run.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Mark a benchmark request as FAILED and attach the captured error log.

Usage: python failed_run.py --run_dir <run directory> --model_name <model>

Reads ``<run_dir>/error.log``, sets ``status`` to ``"FAILED"`` and fills
``error_message`` for every row of the requests dataset whose ``model`` equals
``--model_name``, then pushes the updated dataset back to the Hub. The Hub
token is read from the ``DEBUG`` environment variable.
"""
import argparse
import os


TOKEN = os.environ.get("DEBUG")
REQUESTS_REPO = "EnergyStarAI/requests_debug"


def mark_failed(dataset, model_name, error_message):
    """Flag the rows of ``dataset`` that belong to ``model_name`` as failed.

    Args:
        dataset: pandas DataFrame with at least the columns ``model`` and
            ``status``. It is modified in place.
        model_name: exact model identifier to match against the ``model`` column.
        error_message: text stored in the ``error_message`` column of the
            matching rows.

    Returns:
        The same DataFrame, with ``status`` set to ``"FAILED"`` and
        ``error_message`` filled in for the matching rows. The
        ``error_message`` column is created (empty) when it does not exist.
    """
    # Plain equality: Series.isin() only accepts list-likes and raises a
    # TypeError when handed a single string.
    is_target = dataset["model"] == model_name

    # Set benchmark to failed
    dataset.loc[is_target, "status"] = "FAILED"

    # Add a new column for the error message if necessary
    if "error_message" not in dataset.columns:
        dataset["error_message"] = ""
    dataset.loc[is_target, "error_message"] = error_message
    return dataset


def main():
    """Parse the CLI arguments, update the requests dataset and push it."""
    # Imported here so that mark_failed() stays importable without the Hub
    # client libraries installed.
    from datasets import Dataset, load_dataset

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--run_dir",
        type=str,
        required=True,
        help="Path to the run directory.",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        required=True,
        help="Model to benchmark.",
    )
    args = parser.parse_args()

    # Updating request
    dataset = load_dataset(REQUESTS_REPO, split="test", token=TOKEN).to_pandas()

    # Read error message; the log may contain arbitrary bytes written by the
    # failed process, so undecodable ones are replaced rather than fatal.
    log_path = os.path.join(args.run_dir, "error.log")
    with open(log_path, "r", encoding="utf-8", errors="replace") as file:
        error_message = file.read()

    dataset = mark_failed(dataset, args.model_name, error_message)

    updated_dataset = Dataset.from_pandas(dataset)
    updated_dataset.push_to_hub(REQUESTS_REPO, split="test", token=TOKEN)

    print("Status set to FAILED")


if __name__ == "__main__":
    main()