Corey Morris committed on
Commit
0a77c60
·
1 Parent(s): 6251f5a

WIP. Updated download file. Can now download all files. Still need to integrate that code to either loop through all files and download each one, or combine the files into a single dataframe first and then save that dataframe.

Browse files
details_data_processor.py CHANGED
@@ -7,6 +7,7 @@ import numpy as np
7
  import requests
8
  from urllib.parse import quote
9
  from datetime import datetime
 
10
 
11
 
12
 
@@ -17,8 +18,6 @@ class DetailsDataProcessor:
17
  def __init__(self, directory='results', pattern='results*.json'):
18
  self.directory = directory
19
  self.pattern = pattern
20
- # self.data = self.process_data()
21
- # self.ranked_data = self.rank_data()
22
 
23
  def _find_files(self, directory='results', pattern='results*.json'):
24
  matching_files = [] # List to hold matching filenames
@@ -29,29 +28,22 @@ class DetailsDataProcessor:
29
  matching_files.append(filename) # Append the matching filename to the list
30
  return matching_files # Return the list of matching filenames
31
 
32
- # download a file from a single url and save it to a local directory
33
- # @staticmethod
34
- # def download_file(url, file_path):
35
- # #TODO: I may not need to save the file. I can just read it in and convert to a dataframe
36
- # r = requests.get(url, allow_redirects=True)
37
- # open(file_path, 'wb').write(r.content)
38
- # # return dataframe
39
- # df = pd.DataFrame(r.content)
40
- # return df
41
-
42
-
43
  @staticmethod
44
  def download_file(url, save_file_path):
 
45
  # Get the current date and time
46
- timestamp = datetime.now()
 
 
47
 
48
  # Format the timestamp as a string, suitable for use in a filename
49
- filename_timestamp = timestamp.strftime("%Y-%m-%dT%H-%M-%S")
50
 
51
- # Construct the full save file path
52
- save_file_path = save_file_path + filename_timestamp + ".json"
53
 
54
- print(save_file_path) # Output will be something like "results_2023-08-20T12-34-56.txt"
 
55
 
56
  try:
57
  # Sending a GET request
@@ -62,21 +54,16 @@ class DetailsDataProcessor:
62
  with open(save_file_path, 'wb') as file:
63
  file.write(r.content)
64
 
65
- print(f"Successfully downloaded file: {save_file_path}")
66
  except requests.ConnectionError as e:
67
- print(f"Failed to connect to the URL: {url}")
68
- raise e
69
  except requests.HTTPError as e:
70
- print(f"HTTP error occurred: {e}")
71
- raise e
72
  except FileNotFoundError as e:
73
- print(f"File not found at path: {save_file_path}")
74
- raise e
75
  except Exception as e:
76
- print(f"An unexpected error occurred: {e}")
77
- raise e
78
-
79
- return None
80
 
81
 
82
 
@@ -95,10 +82,14 @@ class DetailsDataProcessor:
95
  segments = file_path.split('/')
96
  bits = segments[1]
97
  model_name = segments[2]
98
- timestamp = segments[3].split('_')[1]
 
 
 
 
 
99
 
100
  url = f'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/{bits}/{model_name}/details_harness%7ChendrycksTest-moral_scenarios%7C5_{quote(timestamp, safe="")}'
101
- print(url)
102
  return url
103
 
104
  def pipeline(self):
 
7
  import requests
8
  from urllib.parse import quote
9
  from datetime import datetime
10
+ import uuid
11
 
12
 
13
 
 
18
  def __init__(self, directory='results', pattern='results*.json'):
19
  self.directory = directory
20
  self.pattern = pattern
 
 
21
 
22
  def _find_files(self, directory='results', pattern='results*.json'):
23
  matching_files = [] # List to hold matching filenames
 
28
  matching_files.append(filename) # Append the matching filename to the list
29
  return matching_files # Return the list of matching filenames
30
 
 
 
 
 
 
 
 
 
 
 
 
31
  @staticmethod
32
  def download_file(url, save_file_path):
33
+ #TODO: I may not need to save the file. I can just read it in and convert to a dataframe
34
  # Get the current date and time
35
+ error_count = 0
36
+ success_count = 0
37
+ # timestamp = datetime.now()
38
 
39
  # Format the timestamp as a string, suitable for use in a filename
40
+ # filename_timestamp = timestamp.strftime("%Y-%m-%dT%H-%M-%S")
41
 
42
+ # Generate a unique UUID
43
+ unique_id = uuid.uuid4()
44
 
45
+ # Append the UUID to the filename
46
+ save_file_path = save_file_path + "_" + str(unique_id) + ".json"
47
 
48
  try:
49
  # Sending a GET request
 
54
  with open(save_file_path, 'wb') as file:
55
  file.write(r.content)
56
 
57
+ success_count += 1
58
  except requests.ConnectionError as e:
59
+ error_count += 1
 
60
  except requests.HTTPError as e:
61
+ error_count += 1
 
62
  except FileNotFoundError as e:
63
+ error_count += 1
 
64
  except Exception as e:
65
+ error_count += 1
66
+ return error_count, success_count
 
 
67
 
68
 
69
 
 
82
  segments = file_path.split('/')
83
  bits = segments[1]
84
  model_name = segments[2]
85
+
86
+ try:
87
+ timestamp = segments[3].split('_')[1]
88
+ except IndexError:
89
+ print(f"Error: {file_path}")
90
+ return None
91
 
92
  url = f'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/{bits}/{model_name}/details_harness%7ChendrycksTest-moral_scenarios%7C5_{quote(timestamp, safe="")}'
 
93
  return url
94
 
95
  def pipeline(self):
test_details_data_processing.py CHANGED
@@ -58,6 +58,19 @@ class TestDetailsDataProcessor(unittest.TestCase):
58
  # print(files)
59
  self.assertIsInstance(files, list)
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  if __name__ == '__main__':
63
  unittest.main()
 
58
  # print(files)
59
  self.assertIsInstance(files, list)
60
 
61
+ def test_build_url_harness_types(self):
62
+ test_cases = [
63
+ ('results/shaohang/Sparse0.5_OPT-1.3/results_2023-07-19T19:10:31.005235.json', 'details',
64
+ 'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/shaohang/Sparse0.5_OPT-1.3/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-19T19%3A10%3A31.005235.json'),
65
+ ('results/shaohang/Sparse0.5_OPT-1.3/results_2023-07-19T19:10:31.005235.json', 'queries',
66
+ 'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/shaohang/Sparse0.5_OPT-1.3/queries_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-19T19%3A10%3A31.005235.json')
67
+ ]
68
+
69
+ for file_path, harness_type, expected in test_cases:
70
+ self.assertEqual(self.processor.build_url(file_path, harness_type), expected,
71
+ f"Test failed for file_path: {file_path}, harness_type: {harness_type}")
72
+
73
+
74
 
75
  if __name__ == '__main__':
76
  unittest.main()