File size: 4,129 Bytes
ee9e25e 513e813 cc32c4f b58e1f0 ee9e25e 513e813 83a34f0 ee9e25e 30fa96a ee9e25e f228d38 c32735e 513e813 b58e1f0 513e813 b58e1f0 9adae3c b58e1f0 9adae3c b58e1f0 9adae3c b58e1f0 9adae3c b58e1f0 9adae3c b58e1f0 9adae3c b58e1f0 9adae3c b58e1f0 9adae3c b58e1f0 513e813 9adae3c 83a34f0 cc32c4f c32735e 30fa96a c32735e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import pandas as pd
import os
import fnmatch
import json
import re
import numpy as np
import requests
from urllib.parse import quote
from datetime import datetime
class DetailsDataProcessor:
    """Fetch leaderboard "details" files for locally stored results files.

    Files matching ``pattern`` are searched for under ``directory``. For each
    one, a URL into the ``open-llm-leaderboard/details`` dataset is built from
    its path, the file behind that URL is downloaded, and its JSON content is
    loaded into a pandas DataFrame.

    URL example:
    https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/64bits/LexPodLM-13B/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-25T13%3A41%3A51.227672.json
    """

    def __init__(self, directory: str = 'results', pattern: str = 'results*.json'):
        """Store the search root and the filename glob used by ``pipeline``."""
        self.directory = directory
        self.pattern = pattern

    def _find_files(self, directory: str = 'results', pattern: str = 'results*.json') -> list:
        """Return the paths of all files under *directory* matching *pattern*.

        The directory tree is walked recursively; *pattern* is an ``fnmatch``
        glob applied to the bare filename, not to the full path.
        """
        matching_files = []
        for root, _dirs, files in os.walk(directory):
            matching_files.extend(
                os.path.join(root, basename)
                for basename in fnmatch.filter(files, pattern)
            )
        return matching_files

    @staticmethod
    def download_file(url: str, save_file_path: str, *, timeout: float = 30) -> str:
        """Download *url* into a timestamped ``.json`` file and return its path.

        The destination is *save_file_path* with the current local time
        (``%Y-%m-%dT%H-%M-%S``) and ``.json`` appended, so the file actually
        written is NOT *save_file_path* itself. The real path is returned so
        that callers can open what was written.

        Args:
            url: Address to fetch with an HTTP GET (redirects are followed).
            save_file_path: Prefix of the destination file path.
            timeout: Seconds passed to ``requests.get`` so that a stalled
                server cannot block the caller forever.

        Returns:
            The path of the file that was written.

        Raises:
            requests.ConnectionError: The connection could not be established.
            requests.HTTPError: The server answered with an error status.
            FileNotFoundError: The destination could not be opened for writing.
            Exception: Any other failure is reported and re-raised unchanged.
        """
        # TODO: the file may not need to be saved at all; the response could
        # be read and converted to a dataframe directly.
        filename_timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
        save_file_path = save_file_path + filename_timestamp + ".json"
        print(save_file_path)  # e.g. "results_" becomes "results_2023-08-20T12-34-56.json"

        try:
            r = requests.get(url, allow_redirects=True, timeout=timeout)
            r.raise_for_status()  # turn 4xx/5xx responses into HTTPError

            with open(save_file_path, 'wb') as file:
                file.write(r.content)

            print(f"Successfully downloaded file: {save_file_path}")
        except requests.ConnectionError:
            print(f"Failed to connect to the URL: {url}")
            raise  # bare raise keeps the original traceback intact
        except requests.HTTPError as e:
            print(f"HTTP error occurred: {e}")
            raise
        except FileNotFoundError:
            print(f"File not found at path: {save_file_path}")
            raise
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            raise

        return save_file_path

    @staticmethod
    def single_file_pipeline(url: str, filename: str) -> pd.DataFrame:
        """Download *url* and return its JSON content as a DataFrame.

        *filename* is only the prefix of the local copy: ``download_file``
        appends a timestamp and ``.json``, so the path it returns is the one
        that gets opened here.
        """
        saved_path = DetailsDataProcessor.download_file(url, filename)

        with open(saved_path, encoding="utf-8") as f:
            data = json.load(f)

        return pd.DataFrame(data)

    @staticmethod
    def build_url(file_path: str) -> str:
        """Build the details-dataset URL that corresponds to *file_path*.

        *file_path* is expected to end in
        ``<bits>/<model_name>/<prefix>_<timestamp>``, e.g.
        ``results/64bits/LexPodLM-13B/results_2023-07-25T13:41:51.227672.json``.
        The last three path components are used, so the search root may be
        nested at any depth.

        Raises:
            IndexError: *file_path* has fewer than three components, or its
                filename contains no ``_``.
        """
        # os.walk joins with os.sep; normalise so the split also works where
        # the native separator is not "/".
        segments = file_path.replace(os.sep, '/').split('/')
        bits = segments[-3]
        model_name = segments[-2]
        timestamp = segments[-1].split('_')[1]

        # safe="" makes quote() escape every reserved character, including ":".
        url = f'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/{bits}/{model_name}/details_harness%7ChendrycksTest-moral_scenarios%7C5_{quote(timestamp, safe="")}'
        print(url)
        return url

    def pipeline(self) -> list:
        """Run ``single_file_pipeline`` for every matching results file.

        Returns:
            One DataFrame per file found under ``self.directory``, in the
            order the files were discovered.
        """
        dataframes = []
        for file_path in self._find_files(self.directory, self.pattern):
            print(file_path)
            url = self.build_url(file_path)
            # Only the bare filename is used as the prefix of the local copy,
            # so downloads land in the current working directory.
            local_name = os.path.basename(file_path)
            dataframes.append(self.single_file_pipeline(url, local_name))
        return dataframes
|