import os
import json
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import parse_qs, urlparse

import numpy as np
from datasets import load_dataset
from huggingface_hub import HfFolder

# Persist the Hub token so that load_dataset can access private datasets.
HF_TOKEN = os.environ["HF_TOKEN"]
HfFolder.save_token(HF_TOKEN)

# Datasets served by this app. The stars data is loaded on demand (pinned to
# a fixed revision) inside the /retrieveStars handler.
datasets = {
    # "stars": load_dataset("open-source-metrics/stars"),
    "issues": load_dataset("open-source-metrics/issues"),
    "pip": load_dataset("open-source-metrics/pip"),
}


def running_mean(x, N, total_length=-1):
    # Rolling mean of x over a window of N samples, left-padded with zeros so
    # the result can be aligned with a series of total_length points.
    cumsum = np.cumsum(np.insert(x, 0, 0))
    to_pad = max(total_length - len(cumsum), 0)
    return np.pad(cumsum[N:] - cumsum[:-N], (to_pad, 0)) / float(N)


class RequestHandler(SimpleHTTPRequestHandler):
    def do_GET(self):
        print(self.path)

        if self.path == "/":
            self.path = "index.html"
            return SimpleHTTPRequestHandler.do_GET(self)

        if self.path.startswith("/initialize"):
            dataset_keys = {k: set(v.keys()) for k, v in datasets.items()}
            dataset_keys["issues"].remove("transformers")
            dataset_with_most_splits = max(dataset_keys.values(), key=len)

            # Collect (but do not yet surface) warnings about missing splits.
            warnings = []
            for k, v in dataset_keys.items():
                if len(v) < len(dataset_with_most_splits):
                    warnings.append(
                        f"The {k} dataset does not contain all splits. "
                        f"Missing: {dataset_with_most_splits - v}."
                    )

            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()

            # Sets are not JSON-serializable; convert them to sorted lists.
            payload = {k: sorted(v) for k, v in dataset_keys.items()}
            self.wfile.write(json.dumps(payload).encode("utf-8"))
            return

        if self.path.startswith("/retrievePipInstalls"):
            query = parse_qs(urlparse(self.path).query)
            library_names = query.get("input", [""])[0].split(",")

            # day -> {library_name: num_downloads}
            returned_values = {}
            for library_name in library_names:
                dataset = load_dataset(
                    f"open-source-metrics/{library_name}-pip-installs",
                    use_auth_token=True,
                )["train"]
                for row in dataset:
                    returned_values.setdefault(row["day"], {})[library_name] = row["num_downloads"]

            # Days with no recorded downloads for a library default to 0.
            for library_name in library_names:
                for day_values in returned_values.values():
                    day_values.setdefault(library_name, 0)

            output = {
                name: [day_values[name] for day_values in returned_values.values()][::-1]
                for name in library_names
            }
            output["day"] = list(returned_values.keys())[::-1]

            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(json.dumps(output).encode("utf-8"))
            return

        if self.path.startswith("/retrieveStars"):
            query = parse_qs(urlparse(self.path).query)
            library_names = query.get("input", [""])[0].split(",")

            # date -> {library_name: cumulative_star_count}
            returned_values = {}
            dataset_dict = load_dataset(
                "open-source-metrics/stars",
                use_auth_token=True,
                revision="90cb31b2db73c8c4291bcf317d831595e4fb2a91",
            ).sort("dates")

            for library_name in library_names:
                # Each row is one star event, so the running index is the
                # cumulative star count at that date.
                n = 0
                for row in dataset_dict[library_name]:
                    n += 1
                    returned_values.setdefault(row["dates"], {})[library_name] = n

            # Dates with no star events for a library are reported as null.
            for library_name in library_names:
                for day_values in returned_values.values():
                    day_values.setdefault(library_name, None)

            output = {
                name: [day_values[name] for day_values in returned_values.values()][::-1]
                for name in library_names
            }
            output["day"] = list(returned_values.keys())[::-1]

            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(json.dumps(output).encode("utf-8"))
            return

        return SimpleHTTPRequestHandler.do_GET(self)


server = ThreadingHTTPServer(("", 7860), RequestHandler)
print("Running on port 7860")
server.serve_forever()
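
# A minimal smoke test, assuming the server is running locally on port 7860.
# The inputs "transformers" and "datasets" are hypothetical example split
# names; /initialize returns the split names that are actually available.
#
#   curl "http://localhost:7860/initialize"
#   curl "http://localhost:7860/retrievePipInstalls?input=transformers,datasets"
#   curl "http://localhost:7860/retrieveStars?input=transformers,datasets"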