|
import contextlib |
|
import re |
|
import tempfile |
|
from functools import lru_cache |
|
from typing import Optional |
|
|
|
import gradio as gr |
|
from git import Repo |
|
from httpx import Client |
|
from huggingface_hub import create_repo, upload_folder |
|
from toolz import groupby |
|
import kagglehub |
|
from kagglehub import KaggleDatasetAdapter |
|
|
|
# Module-level httpx client, reused for the GitHub API requests made further
# down in this file.
client = Client()
|
|
|
|
|
def clone_into_temp_dir(github_repo_url):
    """Clone a Git repository into a freshly created temporary directory.

    Args:
        github_repo_url: URL of the repository to clone.

    Returns:
        A ``(repo, temp_dir)`` tuple: the object returned by
        ``Repo.clone_from`` and the ``tempfile.TemporaryDirectory`` that
        holds the checkout. The caller is responsible for calling
        ``temp_dir.cleanup()`` once it is done with the files.

    Raises:
        Any exception raised by ``Repo.clone_from``; the temporary directory
        is removed before the exception propagates.
    """
    temp_dir = tempfile.TemporaryDirectory()
    try:
        # Pass the directory *path* (``.name``). The TemporaryDirectory object
        # itself does not implement ``os.PathLike``, so handing it over
        # directly does not reliably clone into the temporary location.
        repo = Repo.clone_from(github_repo_url, temp_dir.name)
    except Exception:
        # Do not leave an orphaned temporary directory behind on failure.
        temp_dir.cleanup()
        raise
    return repo, temp_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def upload_directory_to_hf(
    repo_id: str,
    directory: str,
    oauth_token: str,
    *,
    private: bool = False,
):
    """Create a dataset repository on the Hub (if needed) and upload a folder.

    Args:
        repo_id: Id of the destination dataset repository.
        directory: Local folder whose contents are uploaded.
        oauth_token: Token passed to both ``create_repo`` and
            ``upload_folder``.
        private: Forwarded to ``create_repo``. Defaults to ``False``, the
            value that used to be hard-coded.

    Returns:
        Whatever ``upload_folder`` returns for the created commit.
    """
    # exist_ok=True lets the same destination be reused across migrations.
    create_repo(
        repo_id,
        token=oauth_token,
        exist_ok=True,
        repo_type="dataset",
        private=private,
    )

    # Files are placed under "data/" in the destination repository.
    return upload_folder(
        repo_id=repo_id,
        folder_path=directory,
        path_in_repo="data",
        repo_type="dataset",
        token=oauth_token,
        commit_message="Migrated from GitHub",
        # Skip Git metadata, macOS Finder files and .env files (presumably to
        # avoid publishing local secrets).
        ignore_patterns=[
            "*.git*",
            "*.DS_Store",
            "*.env",
        ],
    )
|
|
|
|
|
def push_to_hf(
    source_github_repository,
    destination_hf_hub_repository,
    subdirectory,
    oauth_token: gr.OAuthToken,
):
    """Clone a GitHub repository and upload it (or one folder) to the Hub.

    Args:
        source_github_repository: URL of the GitHub repository to clone.
        destination_hf_hub_repository: Id of the destination dataset
            repository on the Hugging Face Hub.
        subdirectory: Optional folder inside the repository to upload. May be
            a list/tuple (the first entry is used) or a plain string. A falsy
            value uploads the whole checkout.
        oauth_token: Gradio OAuth token; its ``token`` attribute is used to
            authenticate the upload.

    Returns:
        A Markdown string linking to the destination dataset.

    Raises:
        gr.Error: If ``subdirectory`` points outside the cloned repository or
            is not an existing folder in it.
    """
    # Local import so this function does not rely on the file's import block.
    import os

    gr.Info("Cloning source GitHub repository...")
    repo, temporary_directory = clone_into_temp_dir(source_github_repository)
    try:
        gr.Info("Cloning source GitHub repository...Done")
        gr.Info("Syncing with Hugging Face Hub...")

        # A multiselect dropdown yields a list, a single-select one yields a
        # string. Indexing a string with [0] would pick its first character.
        if isinstance(subdirectory, (list, tuple)):
            subdirectory = subdirectory[0] if subdirectory else None

        if subdirectory:
            checkout_root = os.path.realpath(repo.working_dir)
            src_directory = os.path.realpath(
                os.path.join(checkout_root, subdirectory)
            )
            # The folder name is free-form user input: refuse anything (e.g.
            # "../..", absolute paths) that resolves outside the checkout.
            inside_checkout = src_directory == checkout_root or (
                src_directory.startswith(checkout_root + os.sep)
            )
            if not inside_checkout:
                raise gr.Error(
                    "The selected folder must be inside the GitHub repository."
                )
            if not os.path.isdir(src_directory):
                raise gr.Error(
                    f"Folder '{subdirectory}' was not found in the GitHub repository."
                )
        else:
            src_directory = repo.working_dir

        upload_directory_to_hf(
            repo_id=destination_hf_hub_repository,
            directory=src_directory,
            oauth_token=oauth_token.token,
        )
        gr.Info("Syncing with Hugging Face Hub...Done")
    finally:
        # Always remove the checkout, including when the upload fails.
        temporary_directory.cleanup()
    return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"
|
|
|
|
|
def extract_user_name_and_repo_from_url(github_url: str):
    """Extract the owner and repository name from a GitHub URL.

    Anything after the repository segment (sub-paths, query string, fragment)
    is ignored, and a trailing ``.git`` on the repository name is removed.

    Args:
        github_url: URL such as ``https://github.com/<owner>/<repo>``.
            ``None`` or an empty string is treated as "no match".

    Returns:
        A ``(user_name, repo_name)`` tuple, or ``None`` (after printing a
        message) when the URL does not look like a GitHub repository URL.
    """
    # The dot is escaped so only github.com matches; each segment stops at
    # "/", whitespace, "?" or "#" so query strings and fragments are dropped.
    pattern = r"https?://(?:www\.)?github\.com/([^/\s?#]+)/([^/\s?#]+)"
    if match := re.search(pattern, github_url or ""):
        user_name, repo_name = match[1], match[2]
        git_suffix = ".git"
        # Clone URLs end in ".git", which is not part of the repository name.
        if repo_name.endswith(git_suffix) and len(repo_name) > len(git_suffix):
            repo_name = repo_name[: -len(git_suffix)]
        return user_name, repo_name
    print("No match found in the GitHub URL.")
    return None
|
|
|
|
|
def get_files_and_directories(response):
    """Split a tree listing response into file paths and directory paths.

    Args:
        response: Object whose ``json()`` returns a mapping with a ``"tree"``
            list; each entry is a mapping with ``"type"`` and ``"path"`` keys.

    Returns:
        ``{"files": [...], "directories": [...]}`` where ``"files"`` holds the
        paths of entries typed ``"blob"`` and ``"directories"`` those typed
        ``"tree"``, both in listing order. Other entry types are ignored.
    """
    tree_entries = response.json()["tree"]
    file_paths = [
        entry["path"] for entry in tree_entries if entry["type"] == "blob"
    ]
    directory_paths = [
        entry["path"] for entry in tree_entries if entry["type"] == "tree"
    ]
    return {"files": file_paths, "directories": directory_paths}
|
|
|
|
|
@lru_cache(maxsize=128)
def list_git_repo_files_and_directories(repo_url: str, branch: str = "main"):
    """List the files and directories of a GitHub repository tree.

    Args:
        repo_url: GitHub repository URL.
        branch: Name placed in the ``git/trees/<branch>`` API path.

    Returns:
        The dict produced by ``get_files_and_directories``, or ``None`` when
        the URL cannot be parsed or the API does not answer with status 200.

    Note:
        The function is wrapped in ``lru_cache``, so the result for a given
        ``(repo_url, branch)`` pair, including ``None``, is reused on later
        calls.
    """
    parsed = extract_user_name_and_repo_from_url(repo_url)
    if parsed is None:
        return None

    owner, repository = parsed
    tree_endpoint = (
        f"https://api.github.com/repos/{owner}/{repository}/git/trees/{branch}"
    )
    api_response = client.get(tree_endpoint)
    if api_response.status_code != 200:
        return None
    return get_files_and_directories(api_response)
|
|
|
|
|
def show_files_and_directories(url: str):
    """Build the folder and file dropdowns for a GitHub repository URL.

    The lookup is best-effort: if the repository cannot be listed (unparsable
    URL, failed request, unexpected payload) both dropdowns are returned with
    no choices instead of the handler returning nothing.

    Args:
        url: GitHub repository URL typed by the user.

    Returns:
        A ``(directories_dropdown, files_dropdown)`` tuple of ``gr.Dropdown``
        components.
    """
    directories, files = [], []
    try:
        listing = list_git_repo_files_and_directories(url)
    except Exception as error:
        # This handler is wired to the URL textbox's change event, so
        # half-typed or unreachable URLs are expected; report and carry on.
        print(f"Could not list files for {url!r}: {error}")
        listing = None

    if listing:
        directories = listing.get("directories", [])
        files = listing.get("files", [])

    directories_dropdown = gr.Dropdown(
        label="Directories",
        choices=directories,
        max_choices=1,
        visible=True,
        interactive=True,
        multiselect=True,
    )
    files_dropdown = gr.Dropdown(
        label="Files",
        choices=files,
        max_choices=None,
        visible=True,
        interactive=True,
        multiselect=True,
    )
    return directories_dropdown, files_dropdown
|
|
|
|
|
def push_kaggle_to_hf(
    source_kaggle_dataset: str,
    destination_hf_hub_repository: str,
    file_path: str,
    oauth_token: gr.OAuthToken,
):
    """Load one file of a Kaggle dataset and push it to the Hugging Face Hub.

    Args:
        source_kaggle_dataset: Kaggle dataset handle passed to
            ``kagglehub.load_dataset``.
        destination_hf_hub_repository: Id of the destination repository.
        file_path: File inside the Kaggle dataset to load; required.
        oauth_token: Gradio OAuth token; its ``token`` attribute
            authenticates the push.

    Returns:
        A Markdown string linking to the destination dataset.

    Raises:
        ValueError: If ``file_path`` is empty.
    """
    if not file_path:
        raise ValueError("File path must be specified for Kaggle datasets")

    # Step 1: load the file through kagglehub's Hugging Face adapter.
    gr.Info("Loading Kaggle dataset...")
    loader_args = (
        KaggleDatasetAdapter.HUGGING_FACE,
        source_kaggle_dataset,
        file_path,
    )
    hf_dataset = kagglehub.load_dataset(*loader_args)
    gr.Info("Loading Kaggle dataset...Done")

    # Step 2: push the loaded dataset to the destination repository.
    gr.Info("Pushing to Hugging Face Hub...")
    hf_dataset.push_to_hub(destination_hf_hub_repository, token=oauth_token.token)
    gr.Info("Pushing to Hugging Face Hub...Done")

    summary = f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"
    return summary
|
|
|
|
|
# HTML introduction shown near the top of the app (rendered with gr.HTML).
html_text_app_description = """
While GitHub and Kaggle are great platforms, the Hugging Face Datasets Hub is a better place to host and share datasets.
Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:
<br>
<ul>
<li>Hosting for large datasets</li>
<li>An interactive preview of your dataset</li>
<li>Access to the dataset via many tools and libraries including datasets, pandas, polars, dask and DuckDB</li>
<li>Seamless integration with machine learning workflows</li>
<li>Version control and dataset versioning</li>
</ul>

<br>
This app will help you migrate datasets currently hosted on GitHub or Kaggle to the Hugging Face Datasets Hub.

Make sure you consider the license of the dataset when migrating it to the Hugging Face Datasets Hub 🤗.
<br>
<br>
<i>Note: the Kaggle implementation is experimental and may not work for all datasets. Feel free to open a PR to improve it!</i>
"""
|
|
|
# Build the Gradio UI: a header, a login button, and one tab per source
# platform (GitHub, Kaggle), followed by a closing note.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.HTML(
        """<h1 style='text-align: center;'> Dataset Migration Tool</h1>
<center><i> ✨ Migrate datasets to Hugging Face Hub in a few steps ✨</i></center>"""
    )

    gr.HTML(html_text_app_description)

    with gr.Row():
        gr.LoginButton(size="sm")

    with gr.Tabs() as tabs:
        # --- GitHub tab: clone a repository and upload it to the Hub ---
        with gr.Tab("GitHub"):
            gr.Markdown("### Location of existing dataset")
            gr.Markdown(
                "URL for the GitHub repository where the dataset is currently hosted"
            )
            source_github_repository = gr.Textbox(
                lines=1, label="Source GitHub Repository URL"
            )

            with gr.Accordion("Advanced Options", open=False):
                gr.Markdown("### Select files and folder to migrate")
                gr.Markdown(
                    "(Optional): select a specific folder and/or files to migrate from the GitHub repository. If you select a folder all the files in that folder will be migrated."
                )
                folder_in_github_repo = gr.Dropdown(
                    None,
                    label="Folder in the GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
                # NOTE(review): this dropdown is filled in by
                # show_files_and_directories but is not among the inputs of
                # the push_to_hf click handler below, so a file selection
                # does not appear to affect the migration. Confirm intent.
                files_in_github_repo = gr.Dropdown(
                    None,
                    label="Files in GitHub Repository to migrate",
                    allow_custom_value=True,
                    visible=True,
                )
                # Refresh both dropdowns whenever the repository URL changes.
                source_github_repository.change(
                    show_files_and_directories,
                    [source_github_repository],
                    [folder_in_github_repo, files_in_github_repo],
                )

            gr.Markdown("### Destination for your migrated dataset")
            destination_hf_hub_repository = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="i.e. <hugging face username>/<repository_name>",
            )

            github_submit_btn = gr.Button("Migrate GitHub Dataset")
            github_result = gr.Markdown(label="Summary", visible=True)

            # push_to_hf's oauth_token parameter is not listed here;
            # presumably Gradio supplies it from the login session based on
            # the gr.OAuthToken annotation. Confirm against the Gradio docs.
            github_submit_btn.click(
                push_to_hf,
                [
                    source_github_repository,
                    destination_hf_hub_repository,
                    folder_in_github_repo,
                ],
                [github_result],
            )

        # --- Kaggle tab: load one dataset file and push it to the Hub ---
        with gr.Tab("Kaggle"):
            gr.Markdown("### Source Kaggle Dataset")
            gr.Markdown("Enter the Kaggle dataset name and file path")
            source_kaggle_dataset = gr.Textbox(
                lines=1,
                label="Source Kaggle Dataset",
                placeholder="username/dataset-name",
            )
            kaggle_file_path = gr.Textbox(
                label="File path in dataset",
                placeholder="e.g., train.csv",
                info="Specify the file to migrate from the dataset",
            )

            gr.Markdown("### Destination for your migrated dataset")
            kaggle_destination_hf_hub = gr.Textbox(
                label="Destination Hugging Face Repository",
                placeholder="i.e. <hugging face username>/<repository_name>",
            )

            kaggle_submit_btn = gr.Button("Migrate Kaggle Dataset")
            kaggle_result = gr.Markdown(label="Summary", visible=True)

            # Input order matches push_kaggle_to_hf's positional parameters:
            # dataset handle, destination repository, file path.
            kaggle_submit_btn.click(
                push_kaggle_to_hf,
                [
                    source_kaggle_dataset,
                    kaggle_destination_hf_hub,
                    kaggle_file_path,
                ],
                [kaggle_result],
            )

    gr.Markdown(
        """You should add a dataset card for your dataset to help people discover and understand your dataset. You can find instructions for creating a dataset card [here](https://huggingface.co/docs/datasets/dataset_card).
If you have any questions or feedback feel free to reach out to us on using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"""
    )

# Start the app as soon as the module is executed.
demo.launch()
|
|