File size: 6,367 Bytes
3f7cfab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import shutil
import pandas as pd
import os
import huggingface_hub
import pytest
from datasets import load_dataset
@pytest.mark.parametrize(
    "dataset_name, link_to_source",
    [
        (
            "h2ogpt-oig-instruct-cleaned",
            """
- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/b8f15efcc305a953c52a0ee25b8b4897ceb68c0a/scrape_dai_docs.py)
"""
        ),
        (
            "h2ogpt-oig-instruct-cleaned-v2",
            """
- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/40c217f610766715acec297a5535eb440ac2f2e2/create_data.py)
"""
        ),
        (
            "h2ogpt-oig-instruct-cleaned-v3",
            """
- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/bfc3778c8db938761ce2093351bf2bf82159291e/create_data.py)
"""
        ),
        (
            "openassistant_oasst1",
            """
- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/45e6183171fb16691ad7d3ab006fad973f971e98/create_data.py#L1253)
"""
        ),
        (
            "h2ogpt-oig-oasst1-instruct-cleaned-v1",
            """
- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/main/docs/FINETUNE.md#high-quality-oig-based-instruct-data)
- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/5fc91911bc2bfaaf3b6c2de577c4b0ae45a07a4a/create_data.py#L1253)
"""
        ),
        (
            "h2ogpt-oig-oasst1-instruct-cleaned-v2",
            """
- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/main/docs/FINETUNE.md#high-quality-oig-based-instruct-data)
- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/0e70c2fbb16410bd8e6992d879b4c55cd981211f/create_data.py#L1375-L1415)
"""
        ),
        (
            "h2ogpt-oig-oasst1-instruct-cleaned-v3",
            """
- [Original LAION OIG Dataset](https://github.com/LAION-AI/Open-Instruction-Generalist)
- [LAION OIG data detoxed and filtered down by scripts in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/main/docs/FINETUNE.md#high-quality-oig-based-instruct-data)
- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/6728938a262d3eb5e8db1f252bbcd7de838da452/create_data.py#L1415)
"""
        ),
        (
            "openassistant_oasst1_h2ogpt",
            """
- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/83857fcf7d3b712aad5db32207e6db0ab0f780f9/create_data.py#L1252)
"""
        ),
        (
            "openassistant_oasst1_h2ogpt_graded",
            """
- [Original Open Assistant data in tree structure](https://huggingface.co/datasets/OpenAssistant/oasst1)
- [This flattened dataset created by script in h2oGPT repository](https://github.com/h2oai/h2ogpt/blob/d1f8ce975a46056d41135d126dd33de8499aa26e/create_data.py#L1259)
"""
        ),
        (
            "h2ogpt-fortune2000-personalized",
            """
- [Fortune 2000 companies from Wikipedia](https://github.com/h2oai/h2ogpt/blob/b1ea74c0088884ebff97f1ccddbfb3f393e29e44/create_data.py#L1743)
"""
        ),
    ],
)
def test_create_data_cards(dataset_name, link_to_source):
    """Render README.md for one h2oai dataset from README-template.md and push it.

    The template's ``<<...>>`` placeholders are filled with the dataset name,
    the row/column counts and column names of its ``train`` split, and the
    given source links. The result is written to ``<dataset_name>/README.md``
    and, when the dataset repository could be cloned, committed and pushed.

    Args:
        dataset_name: Dataset name under the ``h2oai`` organization; also used
            as the local clone directory.
        link_to_source: Markdown text substituted for ``<<SOURCE_LINK>>``.
    """
    # Only this one dataset card is regenerated; every other parametrized case
    # returns immediately (and is therefore reported as passed).
    if dataset_name != "h2ogpt-fortune2000-personalized":
        return

    assert os.path.exists("README-template.md"), "must be running this test from the data dir."

    repo_id = "h2oai/%s" % dataset_name

    # Start from a clean local directory so a stale clone cannot interfere.
    shutil.rmtree(dataset_name, ignore_errors=True)

    # Bind `repo` up front: if cloning fails we must not hit a NameError later.
    repo = None
    try:
        repo = huggingface_hub.Repository(
            local_dir=dataset_name,
            clone_from=repo_id,
            repo_type="dataset",
            skip_lfs_files=True,
            token=True,
        )
        repo.git_pull()
    except Exception as e:
        # Best effort: report the problem and keep going so the template
        # checks below still run.
        print(str(e))
        print("call 'huggingface-cli login' first and provide access token with write permission")

    dataset = load_dataset(repo_id)["train"]
    pd.set_option('display.max_columns', None)

    with open("README-template.md", "r", encoding="utf-8") as f:
        content = f.read()

    # Placeholder -> replacement, applied in this order. Each placeholder must
    # be present in the template at the time it is substituted.
    # (The <<PREVIEW>> substitution is currently disabled:
    #   content = content.replace("<<PREVIEW>>", str(dataset.to_pandas().iloc[:5, :]))
    # )
    substitutions = (
        ("<<DATASET_NAME>>", dataset_name),
        ("<<NROWS>>", str(dataset.num_rows)),
        ("<<NCOLS>>", str(dataset.num_columns)),
        ("<<COLNAMES>>", str(dataset.column_names)),
        ("<<SOURCE_LINK>>", link_to_source),
    )
    for placeholder, value in substitutions:
        assert placeholder in content
        content = content.replace(placeholder, value)

    # No placeholder markers may survive in the rendered card.
    assert "<<" not in content
    assert ">>" not in content

    # The directory may be missing if the clone failed before creating it.
    os.makedirs(dataset_name, exist_ok=True)
    with open(os.path.join(dataset_name, "README.md"), "w", encoding="utf-8") as f:
        f.write(content)

    if repo is None:
        print("skipping commit/push of README.md: dataset repository was not cloned")
        return

    try:
        repo.commit("Update README.md")
        repo.push_to_hub()
    except Exception as e:
        # Best effort: a failed push is reported but does not fail the test.
        print(str(e))
|