CoSTA / ST /evaluate /tests /conftest.py
bhavanishankarpullela's picture
Upload 360 files
b817ab5 verified
import csv
import json
import lzma
import os
import tarfile
import textwrap
import datasets
import pyarrow as pa
import pyarrow.parquet as pq
import pytest
from datasets import config
from datasets.arrow_dataset import Dataset
from datasets.features import ClassLabel, Features, Sequence, Value
@pytest.fixture(autouse=True)
def set_test_cache_config(tmp_path_factory, monkeypatch):
# test_hf_cache_home = tmp_path_factory.mktemp("cache") # TODO: why a cache dir per test function does not work?
test_hf_cache_home = tmp_path_factory.getbasetemp() / "cache"
test_hf_evaluate_cache = test_hf_cache_home / "datasets"
test_hf_metrics_cache = test_hf_cache_home / "metrics"
test_hf_modules_cache = test_hf_cache_home / "modules"
monkeypatch.setattr("evaluate.config.HF_EVALUATE_CACHE", str(test_hf_evaluate_cache))
monkeypatch.setattr("evaluate.config.HF_METRICS_CACHE", str(test_hf_metrics_cache))
monkeypatch.setattr("evaluate.config.HF_MODULES_CACHE", str(test_hf_modules_cache))
test_DOWNLOADED_EVALUATE_PATH = test_hf_evaluate_cache / "downloads"
monkeypatch.setattr("evaluate.config.DOWNLOADED_EVALUATE_PATH", str(test_DOWNLOADED_EVALUATE_PATH))
test_EXTRACTED_EVALUATE_PATH = test_hf_evaluate_cache / "downloads" / "extracted"
monkeypatch.setattr("evaluate.config.EXTRACTED_EVALUATE_PATH", str(test_EXTRACTED_EVALUATE_PATH))
@pytest.fixture(autouse=True, scope="session")
def disable_tqdm_output():
datasets.disable_progress_bar()
@pytest.fixture(autouse=True)
def set_update_download_counts_to_false(monkeypatch):
# don't take tests into account when counting downloads
monkeypatch.setattr("evaluate.config.HF_UPDATE_DOWNLOAD_COUNTS", False)
monkeypatch.setattr("datasets.config.HF_UPDATE_DOWNLOAD_COUNTS", False)
FILE_CONTENT = """\
Text data.
Second line of data."""
@pytest.fixture(scope="session")
def dataset():
n = 10
features = Features(
{
"tokens": Sequence(Value("string")),
"labels": Sequence(ClassLabel(names=["negative", "positive"])),
"answers": Sequence(
{
"text": Value("string"),
"answer_start": Value("int32"),
}
),
"id": Value("int64"),
}
)
dataset = Dataset.from_dict(
{
"tokens": [["foo"] * 5] * n,
"labels": [[1] * 5] * n,
"answers": [{"answer_start": [97], "text": ["1976"]}] * 10,
"id": list(range(n)),
},
features=features,
)
return dataset
@pytest.fixture(scope="session")
def arrow_file(tmp_path_factory, dataset):
filename = str(tmp_path_factory.mktemp("data") / "file.arrow")
dataset.map(cache_file_name=filename)
return filename
@pytest.fixture(scope="session")
def text_file(tmp_path_factory):
filename = tmp_path_factory.mktemp("data") / "file.txt"
data = FILE_CONTENT
with open(filename, "w") as f:
f.write(data)
return filename
@pytest.fixture(scope="session")
def xz_file(tmp_path_factory):
filename = tmp_path_factory.mktemp("data") / "file.txt.xz"
data = bytes(FILE_CONTENT, "utf-8")
with lzma.open(filename, "wb") as f:
f.write(data)
return filename
@pytest.fixture(scope="session")
def gz_file(tmp_path_factory):
import gzip
path = str(tmp_path_factory.mktemp("data") / "file.txt.gz")
data = bytes(FILE_CONTENT, "utf-8")
with gzip.open(path, "wb") as f:
f.write(data)
return path
@pytest.fixture(scope="session")
def bz2_file(tmp_path_factory):
import bz2
path = tmp_path_factory.mktemp("data") / "file.txt.bz2"
data = bytes(FILE_CONTENT, "utf-8")
with bz2.open(path, "wb") as f:
f.write(data)
return path
@pytest.fixture(scope="session")
def zstd_file(tmp_path_factory):
if config.ZSTANDARD_AVAILABLE:
import zstandard as zstd
path = tmp_path_factory.mktemp("data") / "file.txt.zst"
data = bytes(FILE_CONTENT, "utf-8")
with zstd.open(path, "wb") as f:
f.write(data)
return path
@pytest.fixture(scope="session")
def lz4_file(tmp_path_factory):
if config.LZ4_AVAILABLE:
import lz4.frame
path = tmp_path_factory.mktemp("data") / "file.txt.lz4"
data = bytes(FILE_CONTENT, "utf-8")
with lz4.frame.open(path, "wb") as f:
f.write(data)
return path
@pytest.fixture(scope="session")
def xml_file(tmp_path_factory):
filename = tmp_path_factory.mktemp("data") / "file.xml"
data = textwrap.dedent(
"""\
<?xml version="1.0" encoding="UTF-8" ?>
<tmx version="1.4">
<header segtype="sentence" srclang="ca" />
<body>
<tu>
<tuv xml:lang="ca"><seg>Contingut 1</seg></tuv>
<tuv xml:lang="en"><seg>Content 1</seg></tuv>
</tu>
<tu>
<tuv xml:lang="ca"><seg>Contingut 2</seg></tuv>
<tuv xml:lang="en"><seg>Content 2</seg></tuv>
</tu>
<tu>
<tuv xml:lang="ca"><seg>Contingut 3</seg></tuv>
<tuv xml:lang="en"><seg>Content 3</seg></tuv>
</tu>
<tu>
<tuv xml:lang="ca"><seg>Contingut 4</seg></tuv>
<tuv xml:lang="en"><seg>Content 4</seg></tuv>
</tu>
<tu>
<tuv xml:lang="ca"><seg>Contingut 5</seg></tuv>
<tuv xml:lang="en"><seg>Content 5</seg></tuv>
</tu>
</body>
</tmx>"""
)
with open(filename, "w") as f:
f.write(data)
return filename
DATA = [
{"col_1": "0", "col_2": 0, "col_3": 0.0},
{"col_1": "1", "col_2": 1, "col_3": 1.0},
{"col_1": "2", "col_2": 2, "col_3": 2.0},
{"col_1": "3", "col_2": 3, "col_3": 3.0},
]
DATA2 = [
{"col_1": "4", "col_2": 4, "col_3": 4.0},
{"col_1": "5", "col_2": 5, "col_3": 5.0},
]
DATA_DICT_OF_LISTS = {
"col_1": ["0", "1", "2", "3"],
"col_2": [0, 1, 2, 3],
"col_3": [0.0, 1.0, 2.0, 3.0],
}
DATA_312 = [
{"col_3": 0.0, "col_1": "0", "col_2": 0},
{"col_3": 1.0, "col_1": "1", "col_2": 1},
]
DATA_STR = [
{"col_1": "s0", "col_2": 0, "col_3": 0.0},
{"col_1": "s1", "col_2": 1, "col_3": 1.0},
{"col_1": "s2", "col_2": 2, "col_3": 2.0},
{"col_1": "s3", "col_2": 3, "col_3": 3.0},
]
@pytest.fixture(scope="session")
def dataset_dict():
return DATA_DICT_OF_LISTS
@pytest.fixture(scope="session")
def arrow_path(tmp_path_factory):
dataset = Dataset.from_dict(DATA_DICT_OF_LISTS)
path = str(tmp_path_factory.mktemp("data") / "dataset.arrow")
dataset.map(cache_file_name=path)
return path
@pytest.fixture(scope="session")
def csv_path(tmp_path_factory):
path = str(tmp_path_factory.mktemp("data") / "dataset.csv")
with open(path, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
writer.writeheader()
for item in DATA:
writer.writerow(item)
return path
@pytest.fixture(scope="session")
def csv2_path(tmp_path_factory):
path = str(tmp_path_factory.mktemp("data") / "dataset2.csv")
with open(path, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=["col_1", "col_2", "col_3"])
writer.writeheader()
for item in DATA:
writer.writerow(item)
return path
@pytest.fixture(scope="session")
def bz2_csv_path(csv_path, tmp_path_factory):
import bz2
path = tmp_path_factory.mktemp("data") / "dataset.csv.bz2"
with open(csv_path, "rb") as f:
data = f.read()
# data = bytes(FILE_CONTENT, "utf-8")
with bz2.open(path, "wb") as f:
f.write(data)
return path
@pytest.fixture(scope="session")
def zip_csv_path(csv_path, csv2_path, tmp_path_factory):
import zipfile
path = tmp_path_factory.mktemp("data") / "dataset.csv.zip"
with zipfile.ZipFile(path, "w") as f:
f.write(csv_path, arcname=os.path.basename(csv_path))
f.write(csv2_path, arcname=os.path.basename(csv2_path))
return path
@pytest.fixture(scope="session")
def zip_csv_with_dir_path(csv_path, csv2_path, tmp_path_factory):
import zipfile
path = tmp_path_factory.mktemp("data") / "dataset_with_dir.csv.zip"
with zipfile.ZipFile(path, "w") as f:
f.write(csv_path, arcname=os.path.join("main_dir", os.path.basename(csv_path)))
f.write(csv2_path, arcname=os.path.join("main_dir", os.path.basename(csv2_path)))
return path
@pytest.fixture(scope="session")
def parquet_path(tmp_path_factory):
path = str(tmp_path_factory.mktemp("data") / "dataset.parquet")
schema = pa.schema(
{
"col_1": pa.string(),
"col_2": pa.int64(),
"col_3": pa.float64(),
}
)
with open(path, "wb") as f:
writer = pq.ParquetWriter(f, schema=schema)
pa_table = pa.Table.from_pydict({k: [DATA[i][k] for i in range(len(DATA))] for k in DATA[0]}, schema=schema)
writer.write_table(pa_table)
writer.close()
return path
@pytest.fixture(scope="session")
def json_list_of_dicts_path(tmp_path_factory):
path = str(tmp_path_factory.mktemp("data") / "dataset.json")
data = {"data": DATA}
with open(path, "w") as f:
json.dump(data, f)
return path
@pytest.fixture(scope="session")
def json_dict_of_lists_path(tmp_path_factory):
path = str(tmp_path_factory.mktemp("data") / "dataset.json")
data = {"data": DATA_DICT_OF_LISTS}
with open(path, "w") as f:
json.dump(data, f)
return path
@pytest.fixture(scope="session")
def jsonl_path(tmp_path_factory):
path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl")
with open(path, "w") as f:
for item in DATA:
f.write(json.dumps(item) + "\n")
return path
@pytest.fixture(scope="session")
def jsonl2_path(tmp_path_factory):
path = str(tmp_path_factory.mktemp("data") / "dataset2.jsonl")
with open(path, "w") as f:
for item in DATA:
f.write(json.dumps(item) + "\n")
return path
@pytest.fixture(scope="session")
def jsonl_312_path(tmp_path_factory):
path = str(tmp_path_factory.mktemp("data") / "dataset_312.jsonl")
with open(path, "w") as f:
for item in DATA_312:
f.write(json.dumps(item) + "\n")
return path
@pytest.fixture(scope="session")
def jsonl_str_path(tmp_path_factory):
path = str(tmp_path_factory.mktemp("data") / "dataset-str.jsonl")
with open(path, "w") as f:
for item in DATA_STR:
f.write(json.dumps(item) + "\n")
return path
@pytest.fixture(scope="session")
def text_gz_path(tmp_path_factory, text_path):
import gzip
path = str(tmp_path_factory.mktemp("data") / "dataset.txt.gz")
with open(text_path, "rb") as orig_file:
with gzip.open(path, "wb") as zipped_file:
zipped_file.writelines(orig_file)
return path
@pytest.fixture(scope="session")
def jsonl_gz_path(tmp_path_factory, jsonl_path):
import gzip
path = str(tmp_path_factory.mktemp("data") / "dataset.jsonl.gz")
with open(jsonl_path, "rb") as orig_file:
with gzip.open(path, "wb") as zipped_file:
zipped_file.writelines(orig_file)
return path
@pytest.fixture(scope="session")
def zip_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
import zipfile
path = tmp_path_factory.mktemp("data") / "dataset.jsonl.zip"
with zipfile.ZipFile(path, "w") as f:
f.write(jsonl_path, arcname=os.path.basename(jsonl_path))
f.write(jsonl2_path, arcname=os.path.basename(jsonl2_path))
return path
@pytest.fixture(scope="session")
def zip_jsonl_with_dir_path(jsonl_path, jsonl2_path, tmp_path_factory):
import zipfile
path = tmp_path_factory.mktemp("data") / "dataset_with_dir.jsonl.zip"
with zipfile.ZipFile(path, "w") as f:
f.write(jsonl_path, arcname=os.path.join("main_dir", os.path.basename(jsonl_path)))
f.write(jsonl2_path, arcname=os.path.join("main_dir", os.path.basename(jsonl2_path)))
return path
@pytest.fixture(scope="session")
def tar_jsonl_path(jsonl_path, jsonl2_path, tmp_path_factory):
path = tmp_path_factory.mktemp("data") / "dataset.jsonl.tar"
with tarfile.TarFile(path, "w") as f:
f.add(jsonl_path, arcname=os.path.basename(jsonl_path))
f.add(jsonl2_path, arcname=os.path.basename(jsonl2_path))
return path
@pytest.fixture(scope="session")
def tar_nested_jsonl_path(tar_jsonl_path, jsonl_path, jsonl2_path, tmp_path_factory):
path = tmp_path_factory.mktemp("data") / "dataset_nested.jsonl.tar"
with tarfile.TarFile(path, "w") as f:
f.add(tar_jsonl_path, arcname=os.path.join("nested", os.path.basename(tar_jsonl_path)))
return path
@pytest.fixture(scope="session")
def text_path(tmp_path_factory):
data = ["0", "1", "2", "3"]
path = str(tmp_path_factory.mktemp("data") / "dataset.txt")
with open(path, "w") as f:
for item in data:
f.write(item + "\n")
return path
@pytest.fixture(scope="session")
def text2_path(tmp_path_factory):
data = ["0", "1", "2", "3"]
path = str(tmp_path_factory.mktemp("data") / "dataset2.txt")
with open(path, "w") as f:
for item in data:
f.write(item + "\n")
return path
@pytest.fixture(scope="session")
def zip_text_path(text_path, text2_path, tmp_path_factory):
import zipfile
path = tmp_path_factory.mktemp("data") / "dataset.text.zip"
with zipfile.ZipFile(path, "w") as f:
f.write(text_path, arcname=os.path.basename(text_path))
f.write(text2_path, arcname=os.path.basename(text2_path))
return path
@pytest.fixture(scope="session")
def zip_text_with_dir_path(text_path, text2_path, tmp_path_factory):
import zipfile
path = tmp_path_factory.mktemp("data") / "dataset_with_dir.text.zip"
with zipfile.ZipFile(path, "w") as f:
f.write(text_path, arcname=os.path.join("main_dir", os.path.basename(text_path)))
f.write(text2_path, arcname=os.path.join("main_dir", os.path.basename(text2_path)))
return path
@pytest.fixture(scope="session")
def text_path_with_unicode_new_lines(tmp_path_factory):
text = "\n".join(["First", "Second\u2029with Unicode new line", "Third"])
path = str(tmp_path_factory.mktemp("data") / "dataset_with_unicode_new_lines.txt")
with open(path, "w", encoding="utf-8") as f:
f.write(text)
return path