File size: 1,279 Bytes
7f1a0e5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | import json
from pathlib import Path
from datasets import load_dataset
def load_hoho_dataset(testing: bool):
    """Load the HoHo dataset in streaming mode.

    Args:
        testing: Selects which dataset is loaded.

            * ``False``: ``"usm3d/hoho25k"`` is passed straight to
              ``load_dataset``.
            * ``True``: a dataset repository is first downloaded into
              ``/tmp/data`` with ``snapshot_download`` and then loaded from
              the ``.tar`` files found there. The repository id is taken
              from the ``"dataset"`` key of ``params.json`` (resolved
              against the current working directory) when that file exists,
              and defaults to ``"usm3d/hoho25k_test_x"`` otherwise.

    Returns:
        The object returned by ``load_dataset(..., streaming=True)``. In
        testing mode the ``data_files`` mapping passed to it has the keys
        ``"validation"`` (tar files under directories matching ``*public*``)
        and ``"test"`` (tar files under directories matching ``*private*``).

    Raises:
        KeyError: If ``params.json`` exists but has no ``"dataset"`` key.
        json.JSONDecodeError: If ``params.json`` is not valid JSON.
    """
    if not testing:
        return load_dataset(
            "usm3d/hoho25k",
            streaming=True,
            trust_remote_code=True,
            writer_batch_size=100,
        )

    params_path = Path("params.json")
    if params_path.exists():
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with params_path.open(encoding="utf-8") as f:
            dataset_name = json.load(f)["dataset"]
    else:
        dataset_name = "usm3d/hoho25k_test_x"

    data_path = Path("/tmp/data")

    # Kept function-local: huggingface_hub is only imported in testing mode.
    from huggingface_hub import snapshot_download

    snapshot_download(
        repo_id=dataset_name,
        local_dir=str(data_path),
        repo_type="dataset",
    )

    # rglob yields in arbitrary filesystem order; sort so the shard order
    # (and therefore the streaming order) is reproducible across runs.
    data_files = {
        "validation": sorted(
            str(p) for p in data_path.rglob("*public*/**/*.tar")
        ),
        "test": sorted(
            str(p) for p in data_path.rglob("*private*/**/*.tar")
        ),
    }

    # NOTE(review): the loading script name is fixed to "hoho25k_test_x.py"
    # even when params.json overrides the repository id -- confirm that every
    # repository used here ships a script with this exact name.
    return load_dataset(
        str(data_path / "hoho25k_test_x.py"),
        data_files=data_files,
        streaming=True,
        trust_remote_code=True,
        writer_batch_size=100,
    )
|