File size: 1,279 Bytes
7f1a0e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import json
from pathlib import Path

from datasets import load_dataset


def load_hoho_dataset(testing: bool):
    """Load the HoHo25k dataset in streaming mode.

    Args:
        testing: When true, download a snapshot of the test dataset repo into
            ``/tmp/data`` and load the tar shards found there through the
            ``hoho25k_test_x.py`` loading script expected in that directory.
            The repo id is read from the ``"dataset"`` key of ``params.json``
            in the current working directory when that file exists, and
            defaults to ``usm3d/hoho25k_test_x`` otherwise. When false, load
            ``usm3d/hoho25k`` directly by name.

    Returns:
        Whatever ``datasets.load_dataset`` returns for the chosen source. In
        testing mode the shards under ``*public*`` directories are passed as
        the ``"validation"`` split and those under ``*private*`` directories
        as the ``"test"`` split.

    Raises:
        KeyError: If ``params.json`` exists but has no ``"dataset"`` key.
    """
    # Options shared by both modes. NOTE: trust_remote_code=True allows
    # `datasets` to run the loading script shipped with the dataset.
    common_kwargs = {
        "streaming": True,
        "trust_remote_code": True,
        "writer_batch_size": 100,
    }

    if not testing:
        return load_dataset("usm3d/hoho25k", **common_kwargs)

    params_path = Path("params.json")
    if params_path.exists():
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with params_path.open(encoding="utf-8") as f:
            params = json.load(f)
        dataset_name = params["dataset"]
    else:
        dataset_name = "usm3d/hoho25k_test_x"
    data_path = Path("/tmp/data")

    # Imported lazily: only the testing path needs huggingface_hub.
    from huggingface_hub import snapshot_download

    snapshot_download(
        repo_id=dataset_name,
        local_dir=str(data_path),
        repo_type="dataset",
    )

    # rglob yields entries in filesystem order, which varies between machines;
    # sort so the shard order (and thus the streaming order) is reproducible.
    data_files = {
        "validation": sorted(
            str(p) for p in data_path.rglob("*public*/**/*.tar")
        ),
        "test": sorted(
            str(p) for p in data_path.rglob("*private*/**/*.tar")
        ),
    }

    return load_dataset(
        str(data_path / "hoho25k_test_x.py"),
        data_files=data_files,
        **common_kwargs,
    )