import os
from pathlib import Path
import uuid
import json
from huggingface_hub import HfApi, HfFileSystem

DATASET_REPO = "EarthSpeciesProject/naturelm-audio-space-logs"
SPLIT = "test"
TESTING = os.getenv("TESTING", "0") == "1"
api = HfApi(token=os.getenv("HF_TOKEN", None))
# Filesystem view of the Hub, used to check whether files already exist and
# to read/write metadata.jsonl in the dataset repo.
hf_fs = HfFileSystem(token=os.getenv("HF_TOKEN", None))


def upload_data(audio: str | Path, user_text: str, model_response: str, session_id: str = ""):
    """Upload an audio clip and log the exchange in {SPLIT}/metadata.jsonl of the dataset repo."""
    data_id = str(uuid.uuid4())

    if TESTING:
        # Prefix test runs so they are easy to filter out of the dataset later.
        data_id = "test-" + data_id
        session_id = "test-" + session_id

    # Audio path in repo
    suffix = Path(audio).suffix
    audio_p = f"{SPLIT}/audio/" + session_id + suffix

    # Skip the upload if this session's audio is already in the repo.
    if not hf_fs.exists(f"datasets/{DATASET_REPO}/{audio_p}"):
        api.upload_file(
            path_or_fileobj=str(audio),
            path_in_repo=audio_p,
            repo_id=DATASET_REPO,
            repo_type="dataset",
        )

    text = {
        "user_message": user_text,
        "model_response": model_response,
        "file_name": "audio/" + session_id + suffix,  # has to be relative to metadata.jsonl
        "original_fn": os.path.basename(audio),
        "id": data_id,
        "session_id": session_id,
    }

    # Append the record to metadata.jsonl in the repo. HfFileSystem does not
    # support append mode, so read the existing contents and rewrite the file.
    # Note: two concurrent calls can overwrite each other's record.
    if hf_fs.exists(f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl"):
        with hf_fs.open(f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl", "r") as f:
            lines = f.readlines()
        lines.append(json.dumps(text) + "\n")
        with hf_fs.open(f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl", "w") as f:
            f.writelines(lines)
    else:
        with hf_fs.open(f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl", "w") as f:
            f.write(json.dumps(text) + "\n")

    # Alternative: write each record to its own JSON file instead of a shared metadata.jsonl.
    # with hf_fs.open(f"datasets/{DATASET_REPO}/{data_id}.json", "w") as f:
    #     json.dump(text, f)
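

if __name__ == "__main__":
    # Minimal local smoke test: a sketch with hypothetical paths and messages,
    # not part of the app itself. Assumes HF_TOKEN has write access to
    # DATASET_REPO and that "sample.wav" exists on disk.
    upload_data(
        audio="sample.wav",
        user_text="What species is calling in this clip?",
        model_response="This sounds most like a Common Loon (Gavia immer).",
        session_id=str(uuid.uuid4()),
    )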