Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,967 Bytes
be2d363 6d06ff9 be2d363 6d06ff9 be2d363 6d06ff9 be2d363 6d06ff9 be2d363 6d06ff9 be2d363 6d06ff9 be2d363 6d06ff9 be2d363 6d06ff9 71ceb2a be2d363 6d06ff9 be2d363 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import os
from pathlib import Path
import uuid
import json
from huggingface_hub import HfApi, HfFileSystem
# Dataset repository that receives the logs, and the split folder inside it.
DATASET_REPO = "EarthSpeciesProject/naturelm-audio-space-logs"
SPLIT = "test"

# True only when the TESTING environment variable is exactly "1".
TESTING = os.environ.get("TESTING", "0") == "1"

# Single token lookup (None when HF_TOKEN is unset) shared by both Hub clients.
_HF_TOKEN = os.environ.get("HF_TOKEN")
api = HfApi(token=_HF_TOKEN)
hf_fs = HfFileSystem(token=_HF_TOKEN)
def upload_data(audio: str | Path, user_text: str, model_response: str, session_id: str = "") -> None:
    """Log one user/model exchange and its audio clip to the dataset repo.

    The audio file is uploaded to ``{SPLIT}/audio/`` unless a file with the
    same name is already there, and a JSON record describing the exchange is
    appended to ``{SPLIT}/metadata.jsonl``.

    Args:
        audio: Local path of the audio file to upload.
        user_text: The user's message, stored as ``user_message``.
        model_response: The model's reply, stored as ``model_response``.
        session_id: Identifier used to name the audio file in the repo, so
            calls sharing a session id share one uploaded clip. When empty,
            the record's own id names the audio file instead; otherwise every
            session-less call would map to the same path and only the first
            clip would ever be uploaded.

    Note:
        The metadata update rewrites the whole file (read, append, write) and
        is not atomic: concurrent callers can overwrite each other's records.
        Writing one JSON file per record would avoid that.
    """
    data_id = str(uuid.uuid4())
    has_session = bool(session_id)  # decided before the test prefix is added
    if TESTING:
        data_id = "test-" + data_id
        session_id = "test-" + session_id

    # Name of the audio file in the repo.
    audio_name = (session_id if has_session else data_id) + Path(audio).suffix
    audio_p = f"{SPLIT}/audio/" + audio_name
    if not hf_fs.exists(f"datasets/{DATASET_REPO}/{audio_p}"):
        api.upload_file(
            path_or_fileobj=str(audio),
            path_in_repo=audio_p,
            repo_id=DATASET_REPO,
            repo_type="dataset",
        )

    record = {
        "user_message": user_text,
        "model_response": model_response,
        "file_name": "audio/" + audio_name,  # has to be relative to metadata.jsonl
        "original_fn": os.path.basename(audio),
        "id": data_id,
        "session_id": session_id,
    }
    _append_metadata_line(json.dumps(record))


def _append_metadata_line(line: str) -> None:
    """Append *line* as a new row of ``{SPLIT}/metadata.jsonl`` in the repo."""
    metadata_path = f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl"

    # Appending in place does not work, so read the current rows first and
    # write the whole file back.
    lines = []
    if hf_fs.exists(metadata_path):
        with hf_fs.open(metadata_path, "r") as f:
            lines = f.readlines()

    # Terminate an unterminated last row so the new record starts on its own
    # line instead of being glued onto the previous JSON object.
    if lines and not lines[-1].endswith("\n"):
        lines[-1] += "\n"
    lines.append(line + "\n")

    with hf_fs.open(metadata_path, "w") as f:
        f.writelines(lines)
|