# HITL-KG / src/core/config.py
# Uploaded by avojarot ("Upload 22 files", commit c5880fb, verified)
"""
HITL-KG Configuration Module
Centralized configuration for the entire system.
"""
import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any
from pathlib import Path
import yaml
import logging
logger = logging.getLogger(__name__)
@dataclass
class EmbeddingConfig:
    """Configuration for embedding-based search."""
    # Hugging Face model identifier used to compute sentence embeddings.
    model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    # Directory where computed embeddings are cached on disk.
    cache_dir: str = "./data/embeddings"
    # Dimensionality of the embedding vectors; presumably must match the
    # model's output size (384 for MiniLM-L12-v2) — confirm against the encoder.
    dimension: int = 384
    # Number of texts encoded per forward pass.
    batch_size: int = 32
    device: str = "cpu"  # "cpu", "cuda", "mps"
@dataclass
class DatasetConfig:
    """Configuration for a single dataset."""
    # Unique, human-readable dataset identifier (e.g. "disease_ontology").
    name: str
    source_type: str  # "obo", "owl", "csv", "json"
    # Exactly one of source_url / source_path is expected to be set:
    # remote datasets are fetched from source_url, local ones read from source_path.
    source_url: Optional[str] = None
    source_path: Optional[str] = None
    # Category label applied to entities loaded from this dataset.
    entity_category: str = "entity"
    # Whether downloaded/parsed data may be cached locally.
    cache_enabled: bool = True
    # Cached copies older than this are considered stale and re-fetched.
    cache_max_age_days: int = 7
@dataclass
class LLMConfig:
    """Configuration for LLM providers."""
    provider: str = "local"  # "openai", "local"
    # Model name passed to the provider (only meaningful for "openai").
    model: str = "gpt-4o-mini"
    # Sampling temperature for generation.
    temperature: float = 0.7
    # Upper bound on generated tokens per request.
    max_tokens: int = 2048
    # Provider API key; load_config() fills this from OPENAI_API_KEY when set.
    api_key: Optional[str] = None
@dataclass
class AppConfig:
    """Main application configuration.

    Instantiating this class has a side effect: `__post_init__` creates
    `data_dir`, `cache_dir`, and `sessions_dir` on disk if missing.
    """
    # Paths (created on instantiation, see __post_init__)
    data_dir: Path = field(default_factory=lambda: Path("./data"))
    cache_dir: Path = field(default_factory=lambda: Path("./data/cache"))
    sessions_dir: Path = field(default_factory=lambda: Path("./data/sessions"))
    # Server bind address/port and debug flag
    host: str = "0.0.0.0"
    port: int = 7860
    debug: bool = False
    # Session lifecycle limits
    session_max_age_hours: int = 24
    session_cleanup_interval_minutes: int = 5
    max_sessions: int = 1000
    # Embeddings
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    # LLM
    llm: LLMConfig = field(default_factory=LLMConfig)
    # Datasets to load (empty by default; load_config() fills in
    # DEFAULT_MEDICAL_DATASETS when no datasets are configured)
    datasets: List[DatasetConfig] = field(default_factory=list)
    # UI language settings
    default_language: str = "en"
    supported_languages: List[str] = field(
        default_factory=lambda: ["en", "uk", "ru", "es", "de", "fr"]
    )
    def __post_init__(self):
        """Ensure directories exist."""
        # Create all configured directories up front so downstream code
        # can assume they are present.
        for path in [self.data_dir, self.cache_dir, self.sessions_dir]:
            path.mkdir(parents=True, exist_ok=True)
# Default medical datasets configuration.
# NOTE: this is a shared module-level list — consumers should copy it
# (e.g. list(DEFAULT_MEDICAL_DATASETS)) rather than mutate it in place.
DEFAULT_MEDICAL_DATASETS = [
    # Human Disease Ontology (DOID), fetched as OBO from GitHub.
    DatasetConfig(
        name="disease_ontology",
        source_type="obo",
        source_url="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo",
        entity_category="disease",
    ),
    # Symptom Ontology (SYMP), fetched as OBO from GitHub.
    DatasetConfig(
        name="symptom_ontology",
        source_type="obo",
        source_url="https://raw.githubusercontent.com/DiseaseOntology/SymptomOntology/main/symp.obo",
        entity_category="symptom",
    ),
]
def load_config(config_path: Optional[str] = None) -> AppConfig:
    """
    Load configuration from file or use defaults.

    Priority:
    1. Explicit config file path
    2. Environment variable HITL_KG_CONFIG
    3. ./config.yaml if exists
    4. Default configuration

    Args:
        config_path: Optional explicit path to a YAML configuration file.

    Returns:
        A fully populated AppConfig. When no config file is found (or
        loading fails), defaults are used with environment overrides
        (OPENAI_API_KEY, PORT) applied.
    """
    config_file: Optional[Path] = None
    if config_path:
        config_file = Path(config_path)
    elif os.environ.get("HITL_KG_CONFIG"):
        config_file = Path(os.environ["HITL_KG_CONFIG"])
    elif Path("./config.yaml").exists():
        config_file = Path("./config.yaml")
    if config_file and config_file.exists():
        try:
            with open(config_file, encoding="utf-8") as f:
                # safe_load returns None for an empty file; normalize to {}
                # so _parse_config always receives a dict.
                data = yaml.safe_load(f) or {}
            return _parse_config(data)
        except Exception as e:
            # Best-effort: fall through to defaults rather than crash startup.
            logger.warning("Failed to load config from %s: %s", config_file, e)
    # Return default config with medical datasets.
    config = AppConfig()
    # Copy the default list so later mutation of config.datasets cannot
    # corrupt the shared module-level DEFAULT_MEDICAL_DATASETS.
    config.datasets = list(DEFAULT_MEDICAL_DATASETS)
    # Override from environment.
    if os.environ.get("OPENAI_API_KEY"):
        config.llm.api_key = os.environ["OPENAI_API_KEY"]
        config.llm.provider = "openai"
    port_env = os.environ.get("PORT")
    if port_env:
        try:
            config.port = int(port_env)
        except ValueError:
            # A malformed PORT should not abort startup; keep the default.
            logger.warning("Ignoring invalid PORT value %r", port_env)
    return config
def _parse_config(data: Dict[str, Any]) -> AppConfig:
    """Parse a configuration dictionary (loaded from YAML) into AppConfig.

    Args:
        data: Mapping of top-level configuration keys; unknown keys are ignored.

    Returns:
        An AppConfig with values from `data` overriding the defaults.
    """
    config = AppConfig()
    # Parse simple scalar fields.
    for key in ("host", "port", "debug", "default_language"):
        if key in data:
            setattr(config, key, data[key])
    # Parse paths.
    for key in ("data_dir", "cache_dir", "sessions_dir"):
        if key in data:
            setattr(config, key, Path(data[key]))
    # AppConfig.__post_init__ only created the *default* directories; make
    # sure any directories overridden above exist as well.
    for directory in (config.data_dir, config.cache_dir, config.sessions_dir):
        directory.mkdir(parents=True, exist_ok=True)
    # Parse embedding config.
    if "embedding" in data:
        config.embedding = EmbeddingConfig(**data["embedding"])
    # Parse LLM config.
    if "llm" in data:
        config.llm = LLMConfig(**data["llm"])
    # Parse datasets.
    if "datasets" in data:
        config.datasets = [DatasetConfig(**ds) for ds in data["datasets"]]
    else:
        # Copy so mutation of config.datasets cannot corrupt the shared
        # module-level DEFAULT_MEDICAL_DATASETS list.
        config.datasets = list(DEFAULT_MEDICAL_DATASETS)
    return config
# Process-wide configuration singleton, populated on first access.
_config: Optional[AppConfig] = None
def get_config() -> AppConfig:
    """Return the global AppConfig, loading it lazily on first call."""
    global _config
    if _config is not None:
        return _config
    _config = load_config()
    return _config