# HITL-KG / src/core/config.py
# Uploaded by avojarot ("Upload 22 files", commit c5880fb, verified)
"""
HITL-KG Configuration Module
Centralized configuration for the entire system.
"""
import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any
from pathlib import Path
import yaml
import logging
logger = logging.getLogger(__name__)
@dataclass
class EmbeddingConfig:
    """Configuration for embedding-based search."""
    # Hugging Face model identifier used to compute sentence embeddings.
    model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    # Directory where computed embeddings are cached on disk.
    cache_dir: str = "./data/embeddings"
    # Dimensionality of the embedding vectors; presumably must match the
    # model's output size (384 for MiniLM-L12-v2) — confirm against the encoder.
    dimension: int = 384
    # Number of texts encoded per forward pass.
    batch_size: int = 32
    device: str = "cpu"  # "cpu", "cuda", "mps"
@dataclass
class DatasetConfig:
    """Configuration for a single dataset."""
    # Unique, human-readable dataset identifier (e.g. "disease_ontology").
    name: str
    source_type: str  # "obo", "owl", "csv", "json"
    # Exactly one of source_url / source_path is expected to be set:
    # remote datasets are fetched from source_url, local ones read from source_path.
    source_url: Optional[str] = None
    source_path: Optional[str] = None
    # Category label applied to entities loaded from this dataset.
    entity_category: str = "entity"
    # Whether downloaded/parsed data may be cached locally.
    cache_enabled: bool = True
    # Cached copies older than this are considered stale and re-fetched.
    cache_max_age_days: int = 7
@dataclass
class LLMConfig:
    """Configuration for LLM providers."""
    provider: str = "local"  # "openai", "local"
    # Model name passed to the provider (only meaningful for "openai").
    model: str = "gpt-4o-mini"
    # Sampling temperature for generation.
    temperature: float = 0.7
    # Upper bound on generated tokens per request.
    max_tokens: int = 2048
    # Provider API key; load_config() fills this from OPENAI_API_KEY when set.
    api_key: Optional[str] = None
@dataclass
class AppConfig:
    """Main application configuration.

    Instantiating this class has a side effect: `__post_init__` creates
    `data_dir`, `cache_dir`, and `sessions_dir` on disk if missing.
    """
    # Paths (created on instantiation, see __post_init__)
    data_dir: Path = field(default_factory=lambda: Path("./data"))
    cache_dir: Path = field(default_factory=lambda: Path("./data/cache"))
    sessions_dir: Path = field(default_factory=lambda: Path("./data/sessions"))
    # Server bind address/port and debug flag
    host: str = "0.0.0.0"
    port: int = 7860
    debug: bool = False
    # Session lifecycle limits
    session_max_age_hours: int = 24
    session_cleanup_interval_minutes: int = 5
    max_sessions: int = 1000
    # Embeddings
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    # LLM
    llm: LLMConfig = field(default_factory=LLMConfig)
    # Datasets to load (empty by default; load_config() fills in
    # DEFAULT_MEDICAL_DATASETS when no datasets are configured)
    datasets: List[DatasetConfig] = field(default_factory=list)
    # UI language settings
    default_language: str = "en"
    supported_languages: List[str] = field(
        default_factory=lambda: ["en", "uk", "ru", "es", "de", "fr"]
    )
    def __post_init__(self):
        """Ensure directories exist."""
        # Create all configured directories up front so downstream code
        # can assume they are present.
        for path in [self.data_dir, self.cache_dir, self.sessions_dir]:
            path.mkdir(parents=True, exist_ok=True)
# Default medical datasets configuration.
# NOTE: this is a shared module-level list — consumers should copy it
# (e.g. list(DEFAULT_MEDICAL_DATASETS)) rather than mutate it in place.
DEFAULT_MEDICAL_DATASETS = [
    # Human Disease Ontology (DOID), fetched as OBO from GitHub.
    DatasetConfig(
        name="disease_ontology",
        source_type="obo",
        source_url="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo",
        entity_category="disease",
    ),
    # Symptom Ontology (SYMP), fetched as OBO from GitHub.
    DatasetConfig(
        name="symptom_ontology",
        source_type="obo",
        source_url="https://raw.githubusercontent.com/DiseaseOntology/SymptomOntology/main/symp.obo",
        entity_category="symptom",
    ),
]
def load_config(config_path: Optional[str] = None) -> AppConfig:
    """
    Load configuration from file or use defaults.

    Priority:
    1. Explicit config file path
    2. Environment variable HITL_KG_CONFIG
    3. ./config.yaml if exists
    4. Default configuration

    Args:
        config_path: Optional explicit path to a YAML configuration file.

    Returns:
        A fully populated AppConfig. When no config file is found (or
        loading fails), defaults are used with environment overrides
        (OPENAI_API_KEY, PORT) applied.
    """
    config_file: Optional[Path] = None
    if config_path:
        config_file = Path(config_path)
    elif os.environ.get("HITL_KG_CONFIG"):
        config_file = Path(os.environ["HITL_KG_CONFIG"])
    elif Path("./config.yaml").exists():
        config_file = Path("./config.yaml")
    if config_file and config_file.exists():
        try:
            with open(config_file, encoding="utf-8") as f:
                # safe_load returns None for an empty file; normalize to {}
                # so _parse_config always receives a dict.
                data = yaml.safe_load(f) or {}
            return _parse_config(data)
        except Exception as e:
            # Best-effort: fall through to defaults rather than crash startup.
            logger.warning("Failed to load config from %s: %s", config_file, e)
    # Return default config with medical datasets.
    config = AppConfig()
    # Copy the default list so later mutation of config.datasets cannot
    # corrupt the shared module-level DEFAULT_MEDICAL_DATASETS.
    config.datasets = list(DEFAULT_MEDICAL_DATASETS)
    # Override from environment.
    if os.environ.get("OPENAI_API_KEY"):
        config.llm.api_key = os.environ["OPENAI_API_KEY"]
        config.llm.provider = "openai"
    port_env = os.environ.get("PORT")
    if port_env:
        try:
            config.port = int(port_env)
        except ValueError:
            # A malformed PORT should not abort startup; keep the default.
            logger.warning("Ignoring invalid PORT value %r", port_env)
    return config
def _parse_config(data: Dict[str, Any]) -> AppConfig:
    """Parse a configuration dictionary (loaded from YAML) into AppConfig.

    Args:
        data: Mapping of top-level configuration keys; unknown keys are ignored.

    Returns:
        An AppConfig with values from `data` overriding the defaults.
    """
    config = AppConfig()
    # Parse simple scalar fields.
    for key in ("host", "port", "debug", "default_language"):
        if key in data:
            setattr(config, key, data[key])
    # Parse paths.
    for key in ("data_dir", "cache_dir", "sessions_dir"):
        if key in data:
            setattr(config, key, Path(data[key]))
    # AppConfig.__post_init__ only created the *default* directories; make
    # sure any directories overridden above exist as well.
    for directory in (config.data_dir, config.cache_dir, config.sessions_dir):
        directory.mkdir(parents=True, exist_ok=True)
    # Parse embedding config.
    if "embedding" in data:
        config.embedding = EmbeddingConfig(**data["embedding"])
    # Parse LLM config.
    if "llm" in data:
        config.llm = LLMConfig(**data["llm"])
    # Parse datasets.
    if "datasets" in data:
        config.datasets = [DatasetConfig(**ds) for ds in data["datasets"]]
    else:
        # Copy so mutation of config.datasets cannot corrupt the shared
        # module-level DEFAULT_MEDICAL_DATASETS list.
        config.datasets = list(DEFAULT_MEDICAL_DATASETS)
    return config
# Process-wide configuration singleton, populated on first access.
_config: Optional[AppConfig] = None
def get_config() -> AppConfig:
    """Return the global AppConfig, loading it lazily on first call."""
    global _config
    if _config is not None:
        return _config
    _config = load_config()
    return _config