"""
HITL-KG Configuration Module

Centralized configuration for the entire system.
"""

import logging
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

logger = logging.getLogger(__name__)


@dataclass
class EmbeddingConfig:
    """Configuration for embedding-based search."""

    model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    cache_dir: str = "./data/embeddings"
    dimension: int = 384
    batch_size: int = 32
    device: str = "cpu"


@dataclass
class DatasetConfig:
    """Configuration for a single dataset."""

    name: str
    source_type: str
    source_url: Optional[str] = None
    source_path: Optional[str] = None
    entity_category: str = "entity"
    cache_enabled: bool = True
    cache_max_age_days: int = 7


@dataclass
class LLMConfig:
    """Configuration for LLM providers."""

    provider: str = "local"
    model: str = "gpt-4o-mini"
    temperature: float = 0.7
    max_tokens: int = 2048
    api_key: Optional[str] = None


@dataclass
class AppConfig:
    """Main application configuration."""

    # Storage locations
    data_dir: Path = field(default_factory=lambda: Path("./data"))
    cache_dir: Path = field(default_factory=lambda: Path("./data/cache"))
    sessions_dir: Path = field(default_factory=lambda: Path("./data/sessions"))

    # Web server
    host: str = "0.0.0.0"
    port: int = 7860
    debug: bool = False

    # Session management
    session_max_age_hours: int = 24
    session_cleanup_interval_minutes: int = 5
    max_sessions: int = 1000

    # Nested component configurations
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    llm: LLMConfig = field(default_factory=LLMConfig)
    datasets: List[DatasetConfig] = field(default_factory=list)

    # Localization
    default_language: str = "en"
    supported_languages: List[str] = field(
        default_factory=lambda: ["en", "uk", "ru", "es", "de", "fr"]
    )

    def __post_init__(self):
        """Ensure directories exist."""
        for path in [self.data_dir, self.cache_dir, self.sessions_dir]:
            path.mkdir(parents=True, exist_ok=True)


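# A quick sketch of overriding AppConfig programmatically instead of via a YAML
# file; the values below are purely illustrative, not recommended defaults:
#
#   cfg = AppConfig(
#       data_dir=Path("/tmp/hitl_kg_data"),
#       port=8080,
#       llm=LLMConfig(provider="openai", temperature=0.2),
#   )
#
# Note that __post_init__ creates the three directories as soon as the instance
# is constructed.

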
DEFAULT_MEDICAL_DATASETS = [
    DatasetConfig(
        name="disease_ontology",
        source_type="obo",
        source_url="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo",
        entity_category="disease",
    ),
    DatasetConfig(
        name="symptom_ontology",
        source_type="obo",
        source_url="https://raw.githubusercontent.com/DiseaseOntology/SymptomOntology/main/symp.obo",
        entity_category="symptom",
    ),
]


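# DatasetConfig also carries a source_path field for datasets stored on disk
# rather than behind a URL. A hypothetical local entry (the name and path are
# illustrative, not part of the defaults above) might look like:
#
#   DatasetConfig(
#       name="local_disease_subset",
#       source_type="obo",
#       source_path="./data/raw/doid_subset.obo",
#       entity_category="disease",
#       cache_enabled=False,
#   )

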
def load_config(config_path: Optional[str] = None) -> AppConfig:
    """
    Load configuration from a file or fall back to defaults.

    Priority:
    1. Explicit config file path
    2. Environment variable HITL_KG_CONFIG
    3. ./config.yaml, if it exists
    4. Default configuration
    """
    config_file = None

    if config_path:
        config_file = Path(config_path)
    elif os.environ.get("HITL_KG_CONFIG"):
        config_file = Path(os.environ["HITL_KG_CONFIG"])
    elif Path("./config.yaml").exists():
        config_file = Path("./config.yaml")

    if config_file and config_file.exists():
        try:
            with open(config_file) as f:
                data = yaml.safe_load(f)
            return _parse_config(data)
        except Exception as e:
            logger.warning(f"Failed to load config from {config_file}: {e}")

    # Fall back to the built-in defaults.
    config = AppConfig()
    config.datasets = list(DEFAULT_MEDICAL_DATASETS)  # copy, so callers don't mutate the shared default

    # Environment variable overrides.
    if os.environ.get("OPENAI_API_KEY"):
        config.llm.api_key = os.environ["OPENAI_API_KEY"]
        config.llm.provider = "openai"

    if os.environ.get("PORT"):
        config.port = int(os.environ["PORT"])

    return config


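# For reference, a config.yaml matching what _parse_config() below accepts could
# look roughly like this (keys mirror the parsing code; values are illustrative):
#
#   host: "127.0.0.1"
#   port: 8080
#   debug: true
#   default_language: "en"
#   data_dir: "./data"
#   embedding:
#     model_name: "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
#     device: "cuda"
#   llm:
#     provider: "openai"
#     model: "gpt-4o-mini"
#     temperature: 0.3
#   datasets:
#     - name: "disease_ontology"
#       source_type: "obo"
#       source_url: "https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo"
#       entity_category: "disease"

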
def _parse_config(data: Dict[str, Any]) -> AppConfig:
    """Parse a configuration dictionary into an AppConfig."""
    config = AppConfig()

    # Simple scalar settings
    for key in ["host", "port", "debug", "default_language"]:
        if key in data:
            setattr(config, key, data[key])

    # Directory settings (coerced to Path)
    for key in ["data_dir", "cache_dir", "sessions_dir"]:
        if key in data:
            setattr(config, key, Path(data[key]))

    # Nested sections
    if "embedding" in data:
        config.embedding = EmbeddingConfig(**data["embedding"])

    if "llm" in data:
        config.llm = LLMConfig(**data["llm"])

    if "datasets" in data:
        config.datasets = [DatasetConfig(**ds) for ds in data["datasets"]]
    else:
        config.datasets = list(DEFAULT_MEDICAL_DATASETS)  # copy of the shared default

    return config


# Lazily initialized module-level singleton; access it through get_config().
_config: Optional[AppConfig] = None


def get_config() -> AppConfig:
    """Get the global configuration instance."""
    global _config
    if _config is None:
        _config = load_config()
    return _config
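

# Illustrative smoke check, runnable with `python config.py` (the filename is
# assumed); importing the module elsewhere is unaffected by this block.
if __name__ == "__main__":
    cfg = get_config()
    print(f"Server: {cfg.host}:{cfg.port}")
    print(f"LLM: {cfg.llm.provider} ({cfg.llm.model})")
    print(f"Datasets: {[ds.name for ds in cfg.datasets]}")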