Spaces:

Yurikks
/

yoruba-tts

Sleeping

App Files Files Community

Yurikks committed 20 days ago

Commit

a3fa620

1 Parent(s): d1848fb

Deploy Yoruba TTS API with facebook/mms-tts-yor

Browse files

Files changed (6) hide show

Dockerfile +31 -0
README.md +75 -6
cache.py +96 -0
main.py +95 -0
requirements.txt +14 -0
tts_service.py +73 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,31 @@

+# YorubaApp TTS Backend - Hugging Face Spaces
+FROM python:3.11-slim
+WORKDIR /app
+# Install system dependencies (skip recommended extras to keep the image small)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first for caching
+COPY requirements.txt .
+# Install Python dependencies (system-wide, while still root)
+RUN pip install --no-cache-dir -r requirements.txt
+# Hugging Face Spaces runs the container as UID 1000, not root. Create that
+# user and switch to it BEFORE downloading the model, so the Hugging Face
+# cache lands in a directory the runtime user owns. A model downloaded as
+# root into /root/.cache would not be found (or readable) at runtime and
+# would be downloaded again on every cold start.
+RUN useradd -m -u 1000 user && chown user:user /app
+USER user
+ENV HOME=/home/user \
+    HF_HOME=/home/user/.cache/huggingface
+# Pre-download model at build time (avoids timeout on startup)
+RUN python -c "from transformers import VitsModel, AutoTokenizer; \
+    print('Downloading facebook/mms-tts-yor model...'); \
+    VitsModel.from_pretrained('facebook/mms-tts-yor'); \
+    AutoTokenizer.from_pretrained('facebook/mms-tts-yor'); \
+    print('Model downloaded successfully!')"
+# Copy application code, owned by the runtime user
+COPY --chown=user:user . .
+# Hugging Face Spaces uses port 7860
+EXPOSE 7860
+# Run the application
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,11 +1,80 @@
 ---
-title: Yoruba Tts
-emoji: 🌍
-colorFrom: red
-colorTo: yellow
 sdk: docker
 pinned: false
-short_description: yorubaapp
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Yoruba TTS API
+emoji: "\U0001F5E3\uFE0F"
+colorFrom: yellow
+colorTo: orange
 sdk: docker
+app_port: 7860
 pinned: false
+license: cc-by-nc-4.0
 ---
+# Yoruba TTS API
+Text-to-Speech API for Yoruba language using the `facebook/mms-tts-yor` model.
+## Model Information
+- **Model**: [facebook/mms-tts-yor](https://huggingface.co/facebook/mms-tts-yor)
+- **Architecture**: VITS (Variational Inference TTS)
+- **Parameters**: 36.3M
+- **License**: CC-BY-NC 4.0 (non-commercial use)
+## API Endpoints
+### POST /tts
+Generate speech from Yoruba text.
+**Request:**
+```json
+{
+  "text": "Bawo ni"
+}
+```
+**Response:**
+```json
+{
+  "audio": "UklGRiQAAABXQVZFZm10...",
+  "cached": false
+}
+```
+The `audio` field contains base64-encoded WAV audio.
+### GET /health
+Check service health.
+**Response:**
+```json
+{
+  "status": "healthy",
+  "model": "facebook/mms-tts-yor"
+}
+```
+## Usage Example
+```python
+import requests
+import base64
+response = requests.post(
+    "https://YOUR-SPACE.hf.space/tts",
+    json={"text": "Bawo ni"}
+)
+audio_b64 = response.json()["audio"]
+audio_bytes = base64.b64decode(audio_b64)
+with open("output.wav", "wb") as f:
+    f.write(audio_bytes)
+```
+## Limitations
+- Maximum text length: 500 characters
+- Audio format: WAV (16-bit PCM)
+- Sample rate: Model default (16000 Hz for MMS-TTS checkpoints; taken from `model.config.sampling_rate`)

cache.py ADDED Viewed

	@@ -0,0 +1,96 @@

+"""
+TTS Cache using Redis or in-memory fallback
+"""
+import hashlib
+import logging
+import os
+from typing import Optional
+logger = logging.getLogger(__name__)
+# Try to import redis, fallback to in-memory cache
+try:
+    import redis.asyncio as redis
+    REDIS_AVAILABLE = True
+except ImportError:
+    REDIS_AVAILABLE = False
+    logger.warning("Redis not available, using in-memory cache")
+class TTSCache:
+    def __init__(self):
+        self.ttl = 86400 * 7  # 7 days
+        self.redis_client = None
+        self.memory_cache: dict[str, str] = {}
+        self.max_memory_items = 1000
+        # Try to connect to Redis
+        redis_url = os.environ.get("REDIS_URL", "redis://localhost:6379")
+        if REDIS_AVAILABLE:
+            try:
+                self.redis_client = redis.from_url(redis_url, decode_responses=True)
+                logger.info(f"Redis cache initialized: {redis_url}")
+            except Exception as e:
+                logger.warning(f"Redis connection failed, using memory cache: {e}")
+                self.redis_client = None
+    def _key(self, text: str) -> str:
+        """Generate cache key from text hash"""
+        return f"tts:{hashlib.md5(text.encode()).hexdigest()}"
+    async def get(self, text: str) -> Optional[str]:
+        """Get cached audio (base64) for text"""
+        key = self._key(text)
+        # Try Redis first
+        if self.redis_client:
+            try:
+                result = await self.redis_client.get(key)
+                if result:
+                    logger.debug(f"Redis cache hit for key: {key}")
+                    return result
+            except Exception as e:
+                logger.warning(f"Redis get failed: {e}")
+        # Fallback to memory cache
+        result = self.memory_cache.get(key)
+        if result:
+            logger.debug(f"Memory cache hit for key: {key}")
+        return result
+    async def set(self, text: str, audio_b64: str):
+        """Cache audio (base64) for text"""
+        key = self._key(text)
+        # Try Redis first
+        if self.redis_client:
+            try:
+                await self.redis_client.setex(key, self.ttl, audio_b64)
+                logger.debug(f"Cached to Redis: {key}")
+                return
+            except Exception as e:
+                logger.warning(f"Redis set failed: {e}")
+        # Fallback to memory cache with LRU eviction
+        if len(self.memory_cache) >= self.max_memory_items:
+            # Remove oldest item (simple FIFO, not true LRU)
+            oldest_key = next(iter(self.memory_cache))
+            del self.memory_cache[oldest_key]
+            logger.debug(f"Evicted from memory cache: {oldest_key}")
+        self.memory_cache[key] = audio_b64
+        logger.debug(f"Cached to memory: {key}")
+    async def clear(self):
+        """Clear all cached items"""
+        self.memory_cache.clear()
+        if self.redis_client:
+            try:
+                # Clear only TTS keys
+                async for key in self.redis_client.scan_iter("tts:*"):
+                    await self.redis_client.delete(key)
+            except Exception as e:
+                logger.warning(f"Redis clear failed: {e}")

main.py ADDED Viewed

	@@ -0,0 +1,95 @@

+"""
+TTS Backend for YorubaApp
+Uses facebook/mms-tts-yor model for Yoruba text-to-speech
+"""
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+import base64
+import logging
+from tts_service import TTSService
+from cache import TTSCache
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+app = FastAPI(
+    title="YorubaApp TTS API",
+    description="Text-to-Speech API for Yoruba language using MMS-TTS-YOR",
+    version="1.0.0"
+)
+# CORS - allow requests from Expo dev server and production
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Configure for production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Initialize services
+tts = TTSService()
+cache = TTSCache()
+# Request body for POST /tts.
+class TTSRequest(BaseModel):
+    text: str  # text to synthesize; the handler strips and length-checks it
+# Response body for POST /tts.
+class TTSResponse(BaseModel):
+    audio: str  # base64 encoded WAV
+    cached: bool  # True when the audio came from the cache
+@app.get("/")
+async def root():
+    return {"status": "ok", "service": "YorubaApp TTS API"}
+@app.get("/health")
+async def health():
+    return {"status": "healthy", "model": "facebook/mms-tts-yor"}
+@app.post("/tts", response_model=TTSResponse)
+async def text_to_speech(request: TTSRequest):
+    text = request.text.strip()
+    if not text:
+        raise HTTPException(status_code=400, detail="Text is required")
+    if len(text) > 500:
+        raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")
+    logger.info(f"TTS request for text: {text[:50]}...")
+    # Check cache first
+    cached_audio = await cache.get(text)
+    if cached_audio:
+        logger.info("Returning cached audio")
+        return TTSResponse(audio=cached_audio, cached=True)
+    try:
+        # Generate audio
+        audio_bytes = await tts.synthesize(text)
+        audio_b64 = base64.b64encode(audio_bytes).decode('utf-8')
+        # Cache result
+        await cache.set(text, audio_b64)
+        logger.info(f"Generated audio: {len(audio_bytes)} bytes")
+        return TTSResponse(audio=audio_b64, cached=False)
+    except Exception as e:
+        logger.error(f"TTS synthesis failed: {e}")
+        raise HTTPException(status_code=500, detail=f"TTS synthesis failed: {str(e)}")
+if __name__ == "__main__":
+    import uvicorn
+    # Port 7860 is the default for Hugging Face Spaces
+    uvicorn.run(app, host="0.0.0.0", port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+# FastAPI and server
+fastapi==0.115.6
+uvicorn[standard]==0.34.0
+pydantic==2.10.3
+# TTS Model (transformers >= 4.33 REQUIRED for MMS-TTS)
+torch>=2.0.0
+transformers>=4.33.0
+accelerate>=0.21.0
+scipy>=1.14.0
+numpy>=1.26.0
+# Utilities
+python-dotenv>=1.0.0

tts_service.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""
+TTS Service using facebook/mms-tts-yor (Yoruba)
+"""
+import io
+import logging
+import asyncio
+from functools import lru_cache
+import torch
+import numpy as np
+import scipy.io.wavfile as wavfile
+from transformers import VitsModel, AutoTokenizer
+logger = logging.getLogger(__name__)
+class TTSService:
+    def __init__(self):
+        logger.info("Loading MMS-TTS-YOR model...")
+        # Load model and tokenizer
+        self.model = VitsModel.from_pretrained("facebook/mms-tts-yor")
+        self.tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-yor")
+        # Set to evaluation mode
+        self.model.eval()
+        # Use GPU if available
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model = self.model.to(self.device)
+        logger.info(f"Model loaded on {self.device}")
+        logger.info(f"Sampling rate: {self.model.config.sampling_rate}")
+    async def synthesize(self, text: str) -> bytes:
+        """
+        Synthesize speech from Yoruba text.
+        Returns WAV audio bytes.
+        """
+        # Run synthesis in thread pool to avoid blocking
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(None, self._synthesize_sync, text)
+    def _synthesize_sync(self, text: str) -> bytes:
+        """Synchronous synthesis (runs in thread pool)"""
+        # Tokenize input
+        inputs = self.tokenizer(text, return_tensors="pt")
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        # Generate audio
+        with torch.no_grad():
+            output = self.model(**inputs).waveform
+        # Convert to numpy
+        waveform = output.squeeze().cpu().numpy()
+        # Normalize to 16-bit PCM
+        waveform = np.clip(waveform, -1.0, 1.0)
+        waveform_int16 = (waveform * 32767).astype(np.int16)
+        # Write to WAV buffer
+        buffer = io.BytesIO()
+        wavfile.write(buffer, rate=self.model.config.sampling_rate, data=waveform_int16)
+        return buffer.getvalue()
+# Singleton instance
+@lru_cache(maxsize=1)
+def get_tts_service() -> TTSService:
+    """Return a shared TTSService, constructing it on the first call.
+    lru_cache memoizes the result of this zero-argument function, so repeated
+    calls through this accessor reuse the same instance (and loaded model).
+    """
+    return TTSService()