import os
import tarfile
import time
import urllib.request

import gradio as gr
import numpy as np
import sherpa_onnx

# Download and extract the model if not present.
model_dir = "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8"
if not os.path.exists(model_dir):
    url = (
        "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/"
        "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8.tar.bz2"
    )
    urllib.request.urlretrieve(url, "model.tar.bz2")
    with tarfile.open("model.tar.bz2") as tar:
        tar.extractall()
    os.remove("model.tar.bz2")

# Parakeet-TDT is distributed by sherpa-onnx as an *offline* (non-streaming)
# NeMo transducer, so sherpa_onnx.OnlineRecognizer and its EndpointConfig do
# not apply to it. Instead we emulate streaming: buffer audio in the session
# state, re-decode the growing buffer for partial results, and commit a
# segment using a simple energy-based version of the usual endpoint rules.
TRAILING_SILENCE_SEC = 0.5  # commit after 0.5 s of silence following speech
MAX_UTTERANCE_SEC = 30.0    # force a commit after 30 s of audio
SILENCE_RMS = 0.01          # RMS below this counts as silence; tune per mic

recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
    encoder=os.path.join(model_dir, "encoder.int8.onnx"),
    decoder=os.path.join(model_dir, "decoder.int8.onnx"),
    joiner=os.path.join(model_dir, "joiner.int8.onnx"),
    tokens=os.path.join(model_dir, "tokens.txt"),
    model_type="nemo_transducer",
    provider="cpu",
    num_threads=2,  # match HF free-tier cores
)


def decode(samples: np.ndarray, sample_rate: int) -> str:
    """Decode one complete buffer. Offline streams are one-shot, so a fresh
    stream is created per call; sherpa-onnx resamples internally when the mic
    rate differs from the model's 16 kHz."""
    stream = recognizer.create_stream()
    stream.accept_waveform(sample_rate, samples)
    recognizer.decode_stream(stream)
    return stream.result.text.strip()


def transcribe(state, audio_chunk):
    if state is None:
        state = {
            "buffer": np.zeros(0, dtype=np.float32),
            "transcript": "",
            "current_partial": "",
            "log": "",
            "last_time": time.time(),
        }
    try:
        sr, y = audio_chunk
        # Gradio delivers int16 PCM; scale by the fixed int16 range rather
        # than per-chunk peak normalization, which distorts levels and
        # amplifies noise in quiet chunks.
        if np.issubdtype(y.dtype, np.integer):
            y = y.astype(np.float32) / 32768.0
        else:
            y = y.astype(np.float32)
        if y.ndim > 1:
            y = y.mean(axis=1)  # downmix to mono
        if y.size == 0:
            return state, state["transcript"] + state["current_partial"], state["log"]
        state["buffer"] = np.concatenate([state["buffer"], y])

        buf = state["buffer"]
        tail = buf[-int(TRAILING_SILENCE_SEC * sr):]
        tail_rms = float(np.sqrt(np.mean(tail ** 2)))
        has_speech = float(np.sqrt(np.mean(buf ** 2))) > SILENCE_RMS

        if not has_speech:
            # Nothing but silence so far: keep only a short tail so the
            # buffer does not grow unboundedly, and skip the decode.
            state["buffer"] = tail
            state["log"] += "Silent chunk, skipping decode.\n"
        else:
            # Re-decoding the whole buffer each chunk is quadratic over an
            # utterance; acceptable for short utterances on a demo box.
            current_text = decode(buf, sr)
            if current_text != state["current_partial"]:
                state["current_partial"] = current_text
                elapsed = time.time() - state["last_time"]
                state["log"] += (
                    f"Partial update ({elapsed:.2f}s since last): {current_text}\n"
                )
                state["last_time"] = time.time()
            # Endpoint: trailing silence after speech, or max length reached.
            if tail_rms < SILENCE_RMS or len(buf) / sr > MAX_UTTERANCE_SEC:
                if current_text:
                    state["transcript"] += current_text + " "
                    state["log"] += f"Endpoint detected, committed: {current_text}\n"
                state["buffer"] = np.zeros(0, dtype=np.float32)
                state["current_partial"] = ""
    except Exception as e:
        state["log"] += f"Error: {e}\n"
    return state, state["transcript"] + state["current_partial"], state["log"]


with gr.Blocks() as demo:
    gr.Markdown("# Real-Time Multilingual Microphone Transcription")
    with gr.Row():
        audio = gr.Audio(
            sources=["microphone"],  # Gradio 4+; Gradio 3.x used source="microphone"
            type="numpy",
            streaming=True,
            label="Speak here",
        )
        transcript = gr.Textbox(label="Transcription", interactive=False)
    logs = gr.Textbox(label="Debug Logs", interactive=False, lines=5)
    state = gr.State()
    audio.stream(transcribe, [state, audio], [state, transcript, logs])

demo.launch()
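
# ---------------------------------------------------------------------------
# Usage sketch (package names are assumptions; pin versions for your setup):
#   pip install gradio sherpa-onnx numpy
#   python app.py
# The first run downloads and unpacks the int8 model archive (several
# hundred MB), so allow time before the Gradio link appears. Subsequent
# runs reuse the extracted model_dir.
# ---------------------------------------------------------------------------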