Spaces:

WJ88
/

RealTime-Mic-Transcription-Multilingual

Paused

App Files Community

WJ88 committed on Nov 8

Commit

5b490e7

verified ·

1 Parent(s): c054158

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -68

app.py CHANGED Viewed

@@ -7,10 +7,32 @@ import warnings
 import torch
 import logging
 import io
-import librosa
 warnings.filterwarnings("ignore")
-logging.getLogger("nemo").setLevel(logging.ERROR)  # Suppress NeMo logs
 # Global model loader
 model = None
@@ -18,11 +40,13 @@ model = None
 def load_model():
     global model
     if model is None:
         model = nemo_asr.models.ASRModel.from_pretrained(
             model_name="nvidia/parakeet-tdt-0.6b-v3",
             map_location="cpu"
         )
         model.eval()
     return model
 class TranscriptionState:
@@ -33,15 +57,21 @@ class TranscriptionState:
 def transcribe_segment(segment_array: np.ndarray):
     """Transcribe a normalized audio segment."""
     load_model()
     with torch.no_grad(), warnings.catch_warnings():
         warnings.simplefilter("ignore")
         output = model.transcribe([segment_array])
     return output[0]
 def process_live_audio(chunk_bytes, state: TranscriptionState):
     """Process live mic PCM bytes chunk with VAD and buffer management."""
     if chunk_bytes is None or len(chunk_bytes) == 0:
-        return state.text, state
     # Create AudioSegment from raw PCM bytes (16kHz mono int16)
     try:
@@ -52,22 +82,26 @@ def process_live_audio(chunk_bytes, state: TranscriptionState):
             channels=1
         )
     except Exception as e:
-        print(f"Chunk creation error: {e}")
-        return state.text, state
     # Append to buffer
     if state.buffer is None:
         state.buffer = new_segment
     else:
         state.buffer += new_segment
     # Trim buffer to prevent accumulation (keep last 60s)
-    if state.buffer.duration_seconds > 60:
-        # Re-transcribe full current buffer before trimming
         full_array = np.array(state.buffer.get_array_of_samples(), dtype=np.float32) / 32768.0
         state.text = transcribe_segment(full_array)
-        # Trim to last 30s for ongoing buffer
         state.buffer = state.buffer[-30000:]
     # VAD: Detect pauses in current buffer
     silent_windows = detect_silence(
@@ -79,40 +113,25 @@ def process_live_audio(chunk_bytes, state: TranscriptionState):
     if len(silent_windows) > 0:
         last_silence_end = silent_windows[-1][1]
         if last_silence_end < len(state.buffer):
-            # Transcribe up to end of last silence
             segment = state.buffer[:last_silence_end]
             segment_array = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32768.0
             partial_text = transcribe_segment(segment_array)
             state.text = partial_text
-            # Keep remaining as buffer
             state.buffer = state.buffer[last_silence_end:]
-    return state.text, state
-def transcribe_file(audio_path):
-    """Batch transcribe uploaded file path."""
-    if audio_path is None or not os.path.exists(audio_path):
-        return ""
-    try:
-        audio_data, sr = librosa.load(audio_path, sr=16000, mono=True)
-        if len(audio_data) == 0:
-            return ""
-    except Exception:
-        return "Error loading file."
-    load_model()
-    with torch.no_grad(), warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        output = model.transcribe([audio_data])
-    return output[0]
 def clear_session(state: TranscriptionState):
     """Reset session."""
     state.buffer = None
     state.text = ""
-    return "", state
-# Gradio UI with Blocks for tabs
-with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
     gr.Markdown(
         """
         # NVIDIA Parakeet-TDT 0.6B v3 Real-Time Transcription
@@ -120,47 +139,43 @@ with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
         """
     )
-    with gr.Tab("Live Microphone"):
-        state = gr.State(TranscriptionState())
-        audio_input = gr.Audio(
-            sources=["microphone"],
-            type="bytes",
-            streaming=True,
-            label="Speak now—updates on pauses"
-        )
-        output_text = gr.Textbox(
-            label="Live Transcription",
-            lines=10,
-            interactive=False
-        )
-        clear_btn = gr.Button("Clear Session", variant="secondary")
-        # Stream updates on each chunk
-        audio_input.change(
-            process_live_audio,
-            inputs=[audio_input, state],
-            outputs=[output_text, state],
-            show_progress=False  # Avoid UI flicker during fast chunks
-        )
-        clear_btn.click(
-            clear_session,
-            inputs=state,
-            outputs=[output_text, state]
-        )
-    with gr.Tab("File Upload"):
-        file_input = gr.Audio(sources=["upload"], type="filepath")
-        file_output = gr.Textbox(label="File Transcription", lines=10)
-        transcribe_btn = gr.Button("Transcribe File")
-        transcribe_btn.click(
-            transcribe_file,
-            inputs=file_input,
-            outputs=file_output
-        )
     gr.Markdown(
         """
-        **Tips:** Speak clearly with brief pauses for instant updates. Long monologues auto-update every 60s. Clear resets buffer for fresh starts.
         """
     )

 import torch
 import logging
 import io
+import os
+import datetime
 warnings.filterwarnings("ignore")
+# Setup file-based logging for persistence
+LOG_FILE = "/tmp/app_logs.txt"
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler(LOG_FILE, mode='a'),
+        logging.StreamHandler()  # Also to console for HF logs
+    ]
+)
+logger = logging.getLogger(__name__)
+def append_log(message):
+    """Append log message to file and return updated log content."""
+    logger.info(message)
+    try:
+        with open(LOG_FILE, 'r') as f:
+            logs = f.read()
+    except FileNotFoundError:
+        logs = ""
+    return logs
 # Global model loader
 model = None
 def load_model():
     global model
     if model is None:
+        logger.info("Loading Parakeet v3 model...")
         model = nemo_asr.models.ASRModel.from_pretrained(
             model_name="nvidia/parakeet-tdt-0.6b-v3",
             map_location="cpu"
         )
         model.eval()
+        logger.info("Model loaded successfully.")
     return model
 class TranscriptionState:
 def transcribe_segment(segment_array: np.ndarray):
     """Transcribe a normalized audio segment."""
     load_model()
+    logger.info(f"Transcribing segment of length {len(segment_array)} samples.")
     with torch.no_grad(), warnings.catch_warnings():
         warnings.simplefilter("ignore")
         output = model.transcribe([segment_array])
+    logger.info(f"Transcription complete: '{output[0][:50]}...'")
     return output[0]
 def process_live_audio(chunk_bytes, state: TranscriptionState):
     """Process live mic PCM bytes chunk with VAD and buffer management."""
     if chunk_bytes is None or len(chunk_bytes) == 0:
+        logger.debug("Empty chunk received.")
+        return state.text, state, append_log("Empty chunk skipped.")
+    chunk_size = len(chunk_bytes)
+    logger.debug(f"Received chunk of {chunk_size} bytes.")
     # Create AudioSegment from raw PCM bytes (16kHz mono int16)
     try:
             channels=1
         )
     except Exception as e:
+        logger.error(f"Chunk creation error: {e}")
+        return state.text, state, append_log(f"Chunk error: {e}")
     # Append to buffer
     if state.buffer is None:
         state.buffer = new_segment
+        logger.debug("Initialized new buffer.")
     else:
         state.buffer += new_segment
+    buffer_dur = state.buffer.duration_seconds
+    logger.debug(f"Buffer duration: {buffer_dur:.1f}s")
     # Trim buffer to prevent accumulation (keep last 60s)
+    if buffer_dur > 60:
+        logger.info("Buffer exceeded 60s; trimming and re-transcribing.")
         full_array = np.array(state.buffer.get_array_of_samples(), dtype=np.float32) / 32768.0
         state.text = transcribe_segment(full_array)
         state.buffer = state.buffer[-30000:]
+        return state.text, state, append_log("Buffer trimmed at 60s.")
     # VAD: Detect pauses in current buffer
     silent_windows = detect_silence(
     if len(silent_windows) > 0:
         last_silence_end = silent_windows[-1][1]
         if last_silence_end < len(state.buffer):
+            logger.info(f"VAD detected pause at {last_silence_end}ms; transcribing up to pause.")
             segment = state.buffer[:last_silence_end]
             segment_array = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32768.0
             partial_text = transcribe_segment(segment_array)
             state.text = partial_text
             state.buffer = state.buffer[last_silence_end:]
+            return state.text, state, append_log(f"VAD update: Pause detected, transcribed '{partial_text[:50]}...'")
+    return state.text, state, append_log(f"Chunk appended; buffer at {buffer_dur:.1f}s, awaiting pause.")
 def clear_session(state: TranscriptionState):
     """Reset session."""
     state.buffer = None
     state.text = ""
+    logger.info("Session cleared by user.")
+    return "", state, append_log("Session cleared.")
+# Gradio UI (mic-only)
+with gr.Blocks(title="Parakeet v3 Real-Time Mic Transcription") as demo:
     gr.Markdown(
         """
         # NVIDIA Parakeet-TDT 0.6B v3 Real-Time Transcription
         """
     )
+    state = gr.State(TranscriptionState())
+    audio_input = gr.Audio(
+        sources=["microphone"],
+        type="bytes",
+        streaming=True,
+        label="Speak now—updates on pauses",
+        waveform_options={"show_recording_waveform": True}
+    )
+    output_text = gr.Textbox(
+        label="Live Transcription",
+        lines=10,
+        interactive=False
+    )
+    log_text = gr.Textbox(
+        label="Debug Logs (Persistent)",
+        lines=15,
+        interactive=False,
+        show_copy_button=True
+    )
+    clear_btn = gr.Button("Clear Session", variant="secondary")
+    # Stream updates on each chunk
+    audio_input.change(
+        process_live_audio,
+        inputs=[audio_input, state],
+        outputs=[output_text, state, log_text],
+        show_progress="minimal"
+    )
+    clear_btn.click(
+        clear_session,
+        inputs=state,
+        outputs=[output_text, state, log_text]
+    )
     gr.Markdown(
         """
+        **Tips:** Speak clearly with brief pauses for instant updates. Long monologues auto-update every 60s. Logs show real-time debug info.
         """
     )