WJ88 committed on
Commit 5b490e7 · verified · 1 Parent(s): c054158

Update app.py

Files changed (1)
  1. app.py +83 -68
app.py CHANGED
@@ -7,10 +7,32 @@ import warnings
  import torch
  import logging
  import io
- import librosa
+ import os
+ import datetime
 
  warnings.filterwarnings("ignore")
- logging.getLogger("nemo").setLevel(logging.ERROR) # Suppress NeMo logs
+ 
+ # Setup file-based logging for persistence
+ LOG_FILE = "/tmp/app_logs.txt"
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s',
+     handlers=[
+         logging.FileHandler(LOG_FILE, mode='a'),
+         logging.StreamHandler() # Also to console for HF logs
+     ]
+ )
+ logger = logging.getLogger(__name__)
+ 
+ def append_log(message):
+     """Append log message to file and return updated log content."""
+     logger.info(message)
+     try:
+         with open(LOG_FILE, 'r') as f:
+             logs = f.read()
+     except FileNotFoundError:
+         logs = ""
+     return logs
 
  # Global model loader
  model = None
@@ -18,11 +40,13 @@ model = None
  def load_model():
      global model
      if model is None:
+         logger.info("Loading Parakeet v3 model...")
          model = nemo_asr.models.ASRModel.from_pretrained(
              model_name="nvidia/parakeet-tdt-0.6b-v3",
              map_location="cpu"
          )
          model.eval()
+         logger.info("Model loaded successfully.")
      return model
 
  class TranscriptionState:
@@ -33,15 +57,21 @@ class TranscriptionState:
  def transcribe_segment(segment_array: np.ndarray):
      """Transcribe a normalized audio segment."""
      load_model()
+     logger.info(f"Transcribing segment of length {len(segment_array)} samples.")
      with torch.no_grad(), warnings.catch_warnings():
          warnings.simplefilter("ignore")
          output = model.transcribe([segment_array])
+     logger.info(f"Transcription complete: '{output[0][:50]}...'")
      return output[0]
 
  def process_live_audio(chunk_bytes, state: TranscriptionState):
      """Process live mic PCM bytes chunk with VAD and buffer management."""
      if chunk_bytes is None or len(chunk_bytes) == 0:
-         return state.text, state
+         logger.debug("Empty chunk received.")
+         return state.text, state, append_log("Empty chunk skipped.")
+ 
+     chunk_size = len(chunk_bytes)
+     logger.debug(f"Received chunk of {chunk_size} bytes.")
 
      # Create AudioSegment from raw PCM bytes (16kHz mono int16)
      try:
@@ -52,22 +82,26 @@ def process_live_audio(chunk_bytes, state: TranscriptionState):
              channels=1
          )
      except Exception as e:
-         print(f"Chunk creation error: {e}")
-         return state.text, state
+         logger.error(f"Chunk creation error: {e}")
+         return state.text, state, append_log(f"Chunk error: {e}")
 
      # Append to buffer
      if state.buffer is None:
          state.buffer = new_segment
+         logger.debug("Initialized new buffer.")
      else:
          state.buffer += new_segment
 
+     buffer_dur = state.buffer.duration_seconds
+     logger.debug(f"Buffer duration: {buffer_dur:.1f}s")
+ 
      # Trim buffer to prevent accumulation (keep last 60s)
-     if state.buffer.duration_seconds > 60:
-         # Re-transcribe full current buffer before trimming
+     if buffer_dur > 60:
+         logger.info("Buffer exceeded 60s; trimming and re-transcribing.")
          full_array = np.array(state.buffer.get_array_of_samples(), dtype=np.float32) / 32768.0
          state.text = transcribe_segment(full_array)
-         # Trim to last 30s for ongoing buffer
          state.buffer = state.buffer[-30000:]
+         return state.text, state, append_log("Buffer trimmed at 60s.")
 
      # VAD: Detect pauses in current buffer
      silent_windows = detect_silence(
@@ -79,40 +113,25 @@
      if len(silent_windows) > 0:
          last_silence_end = silent_windows[-1][1]
          if last_silence_end < len(state.buffer):
-             # Transcribe up to end of last silence
+             logger.info(f"VAD detected pause at {last_silence_end}ms; transcribing up to pause.")
              segment = state.buffer[:last_silence_end]
              segment_array = np.array(segment.get_array_of_samples(), dtype=np.float32) / 32768.0
              partial_text = transcribe_segment(segment_array)
              state.text = partial_text
-             # Keep remaining as buffer
              state.buffer = state.buffer[last_silence_end:]
+             return state.text, state, append_log(f"VAD update: Pause detected, transcribed '{partial_text[:50]}...'")
 
-     return state.text, state
- 
- def transcribe_file(audio_path):
-     """Batch transcribe uploaded file path."""
-     if audio_path is None or not os.path.exists(audio_path):
-         return ""
-     try:
-         audio_data, sr = librosa.load(audio_path, sr=16000, mono=True)
-         if len(audio_data) == 0:
-             return ""
-     except Exception:
-         return "Error loading file."
-     load_model()
-     with torch.no_grad(), warnings.catch_warnings():
-         warnings.simplefilter("ignore")
-         output = model.transcribe([audio_data])
-     return output[0]
+     return state.text, state, append_log(f"Chunk appended; buffer at {buffer_dur:.1f}s, awaiting pause.")
 
  def clear_session(state: TranscriptionState):
      """Reset session."""
      state.buffer = None
      state.text = ""
-     return "", state
+     logger.info("Session cleared by user.")
+     return "", state, append_log("Session cleared.")
 
- # Gradio UI with Blocks for tabs
- with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
+ # Gradio UI (mic-only)
+ with gr.Blocks(title="Parakeet v3 Real-Time Mic Transcription") as demo:
      gr.Markdown(
          """
          # NVIDIA Parakeet-TDT 0.6B v3 Real-Time Transcription
@@ -120,47 +139,43 @@ with gr.Blocks(title="Parakeet v3 Real-Time Transcription") as demo:
          """
      )
 
-     with gr.Tab("Live Microphone"):
-         state = gr.State(TranscriptionState())
-         audio_input = gr.Audio(
-             sources=["microphone"],
-             type="bytes",
-             streaming=True,
-             label="Speak now—updates on pauses"
-         )
-         output_text = gr.Textbox(
-             label="Live Transcription",
-             lines=10,
-             interactive=False
-         )
-         clear_btn = gr.Button("Clear Session", variant="secondary")
- 
-         # Stream updates on each chunk
-         audio_input.change(
-             process_live_audio,
-             inputs=[audio_input, state],
-             outputs=[output_text, state],
-             show_progress=False # Avoid UI flicker during fast chunks
-         )
-         clear_btn.click(
-             clear_session,
-             inputs=state,
-             outputs=[output_text, state]
-         )
- 
-     with gr.Tab("File Upload"):
-         file_input = gr.Audio(sources=["upload"], type="filepath")
-         file_output = gr.Textbox(label="File Transcription", lines=10)
-         transcribe_btn = gr.Button("Transcribe File")
-         transcribe_btn.click(
-             transcribe_file,
-             inputs=file_input,
-             outputs=file_output
-         )
+     state = gr.State(TranscriptionState())
+     audio_input = gr.Audio(
+         sources=["microphone"],
+         type="bytes",
+         streaming=True,
+         label="Speak now—updates on pauses",
+         waveform_options={"show_recording_waveform": True}
+     )
+     output_text = gr.Textbox(
+         label="Live Transcription",
+         lines=10,
+         interactive=False
+     )
+     log_text = gr.Textbox(
+         label="Debug Logs (Persistent)",
+         lines=15,
+         interactive=False,
+         show_copy_button=True
+     )
+     clear_btn = gr.Button("Clear Session", variant="secondary")
+ 
+     # Stream updates on each chunk
+     audio_input.change(
+         process_live_audio,
+         inputs=[audio_input, state],
+         outputs=[output_text, state, log_text],
+         show_progress="minimal"
+     )
+     clear_btn.click(
+         clear_session,
+         inputs=state,
+         outputs=[output_text, state, log_text]
+     )
 
      gr.Markdown(
          """
-         **Tips:** Speak clearly with brief pauses for instant updates. Long monologues auto-update every 60s. Clear resets buffer for fresh starts.
+         **Tips:** Speak clearly with brief pauses for instant updates. Long monologues auto-update every 60s. Logs show real-time debug info.
          """
      )
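
For readers following the pause-driven buffering that this commit keeps at the core of process_live_audio, the standalone sketch below (not part of the commit) shows the same pydub pattern in isolation: raw 16 kHz mono int16 PCM bytes are wrapped in an AudioSegment, detect_silence finds pauses, and the audio up to the last pause is normalized to float32 for the ASR model. The function names, silence thresholds, and synthetic tone input here are illustrative assumptions; the diff does not show the app's actual detect_silence parameters.

import numpy as np
from pydub import AudioSegment
from pydub.silence import detect_silence

SAMPLE_RATE = 16000  # Hz, same rate as the app's mic stream

def pcm_bytes_to_segment(chunk_bytes: bytes) -> AudioSegment:
    """Wrap raw mono int16 PCM bytes in a pydub AudioSegment."""
    return AudioSegment(
        data=chunk_bytes,
        sample_width=2,        # int16 -> 2 bytes per sample
        frame_rate=SAMPLE_RATE,
        channels=1,
    )

def split_at_last_pause(buffer: AudioSegment):
    """Return (float32 samples up to the last pause, remaining buffer)."""
    # Thresholds are illustrative assumptions, not values from the app.
    silences = detect_silence(buffer, min_silence_len=500, silence_thresh=-40)
    if not silences:
        return None, buffer  # no pause yet, keep buffering
    cut_ms = silences[-1][1]  # end of the last silent window, in milliseconds
    head, tail = buffer[:cut_ms], buffer[cut_ms:]
    samples = np.array(head.get_array_of_samples(), dtype=np.float32) / 32768.0
    return samples, tail

if __name__ == "__main__":
    # Fake mic input: 1 s of 440 Hz tone, 1 s of silence, 1 s of tone again.
    t = np.arange(SAMPLE_RATE) / SAMPLE_RATE
    tone = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    silence = np.zeros(SAMPLE_RATE, dtype=np.int16)
    chunk = np.concatenate([tone, silence, tone]).tobytes()

    buffer = pcm_bytes_to_segment(chunk)
    samples, remaining = split_at_last_pause(buffer)
    if samples is not None:
        print(f"{len(samples)} samples ready for ASR, {remaining.duration_seconds:.1f}s kept in buffer")

In the app itself the split happens per streamed chunk and the text up to the cut is handed to transcribe_segment, while the tail stays in state.buffer, which is why the transcript only refreshes when the speaker pauses.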