Spaces (Running)

display dawn / teacher

Files changed:
- README.md +6 -6
- app.py +273 -0
- female-20-happy.wav +0 -0
- female-46-neutral.wav +0 -0
- male-27-sad.wav +0 -0
- male-60-angry.wav +0 -0
- requirements.txt +5 -0
README.md
CHANGED

@@ -1,14 +1,14 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Wav2Vec2 / Wav2small
+emoji: 🎵
+colorFrom: blue
+colorTo: pink
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.25.2
 app_file: app.py
 pinned: false
 license: cc-by-nc-4.0
 short_description: Perceive speech Arousal / Dominance / Valence
 ---
 
-
+A space for [Dawn](https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim) and [wav2small](https://huggingface.co/dkounadis/wav2small). Follows this [paper](https://arxiv.org/abs/2408.13920).
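As in the paper, the "Wav2Small Teacher" shown by this Space averages the Arousal / Dominance / Valence outputs of the two checkpoints. Conceptually (a sketch; `dawn` and `base` stand for the two loaded models exactly as in app.py below):

def wav2small_teacher(x):
    # x: (batch, samples) waveform at 16 kHz; each model returns [arousal, dominance, valence]
    return .5 * dawn(x) + .5 * base(x)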
app.py
ADDED

@@ -0,0 +1,273 @@
import types

import gradio as gr
import torch
import torch.nn as nn
import numpy as np
import audresample
import librosa
import matplotlib.pyplot as plt
from transformers import AutoModelForAudioClassification
from transformers.models.wav2vec2.modeling_wav2vec2 import (Wav2Vec2Model,
                                                             Wav2Vec2PreTrainedModel)

plt.style.use('seaborn-v0_8-whitegrid')


def _prenorm(x, attention_mask=None):
    '''Zero-mean / unit-variance normalisation of the raw waveform.
       (Kept for reference; not called below.)'''
    if attention_mask is not None:
        N = attention_mask.sum(1, keepdim=True)  # 0=ignored 1=valid
        x -= x.sum(1, keepdim=True) / N
        var = (x * x).sum(1, keepdim=True) / N
    else:
        # ONNX ReduceMean saves a few ops compared to casting integer N to float and dividing
        x -= x.mean(1, keepdim=True)
        var = (x * x).mean(1, keepdim=True)
    return x / torch.sqrt(var + 1e-7)


class ADV(nn.Module):
    '''Arousal / Dominance / Valence regression head.'''

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        x = self.dense(x)
        x = torch.tanh(x)
        return self.out_proj(x)


class Dawn(Wav2Vec2PreTrainedModel):
    r"""https://arxiv.org/abs/2203.07378"""

    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = ADV(config)

    def forward(self, x):
        x -= x.mean(1, keepdim=True)
        variance = (x * x).mean(1, keepdim=True) + 1e-7
        x = self.wav2vec2(x / variance.sqrt())
        return self.classifier(x.last_hidden_state.mean(1))


def _forward(self, x):
    '''x: (batch, audio-samples-16KHz)'''
    x = (x + self.config.mean) / self.config.std  # normalise raw signal with the stats stored in the model config
    x = self.ssl_model(x, attention_mask=None).last_hidden_state
    # attentive statistics pooling: weighted mean and weighted std of the SSL frames
    h = self.pool_model.sap_linear(x).tanh()
    w = torch.matmul(h, self.pool_model.attention).softmax(1)
    mu = (x * w).sum(1)
    x = torch.cat(
        [
            mu,
            ((x * x * w).sum(1) - mu * mu).clamp(min=1e-7).sqrt()
        ], 1)
    return self.ser_model(x)


# WavLM
device = 'cpu'
base = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Multi-Attributes',
    trust_remote_code=True).to(device).eval()
base.forward = types.MethodType(_forward, base)

# Wav2Vec2
dawn = Dawn.from_pretrained(
    'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
).to(device).eval()


def wav2small(x):
    return .5 * dawn(x) + .5 * base(x)


# Fallback figure shown when no audio is provided
fig_error, ax = plt.subplots(figsize=(8, 6))

error_message = "Error: No .wav or Mic. audio provided."

# Place the text in the centre of the plot
ax.text(0.5, 0.5, error_message,
        ha='center',
        va='center',
        fontsize=24,
        color='gray',
        fontweight='bold',
        transform=ax.transAxes)

# Hide the axis ticks and labels for a cleaner look
ax.set_xticks([])
ax.set_yticks([])
ax.set_xticklabels([])
ax.set_yticklabels([])

# Hide the frame spines
ax.set_frame_on(True)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)


def process_audio(audio_filepath):
    if audio_filepath is None:
        return fig_error

    # Load the audio file as mono float32 at its native sample rate
    waveform, sample_rate = librosa.load(audio_filepath, sr=None, mono=True)

    # Resample to 16 kHz if necessary (audresample returns shape (channels, samples))
    if sample_rate != 16000:
        waveform = audresample.resample(waveform, sample_rate, 16000)
    waveform = np.atleast_2d(waveform)

    x = torch.from_numpy(waveform)
    x = x[:, :64000]  # keep at most 4 s (64000 samples at 16 kHz)

    with torch.no_grad():
        logits_dawn = dawn(x).cpu().numpy()[0, :]
        logits_wavlm = base(x).cpu().numpy()[0, :]

    logits_wav2small = .5 * logits_dawn + .5 * logits_wavlm

    left_bars_data = logits_dawn.clip(0, 1)
    right_bars_data = logits_wav2small.clip(0, 1)

    bar_labels = ['\nArousal', '\nDominance', '\nValence']
    y_pos = np.arange(len(bar_labels))

    # One colormap per attribute so every row gets its own colour (Greys for Dominance)
    category_colormaps = [plt.cm.Blues, plt.cm.Greys, plt.cm.Oranges]

    # Colour shades for the left / right / background bars of each category
    left_filled_colors = []
    right_filled_colors = []
    background_colors = []

    for cmap in category_colormaps:
        # Darker shade for the left filled bar
        left_filled_colors.append(cmap(0.74))
        # Slightly lighter shade for the right filled bar
        right_filled_colors.append(cmap(0.64))
        # Very light shade for the transparent background bar
        background_colors.append(cmap(0.1))

    # Set up the figure and axes
    fig, ax = plt.subplots(figsize=(10, 6))

    # Plot the transparent background bars on both sides of the divider
    for i in range(len(bar_labels)):
        ax.barh(y_pos[i], -1, color=background_colors[i], alpha=0.3, height=0.6)
        ax.barh(y_pos[i], 1, color=background_colors[i], alpha=0.3, height=0.6)

    # Plot the filled bars for the left (Wav2Vec2) and right (Wav2Small Teacher) side
    for i in range(len(bar_labels)):
        ax.barh(y_pos[i], -left_bars_data[i], color=left_filled_colors[i], alpha=1, height=0.6)
        ax.barh(y_pos[i], right_bars_data[i], color=right_filled_colors[i], alpha=1, height=0.6)

    # Central axis divider
    ax.axvline(0, color='black', linewidth=0.8, linestyle='--')

    # x-axis limits and y-axis ticks
    ax.set_xlim(-1, 1)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(bar_labels, fontsize=12)

    def abs_tick_formatter(x, pos):
        return f'{int(abs(x) * 100)}%'
    ax.xaxis.set_major_formatter(plt.FuncFormatter(abs_tick_formatter))

    # Title and axis labels
    ax.set_title('', fontsize=16, pad=20)
    ax.set_xlabel('Outputs of Wav2Vec2 (left)  /  Outputs of Wav2Small Teacher (right)', fontsize=12)

    # Remove the top, right, and left spines for a cleaner look
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)

    # Annotate the filled bars with their percentages
    for i in range(len(bar_labels)):
        ax.text(-left_bars_data[i] - 0.05, y_pos[i], f'{int(left_bars_data[i] * 100)}%',
                va='center', ha='right', color=left_filled_colors[i], fontweight='bold')
        ax.text(right_bars_data[i] + 0.05, y_pos[i], f'{int(right_bars_data[i] * 100)}%',
                va='center', ha='left', color=right_filled_colors[i], fontweight='bold')

    return fig


iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",  # pass the recording / upload as a file path
        label=''
    ),
    outputs=[
        gr.Plot(label="Arousal / Dominance / Valence Plots"),
    ],
    title='',
    description='',
    flagging_mode="never",  # do not save flagged audio / .csv on the machine
    examples=[
        "female-46-neutral.wav",
        "female-20-happy.wav",
        "male-60-angry.wav",
        "male-27-sad.wav",
    ],
    css="footer {visibility: hidden}"
)

with gr.Blocks() as demo:
    # https://discuss.huggingface.co/t/how-to-get-the-microphone-streaming-input-file-when-using-blocks/37204/3
    with gr.Tab(label="Arousal / Dominance / Valence"):
        iface.render()
    with gr.Tab(label="CCC"):
        gr.Markdown('''<table style="width:500px"><tr><th colspan=5 >CCC MSP Podcast v1.7</th></tr>
<tr> <td> </td><td>Arousal</td> <td>Dominance</td> <td>Valence</td> <td> Associated Paper </td> </tr>
<tr> <td> <a href="https://huggingface.co/audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim">Wav2Vec2</a></td><td>0.744</td><td>0.655</td><td> 0.638 </td><td> <a href="https://arxiv.org/abs/2203.07378">arXiv</a> </td> </tr>
<tr> <td> <a href="https://huggingface.co/dkounadis/wav2small">Wav2Small Teacher</a></td><td> 0.762 </td> <td> 0.684 </td><td> 0.676 </td><td> <a href="https://arxiv.org/abs/2408.13920">arXiv</a> </td> </tr>
</table>
''')

if __name__ == "__main__":
    demo.launch(share=False)
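The CCC tab above reports Concordance Correlation Coefficient scores on MSP Podcast v1.7. For reference, a minimal sketch of how CCC between predictions and gold-standard labels is typically computed (the `ccc` helper and the synthetic arrays are illustrative only, not part of this Space):

import numpy as np

def ccc(pred, gold):
    '''Concordance Correlation Coefficient between two 1-D arrays.'''
    mu_p, mu_g = pred.mean(), gold.mean()
    cov = ((pred - mu_p) * (gold - mu_g)).mean()
    return 2 * cov / (pred.var() + gold.var() + (mu_p - mu_g) ** 2)

# illustrative usage with synthetic values in place of real annotations
rng = np.random.default_rng(0)
gold = rng.uniform(0, 1, 100)           # e.g. gold-standard arousal per utterance
pred = gold + rng.normal(0, .1, 100)    # e.g. model predictions
print(f'CCC = {ccc(pred, gold):.3f}')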
female-20-happy.wav
ADDED
Binary file (51 kB).

female-46-neutral.wav
ADDED
Binary file (37.6 kB).

male-27-sad.wav
ADDED
Binary file (50.4 kB).

male-60-angry.wav
ADDED
Binary file (60.5 kB).
requirements.txt
ADDED
@@ -0,0 +1,5 @@
audresample
matplotlib
torch
transformers
librosa