update preprocessing
README.md CHANGED

````diff
@@ -107,6 +107,19 @@ class PowerToDB(torch.nn.Module):
         return log_spec
 
 
+
+# Initialize the transformations
+
+spectrogram_converter = torchaudio.transforms.Spectrogram(
+    n_fft=1024, hop_length=320, power=2.0
+)
+mel_converter = torchaudio.transforms.MelScale(
+    n_mels=128, n_stft=513, sample_rate=32_000
+)
+normalizer = transforms.Normalize((-4.268,), (4.569,))
+powerToDB = PowerToDB(top_db=80)
+
+
 def preprocess(audio, sample_rate_of_audio):
     """
     Preprocess the audio to the format that the model expects
@@ -115,30 +128,28 @@ def preprocess(audio, sample_rate_of_audio):
     - Normalize the melscale spectrogram with mean: -4.268, std: 4.569 (from AudioSet)
 
     """
-
-
-
-
-    )
-    audio = resample(audio)
-    spectrogram = torchaudio.transforms.Spectrogram(
-        n_fft=1024, hop_length=320, power=2.0
-    )(audio)
-    melspec = torchaudio.transforms.MelScale(n_mels=128, n_stft=513)(spectrogram)
+    # convert waveform to spectrogram
+    spectrogram = spectrogram_converter(audio)
+    spectrogram = spectrogram.to(torch.float32)
+    melspec = mel_converter(spectrogram)
     dbscale = powerToDB(melspec)
-    normalized_dbscale =
+    normalized_dbscale = normalizer(dbscale)
+    # add dimension 3 from left
+    normalized_dbscale = normalized_dbscale.unsqueeze(-3)
     return normalized_dbscale
 
 preprocessed_audio = preprocess(audio, sample_rate)
+print("Preprocessed_audio shape:", preprocessed_audio.shape)
+
 
-
+
+logits = model(preprocessed_audio).logits
 print("Logits shape: ", logits.shape)
 
 top5 = torch.topk(logits, 5)
 print("Top 5 logits:", top5.values)
 print("Top 5 predicted classes:")
 print([model.config.id2label[i] for i in top5.indices.squeeze().tolist()])
-
 ```
 
 ## Model Source
````
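Note: with this change, `preprocess` no longer resamples internally (the old `audio = resample(audio)` call is removed), and `MelScale` is now constructed with `sample_rate=32_000`, so the waveform handed to `preprocess` presumably needs to be at 32 kHz already. A minimal sketch of that pre-step, assuming `torchaudio`'s `Resample` transform (the original resampling code is not visible in this diff):

```python
import torchaudio

# Hypothetical pre-step (not part of the diff): bring the waveform to the
# 32 kHz rate that the transforms in the updated README appear to assume.
def resample_to_32k(audio, sample_rate_of_audio):
    if sample_rate_of_audio != 32_000:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate_of_audio, new_freq=32_000
        )
        audio = resampler(audio)
    return audio
```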
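Note: the `PowerToDB` module instantiated here with `top_db=80` is defined earlier in the README; only its closing `return log_spec` line falls inside these hunks. For orientation, a power-to-dB conversion of this kind is typically implemented along the lines of librosa's `power_to_db`; the sketch below is an illustrative stand-in, not the README's actual definition:

```python
import torch

# Illustrative stand-in for the PowerToDB module used above, modeled on
# librosa.power_to_db; the README's real implementation may differ.
class PowerToDB(torch.nn.Module):
    def __init__(self, ref=1.0, amin=1e-10, top_db=80.0):
        super().__init__()
        self.ref = ref
        self.amin = amin
        self.top_db = top_db

    def forward(self, spec):
        # 10 * log10(power) relative to a reference power, clamped below at amin
        log_spec = 10.0 * torch.log10(torch.clamp(spec, min=self.amin))
        log_spec -= 10.0 * torch.log10(
            torch.clamp(torch.tensor(self.ref, dtype=spec.dtype), min=self.amin)
        )
        if self.top_db is not None:
            # limit the dynamic range to top_db below the peak value
            log_spec = torch.maximum(log_spec, log_spec.max() - self.top_db)
        return log_spec
```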