Merge branch 'main' into patch-1

2025-03-30 14:28:27 +00:00 · 2024-12-24 14:09:32 +03:00 · 2024-12-24 14:09:32 +03:00 · accb6b18f0
commit accb6b18f0
parent 586705c138 90db0de189
4 changed files with 6 additions and 4 deletions
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ audio = whisper.load_audio("audio.mp3")
 audio = whisper.pad_or_trim(audio)

 # make log-Mel spectrogram and move to the same device as the model
-mel = whisper.log_mel_spectrogram(audio).to(model.device)
+mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

 # detect the spoken language
 _, probs = model.detect_language(mel)
--- a/data/README.md
+++ b/data/README.md
@@ -45,7 +45,7 @@ We downloaded the [CHiME-5 dataset](https://spandh.dcs.shef.ac.uk//chime_challen

 ### AMI-IHM, AMI-SDM1

-We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 ad 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
+We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 and 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).


 ## Long-form English-only datasets
--- a/whisper/audio.py
+++ b/whisper/audio.py
@@ -122,7 +122,7 @@ def log_mel_spectrogram(
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
-        The number of Mel-frequency filters, only 80 is supported
+        The number of Mel-frequency filters, only 80 and 128 are supported

    padding: int
        Number of zero samples to pad to the right
@@ -132,7 +132,7 @@ def log_mel_spectrogram(

    Returns
    -------
-    torch.Tensor, shape = (80, n_frames)
+    torch.Tensor, shape = (n_mels, n_frames)
        A Tensor that contains the Mel spectrogram
    """
    if not torch.is_tensor(audio):
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -218,6 +218,8 @@ def transcribe(
            if (
                no_speech_threshold is not None
                and decode_result.no_speech_prob > no_speech_threshold
+                and logprob_threshold is not None
+                and decode_result.avg_logprob < logprob_threshold
            ):
                needs_fallback = False  # silence
            if (