From 173ff7dd1d9fb1c4fddea0d41d704cfefeb8908c Mon Sep 17 00:00:00 2001
From: f1sh <71207078+YuZekai@users.noreply.github.com>
Date: Wed, 13 Nov 2024 08:35:54 +0800
Subject: [PATCH 1/3] fix typo data/README.md (#2433)

---
 data/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/README.md b/data/README.md
index 3b4aea1..fcb3200 100644
--- a/data/README.md
+++ b/data/README.md
@@ -45,7 +45,7 @@ We downloaded the [CHiME-5 dataset](https://spandh.dcs.shef.ac.uk//chime_challen
 
 ### AMI-IHM, AMI-SDM1
 
-We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 ad 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
+We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 and 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
 
 ## Long-form English-only datasets
 

From fc5ded7d9045c693692f13853857c3f8baea3a7b Mon Sep 17 00:00:00 2001
From: Lowell Vaughn
Date: Tue, 26 Nov 2024 09:37:01 -0800
Subject: [PATCH 2/3] Updating README and doc strings to reflect that n_mels
 can now be 128 (#2049)

---
 README.md        | 2 +-
 whisper/audio.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 1a661d7..696869c 100644
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ audio = whisper.load_audio("audio.mp3")
 audio = whisper.pad_or_trim(audio)
 
 # make log-Mel spectrogram and move to the same device as the model
-mel = whisper.log_mel_spectrogram(audio).to(model.device)
+mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
 
 # detect the spoken language
 _, probs = model.detect_language(mel)
diff --git a/whisper/audio.py b/whisper/audio.py
index cf6c66a..826250f 100644
--- a/whisper/audio.py
+++ b/whisper/audio.py
@@ -122,7 +122,7 @@ def log_mel_spectrogram(
         The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
 
     n_mels: int
-        The number of Mel-frequency filters, only 80 is supported
+        The number of Mel-frequency filters, only 80 and 128 are supported
 
     padding: int
         Number of zero samples to pad to the right
@@ -132,7 +132,7 @@ def log_mel_spectrogram(
 
     Returns
     -------
-    torch.Tensor, shape = (80, n_frames)
+    torch.Tensor, shape = (n_mels, n_frames)
         A Tensor that contains the Mel spectrogram
     """
     if not torch.is_tensor(audio):
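With PATCH 2/3 applied, callers pass the model's own mel bin count instead of assuming the default of 80. A minimal sketch of the updated usage, based on the README diff above ("large-v3" and "audio.mp3" are illustrative placeholders; large-v3 is assumed to be a 128-bin model):

    import whisper

    model = whisper.load_model("large-v3")  # assumed to have dims.n_mels == 128
    audio = whisper.load_audio("audio.mp3")
    audio = whisper.pad_or_trim(audio)

    # Read the bin count from the model instead of hardcoding 80 or 128,
    # so the same code works for both 80-mel and 128-mel checkpoints.
    mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

    # The spectrogram now matches what the model expects, e.g. for language detection:
    _, probs = model.detect_language(mel)
    print(f"Detected language: {max(probs, key=probs.get)}")
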
From 90db0de1896c23cbfaf0c58bc2d30665f709f170 Mon Sep 17 00:00:00 2001
From: Purfview <69023953+Purfview@users.noreply.github.com>
Date: Sun, 1 Dec 2024 05:47:01 +0000
Subject: [PATCH 3/3] Bugfix: Illogical "Avoid computing higher temperatures
 on no_speech" (#1903)

* Bugfix: Illogical "Avoid computing higher temperatures on no_speech"

Bugfix for https://github.com/openai/whisper/pull/1279

Previously, a decode that failed the `compression_ratio_threshold` check was also treated as "silence", even though further down the code that case is not treated as silence. A segment should count as "silence" only when decoding has also failed the `logprob_threshold` check.

As described here: https://github.com/openai/whisper/blob/8bc8860694949db53c42ba47ddc23786c2e02a8b/whisper/transcribe.py#L421
And in the code here: https://github.com/openai/whisper/blob/8bc8860694949db53c42ba47ddc23786c2e02a8b/whisper/transcribe.py#L243-L251

* Fix the case where `logprob_threshold=None`

---------

Co-authored-by: Jong Wook Kim
---
 whisper/transcribe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index 8eb6a71..0a4cc36 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -214,6 +214,8 @@ def transcribe(
             if (
                 no_speech_threshold is not None
                 and decode_result.no_speech_prob > no_speech_threshold
+                and logprob_threshold is not None
+                and decode_result.avg_logprob < logprob_threshold
             ):
                 needs_fallback = False  # silence
             if not needs_fallback:
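The condition added by PATCH 3/3 can be restated as a standalone predicate; this is a hypothetical helper for illustration only, not code from the patch:

    def is_silence(decode_result, no_speech_threshold, logprob_threshold):
        # "Silence" now requires BOTH signals: a no-speech probability above
        # no_speech_threshold AND an average log-probability below
        # logprob_threshold. A decode that failed only the
        # compression_ratio_threshold check is no longer misclassified as
        # silence, and the None checks keep the predicate safe when either
        # threshold is disabled.
        return (
            no_speech_threshold is not None
            and decode_result.no_speech_prob > no_speech_threshold
            and logprob_threshold is not None
            and decode_result.avg_logprob < logprob_threshold
        )

    # In transcribe(), needs_fallback is cleared when this condition holds,
    # so no higher temperatures are computed for genuinely silent segments.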