From fc5ded7d9045c693692f13853857c3f8baea3a7b Mon Sep 17 00:00:00 2001 From: Lowell Vaughn Date: Tue, 26 Nov 2024 09:37:01 -0800 Subject: [PATCH 1/2] Updating README and doc strings to reflect that n_mels can now be 128 (#2049) --- README.md | 2 +- whisper/audio.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1a661d7..696869c 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ audio = whisper.load_audio("audio.mp3") audio = whisper.pad_or_trim(audio) # make log-Mel spectrogram and move to the same device as the model -mel = whisper.log_mel_spectrogram(audio).to(model.device) +mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device) # detect the spoken language _, probs = model.detect_language(mel) diff --git a/whisper/audio.py b/whisper/audio.py index cf6c66a..826250f 100644 --- a/whisper/audio.py +++ b/whisper/audio.py @@ -122,7 +122,7 @@ def log_mel_spectrogram( The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz n_mels: int - The number of Mel-frequency filters, only 80 is supported + The number of Mel-frequency filters, only 80 and 128 are supported padding: int Number of zero samples to pad to the right @@ -132,7 +132,7 @@ def log_mel_spectrogram( Returns ------- - torch.Tensor, shape = (80, n_frames) + torch.Tensor, shape = (n_mels, n_frames) A Tensor that contains the Mel spectrogram """ if not torch.is_tensor(audio): From 90db0de1896c23cbfaf0c58bc2d30665f709f170 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Sun, 1 Dec 2024 05:47:01 +0000 Subject: [PATCH 2/2] Bugfix: Illogical "Avoid computing higher temperatures on no_speech" (#1903) * Bugfix: Illogical "Avoid computing higher temperatures on no_speech" Bugfix for https://github.com/openai/whisper/pull/1279 It's "silence" when decoding has failed due to `compression_ratio_threshold` too, when further down the code it's not "silence" 
anymore. "Silence" should apply only when decoding has failed due to `logprob_threshold`. As described here: https://github.com/openai/whisper/blob/8bc8860694949db53c42ba47ddc23786c2e02a8b/whisper/transcribe.py#L421 And in the code here: https://github.com/openai/whisper/blob/8bc8860694949db53c42ba47ddc23786c2e02a8b/whisper/transcribe.py#L243-L251 * Fix if "logprob_threshold=None" --------- Co-authored-by: Jong Wook Kim --- whisper/transcribe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/whisper/transcribe.py b/whisper/transcribe.py index 8eb6a71..0a4cc36 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -214,6 +214,8 @@ def transcribe( if ( no_speech_threshold is not None and decode_result.no_speech_prob > no_speech_threshold + and logprob_threshold is not None + and decode_result.avg_logprob < logprob_threshold ): needs_fallback = False # silence if not needs_fallback: