Updating README and doc strings to reflect that n_mels can now be 128 (#2049)

This commit is contained in:
Lowell Vaughn 2024-11-26 09:37:01 -08:00 committed by GitHub
parent 173ff7dd1d
commit fc5ded7d90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 3 additions and 3 deletions

View File

@@ -126,7 +126,7 @@ audio = whisper.load_audio("audio.mp3")
audio = whisper.pad_or_trim(audio)
# make log-Mel spectrogram and move to the same device as the model
-mel = whisper.log_mel_spectrogram(audio).to(model.device)
+mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
# detect the spoken language
_, probs = model.detect_language(mel)

View File

@@ -122,7 +122,7 @@ def log_mel_spectrogram(
The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
n_mels: int
-The number of Mel-frequency filters, only 80 is supported
+The number of Mel-frequency filters, only 80 and 128 are supported
padding: int
Number of zero samples to pad to the right
@@ -132,7 +132,7 @@ def log_mel_spectrogram(
Returns
-------
-torch.Tensor, shape = (80, n_frames)
+torch.Tensor, shape = (n_mels, n_frames)
A Tensor that contains the Mel spectrogram
"""
if not torch.is_tensor(audio):