mirror of
https://github.com/openai/whisper.git
synced 2025-03-30 14:28:27 +00:00
Merge branch 'main' into patch-1
This commit is contained in:
commit
accb6b18f0
@ -126,7 +126,7 @@ audio = whisper.load_audio("audio.mp3")
|
||||
audio = whisper.pad_or_trim(audio)
|
||||
|
||||
# make log-Mel spectrogram and move to the same device as the model
|
||||
mel = whisper.log_mel_spectrogram(audio).to(model.device)
|
||||
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
|
||||
|
||||
# detect the spoken language
|
||||
_, probs = model.detect_language(mel)
|
||||
|
@ -45,7 +45,7 @@ We downloaded the [CHiME-5 dataset](https://spandh.dcs.shef.ac.uk//chime_challen
|
||||
|
||||
### AMI-IHM, AMI-SDM1
|
||||
|
||||
We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 ad 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
|
||||
We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 and 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
|
||||
|
||||
|
||||
## Long-form English-only datasets
|
||||
|
@ -122,7 +122,7 @@ def log_mel_spectrogram(
|
||||
The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
|
||||
|
||||
n_mels: int
|
||||
The number of Mel-frequency filters, only 80 is supported
|
||||
The number of Mel-frequency filters, only 80 and 128 are supported
|
||||
|
||||
padding: int
|
||||
Number of zero samples to pad to the right
|
||||
@ -132,7 +132,7 @@ def log_mel_spectrogram(
|
||||
|
||||
Returns
|
||||
-------
|
||||
torch.Tensor, shape = (80, n_frames)
|
||||
torch.Tensor, shape = (n_mels, n_frames)
|
||||
A Tensor that contains the Mel spectrogram
|
||||
"""
|
||||
if not torch.is_tensor(audio):
|
||||
|
@ -218,6 +218,8 @@ def transcribe(
|
||||
if (
|
||||
no_speech_threshold is not None
|
||||
and decode_result.no_speech_prob > no_speech_threshold
|
||||
and logprob_threshold is not None
|
||||
and decode_result.avg_logprob < logprob_threshold
|
||||
):
|
||||
needs_fallback = False # silence
|
||||
if (
|
||||
|
Loading…
x
Reference in New Issue
Block a user