From e0704ddeba84a475ff09674fea043bfe8eb25b48 Mon Sep 17 00:00:00 2001
From: Kent Slaney
Date: Sun, 14 Jul 2024 16:24:14 -0600
Subject: [PATCH] add parameter documentation back in

---
 whisper/transcribe.py | 91 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index 17ec3cc..0964c8a 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -273,6 +273,74 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
             clip_timestamps: Union[str, List[float]] = "0",
             hallucination_silence_threshold: Optional[float] = None,
             **decode_options):
+        """
+        Transcribe an audio file using Whisper
+
+        Parameters
+        ----------
+        model: Whisper
+            The Whisper model instance
+
+        verbose: bool
+            Whether to display the text being decoded to the console. If True,
+            displays all the details; if False, displays minimal details; if
+            None, does not display anything
+
+        temperature: Union[float, Tuple[float, ...]]
+            Temperature for sampling. It can be a tuple of temperatures, which
+            will be successively used upon failures according to either
+            `compression_ratio_threshold` or `logprob_threshold`.
+
+        compression_ratio_threshold: float
+            If the gzip compression ratio is above this value, treat as failed
+
+        logprob_threshold: float
+            If the average log probability over sampled tokens is below this
+            value, treat as failed
+
+        no_speech_threshold: float
+            If the no_speech probability is higher than this value AND the
+            average log probability over sampled tokens is below
+            `logprob_threshold`, consider the segment as silent
+
+        condition_on_previous_text: bool
+            If True, the previous output of the model is provided as a prompt
+            for the next window; disabling may make the text inconsistent
+            across windows, but the model becomes less prone to getting stuck
+            in a failure loop, such as repetition looping or timestamps going
+            out of sync.
+
+        word_timestamps: bool
+            Extract word-level timestamps using the cross-attention pattern
+            and dynamic time warping, and include the timestamps for each word
+            in each segment.
+
+        prepend_punctuations: str
+            If word_timestamps is True, merge these punctuation symbols with
+            the next word
+
+        append_punctuations: str
+            If word_timestamps is True, merge these punctuation symbols with
+            the previous word
+
+        initial_prompt: Optional[str]
+            Optional text to provide as a prompt for the first window. This
+            can be used to provide, or "prompt-engineer", a context for
+            transcription, e.g. custom vocabularies or proper nouns, to make
+            it more likely to predict those words correctly.
+
+        clip_timestamps: Union[str, List[float]]
+            Comma-separated list of start,end,start,end,... timestamps (in
+            seconds) of clips to process. The last end timestamp defaults to
+            the end of the file.
+
+        hallucination_silence_threshold: Optional[float]
+            When word_timestamps is True, skip silent periods longer than this
+            threshold (in seconds) when a possible hallucination is detected
+
+        decode_options: dict
+            Keyword arguments used to construct `DecodingOptions` instances
+        """
         self.model = model
         self.verbose = verbose
         self.temperature = temperature
@@ -523,6 +591,10 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
         self.latest, self.frame_offset = mel, offset
         content_frames = mel.shape[-1] - N_FRAMES + offset
         content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
+        # NOTE: this loop is deliberately flattened to keep the diff against
+        # upstream readable; a later commit should restore the nested form:
+        # for seek_clip_start, seek_clip_end in seek_clips:
+        #     while seek < seek_clip_end:
         while self.clip_idx < len(self.seek_clips):
             seek_clip_start, seek_clip_end = self.seek_clips[self.clip_idx]
             seek_clip_end = content_frames if seek_clip_end is None else \
@@ -685,6 +757,8 @@ class ProgressTranscriber(MinimalTranscriber):
         if self._pbar is None:
             n = self.latest.shape[-1] if self.duration is None \
                     else -int(self.duration * -FRAMES_PER_SECOND)
+            # show the progress bar only when verbose is False
+            # (if verbose is True, the transcribed text is printed instead)
             self._pbar = tqdm.tqdm(
                     total=n, unit="frames", disable=self.verbose is not False)
             self._pbar.__enter__()
@@ -714,6 +788,23 @@ def transcribe(
         model: "Whisper",
         audio: Union[str, np.ndarray, torch.Tensor],
         **kw):
+    """
+    Transcribe an audio file using Whisper
+
+    Parameters
+    ----------
+    model: Whisper
+        The Whisper model instance
+
+    audio: Union[str, np.ndarray, torch.Tensor]
+        The path to the audio file to open, or the audio waveform
+
+    Returns
+    -------
+    A dictionary containing the resulting text ("text"), the segment-level
+    details ("segments"), and the spoken language ("language"), which is
+    detected when `decode_options["language"]` is None.
+    """
     return ProgressTranscriber(model, **kw)(audio_tensor(audio))
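
For context on the NOTE in the second hunk: a minimal sketch of the nested
form a later commit could restore, assuming `seek` is advanced by the
window-processing body; `process_window` is a hypothetical stand-in for that
body, not a name from this codebase:

    # hypothetical restructuring of the flattened while-loop over
    # self.clip_idx into the nested form the comment sketches
    for seek_clip_start, seek_clip_end in self.seek_clips:
        seek = max(seek, seek_clip_start)
        while seek < seek_clip_end:
            # process one window and advance the read position;
            # process_window is hypothetical, not part of this patch
            seek = self.process_window(seek, seek_clip_end)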
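
As a usage note, a minimal sketch of how the documented parameters reach the
new entry point, assuming the keyword arguments flow through `**kw` into
`Transcriber.__init__` as the diff suggests; the model size, file name, and
parameter values are illustrative only:

    import whisper
    from whisper.transcribe import transcribe

    model = whisper.load_model("base")
    result = transcribe(
        model, "audio.mp3",
        verbose=False,                # False: show the progress bar, not text
        temperature=(0.0, 0.2, 0.4),  # fallback temperatures on failed decodes
        word_timestamps=True,         # per-word timing via dynamic time warping
        clip_timestamps="0,30",       # transcribe only the first 30 seconds
    )
    print(result["language"], result["text"])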