add parameter documentation back in

Kent Slaney 2024-07-14 16:24:14 -06:00
parent b4fd954955
commit e0704ddeba


@@ -273,6 +273,74 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
clip_timestamps: Union[str, List[float]] = "0",
hallucination_silence_threshold: Optional[float] = None,
**decode_options):
"""
Transcribe an audio file using Whisper

Parameters
----------
model: Whisper
The Whisper model instance
verbose: bool
Whether to display the text being decoded to the console. If True,
displays all the details; if False, displays minimal details; if
None, does not display anything.
temperature: Union[float, Tuple[float, ...]]
Temperature for sampling. It can be a tuple of temperatures, which
will be successively used upon failures according to either
`compression_ratio_threshold` or `logprob_threshold`.
compression_ratio_threshold: float
If the gzip compression ratio is above this value, treat as failed
logprob_threshold: float
If the average log probability over sampled tokens is below this
value, treat as failed
no_speech_threshold: float
If the no_speech probability is higher than this value AND the
average log probability over sampled tokens is below
`logprob_threshold`, consider the segment as silent
condition_on_previous_text: bool
If True, the previous output of the model is provided as a prompt
for the next window; disabling may make the text inconsistent across
windows, but the model becomes less prone to getting stuck in a
failure loop, such as repetition looping or timestamps going out of
sync.
word_timestamps: bool
Extract word-level timestamps using the cross-attention pattern and
dynamic time warping, and include the timestamps for each word in
each segment.
prepend_punctuations: str
If word_timestamps is True, merge these punctuation symbols with the
next word
append_punctuations: str
If word_timestamps is True, merge these punctuation symbols with the
previous word
initial_prompt: Optional[str]
Optional text to provide as a prompt for the first window. This can
be used to provide, or "prompt-engineer", a context for
transcription, e.g. custom vocabularies or proper nouns, to make it
more likely to predict those words correctly.
clip_timestamps: Union[str, List[float]]
Comma-separated list of start,end,start,end,... timestamps (in
seconds) of clips to process. The last end timestamp defaults to the
end of the file.
hallucination_silence_threshold: Optional[float]
When word_timestamps is True, skip silent periods longer than this
threshold (in seconds) when a possible hallucination is detected.
decode_options: dict
Keyword arguments to construct `DecodingOptions` instances
"""
self.model = model
self.verbose = verbose
self.temperature = temperature
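Taken together, the options above combine like this; a minimal construction sketch, assuming `Transcriber` and `audio_tensor` are importable from this module, with illustrative values (the temperature ladder and thresholds mirror upstream Whisper defaults):

transcriber = Transcriber(
    model,                                        # a loaded Whisper model instance
    verbose=False,                                # minimal console output
    temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),  # retried in order on failure
    compression_ratio_threshold=2.4,              # gzip ratio above this => failed
    logprob_threshold=-1.0,                       # mean logprob below this => failed
    no_speech_threshold=0.6,                      # silence gate, paired with logprob
    condition_on_previous_text=True,              # prompt each window with the last
    clip_timestamps="10,90",                      # only process the 10s-90s span
)
result = transcriber(audio_tensor("audio.mp3"))   # path is illustrative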
@@ -523,6 +591,10 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
self.latest, self.frame_offset = mel, offset
content_frames = mel.shape[-1] - N_FRAMES + offset
content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
# NOTE: This loop is obscurely flattened to make the diff readable.
# A later commit should turn this into a simpler nested loop.
# for seek_clip_start, seek_clip_end in seek_clips:
# while seek < seek_clip_end
while self.clip_idx < len(self.seek_clips):
seek_clip_start, seek_clip_end = self.seek_clips[self.clip_idx]
seek_clip_end = content_frames if seek_clip_end is None else \
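For reference, the simpler nested shape the NOTE above has in mind might look like the sketch below. This is a hypothetical rewrite, not part of this commit; the resume handling of `self.clip_idx` and `self.seek` is assumed:

# hypothetical nested form of the flattened loop above
for self.clip_idx in range(self.clip_idx, len(self.seek_clips)):
    seek_clip_start, seek_clip_end = self.seek_clips[self.clip_idx]
    seek_clip_end = content_frames if seek_clip_end is None else seek_clip_end
    self.seek = max(self.seek, seek_clip_start)
    while self.seek < seek_clip_end:
        ...  # decode one window and advance self.seek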
@@ -685,6 +757,8 @@ class ProgressTranscriber(MinimalTranscriber):
if self._pbar is None:
n = self.latest.shape[-1] if self.duration is None \
else -int(self.duration * -FRAMES_PER_SECOND)
# show the progress bar when verbose is False
# (if True, transcribed text will be printed)
self._pbar = tqdm.tqdm(
total=n, unit="frames", disable=self.verbose is not False)
self._pbar.__enter__()
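The bar is created lazily and entered by hand because its lifetime spans multiple update calls, so no single `with` block can wrap it. A standalone sketch of the same pattern (hypothetical class; only `tqdm.tqdm` is a real API here):

import tqdm

class LazyBar:
    def __init__(self, total, verbose=None):
        self.total, self.verbose, self._pbar = total, verbose, None

    def update(self, n):
        if self._pbar is None:
            # show the bar only when verbose is False; if verbose is
            # True the transcribed text is printed instead
            self._pbar = tqdm.tqdm(total=self.total, unit="frames",
                                   disable=self.verbose is not False)
            self._pbar.__enter__()  # manual enter: no enclosing with block
        self._pbar.update(n)

    def close(self):
        if self._pbar is not None:
            self._pbar.__exit__(None, None, None)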
@@ -714,6 +788,23 @@ def transcribe(
model: "Whisper", model: "Whisper",
audio: Union[str, np.ndarray, torch.Tensor], audio: Union[str, np.ndarray, torch.Tensor],
**kw): **kw):
"""
Transcribe an audio file using Whisper

Parameters
----------
model: Whisper
The Whisper model instance
audio: Union[str, np.ndarray, torch.Tensor]
The path to the audio file to open, or the audio waveform

Returns
-------
A dictionary containing the resulting text ("text") and segment-level
details ("segments"), and the spoken language ("language"), which is
detected when `decode_options["language"]` is None.
"""
return ProgressTranscriber(model, **kw)(audio_tensor(audio))
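An end-to-end call, assuming this module is importable (the import path and audio file name are illustrative):

import whisper
from transcribe import transcribe  # hypothetical import path

model = whisper.load_model("base")
result = transcribe(model, "audio.mp3", word_timestamps=True)
print(result["language"])  # detected when decode_options["language"] is None
for segment in result["segments"]:
    print(f"[{segment['start']:7.2f} {segment['end']:7.2f}] {segment['text']}")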