mirror of https://github.com/openai/whisper.git
synced 2025-11-24 06:26:03 +00:00
add parameter documentation back in
This commit is contained in:
parent
b4fd954955
commit
e0704ddeba
@@ -273,6 +273,74 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
            clip_timestamps: Union[str, List[float]] = "0",
            hallucination_silence_threshold: Optional[float] = None,
            **decode_options):
        """
        Transcribe an audio file using Whisper

        Parameters
        ----------
        model: Whisper
            The Whisper model instance

        verbose: bool
            Whether to display the text being decoded to the console. If True,
            displays all the details; if False, displays minimal details; if
            None, does not display anything.

        temperature: Union[float, Tuple[float, ...]]
            Temperature for sampling. It can be a tuple of temperatures, which
            will be successively used upon failures according to either
            `compression_ratio_threshold` or `logprob_threshold`.

        compression_ratio_threshold: float
            If the gzip compression ratio is above this value, treat the
            decoding as failed

        logprob_threshold: float
            If the average log probability over sampled tokens is below this
            value, treat the decoding as failed

        no_speech_threshold: float
            If the no_speech probability is higher than this value AND the
            average log probability over sampled tokens is below
            `logprob_threshold`, consider the segment as silent

        condition_on_previous_text: bool
            If True, the previous output of the model is provided as a prompt
            for the next window; disabling may make the text inconsistent
            across windows, but the model becomes less prone to getting stuck
            in a failure loop, such as repetition looping or timestamps going
            out of sync.

        word_timestamps: bool
            Extract word-level timestamps using the cross-attention pattern
            and dynamic time warping, and include the timestamps for each word
            in each segment.

        prepend_punctuations: str
            If word_timestamps is True, merge these punctuation symbols with
            the next word

        append_punctuations: str
            If word_timestamps is True, merge these punctuation symbols with
            the previous word

        initial_prompt: Optional[str]
            Optional text to provide as a prompt for the first window. This
            can be used to provide, or "prompt-engineer", a context for
            transcription, e.g. custom vocabularies or proper nouns, to make
            it more likely to predict those words correctly.

        decode_options: dict
            Keyword arguments to construct `DecodingOptions` instances

        clip_timestamps: Union[str, List[float]]
            Comma-separated list of start,end,start,end,... timestamps (in
            seconds) of clips to process. The last end timestamp defaults to
            the end of the file.

        hallucination_silence_threshold: Optional[float]
            When word_timestamps is True, skip silent periods longer than this
            threshold (in seconds) when a possible hallucination is detected
        """
        self.model = model
        self.verbose = verbose
        self.temperature = temperature
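
Note: the interplay of `temperature`, `compression_ratio_threshold`, and `logprob_threshold` documented above amounts to a retry loop. Below is a minimal sketch under upstream whisper's defaults; `decode_once` is a hypothetical stand-in for a single `DecodingOptions`-driven decode pass, not this module's API:

def decode_once(segment, temperature):
    raise NotImplementedError  # hypothetical: one decode pass at a fixed temperature

def decode_with_fallback(segment, temperatures=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
                         compression_ratio_threshold=2.4,
                         logprob_threshold=-1.0):
    result = None
    for t in temperatures:
        result = decode_once(segment, temperature=t)
        failed = (
            (compression_ratio_threshold is not None
             and result.compression_ratio > compression_ratio_threshold)
            or (logprob_threshold is not None
                and result.avg_logprob < logprob_threshold))
        if not failed:  # good enough: stop escalating the temperature
            return result
    return result  # every temperature failed; keep the last attempt
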
@@ -523,6 +591,10 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
        self.latest, self.frame_offset = mel, offset
        content_frames = mel.shape[-1] - N_FRAMES + offset
        content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
        # NOTE: This loop is obscurely flattened to make the diff readable.
        # A later commit should turn this into a simpler nested loop:
        #     for seek_clip_start, seek_clip_end in seek_clips:
        #         while seek < seek_clip_end:
        while self.clip_idx < len(self.seek_clips):
            seek_clip_start, seek_clip_end = self.seek_clips[self.clip_idx]
            seek_clip_end = content_frames if seek_clip_end is None else \
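
Note: the `seek_clips` consumed by this loop come from the `clip_timestamps` parameter documented above. A sketch of the derivation as upstream whisper performs it; this fork evidently defers the open-ended final end (hence the `seek_clip_end is None` check above), so the exact parsing here may differ:

FRAMES_PER_SECOND = 100  # mel frames per second in upstream whisper

def parse_clip_timestamps(clip_timestamps, content_frames):
    # Pair "start,end,start,end,..." values into (start, end) frame offsets;
    # an unpaired trailing start is closed with the end of the file.
    if isinstance(clip_timestamps, str):
        clip_timestamps = ([float(ts) for ts in clip_timestamps.split(",")]
                           if clip_timestamps else [])
    seek_points = [round(ts * FRAMES_PER_SECOND) for ts in clip_timestamps]
    if not seek_points:
        seek_points.append(0)
    if len(seek_points) % 2 == 1:
        seek_points.append(content_frames)
    return list(zip(seek_points[::2], seek_points[1::2]))

assert parse_clip_timestamps("0,30,60", content_frames=9000) == [(0, 3000), (6000, 9000)]
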
@@ -685,6 +757,8 @@ class ProgressTranscriber(MinimalTranscriber):
        if self._pbar is None:
            n = self.latest.shape[-1] if self.duration is None \
                else -int(self.duration * -FRAMES_PER_SECOND)
            # show the progress bar when verbose is False
            # (if True, transcribed text will be printed)
            self._pbar = tqdm.tqdm(
                total=n, unit="frames", disable=self.verbose is not False)
            self._pbar.__enter__()
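
Note on `-int(self.duration * -FRAMES_PER_SECOND)`: this reads like the classic negate-and-floor ceiling trick, but Python's int() truncates toward zero, so the negation is a no-op for any product. A plain-Python demonstration (the behavior is certain; that a ceiling was intended is an assumption):

import math

x = 250.5                # e.g. 2.505 s at 100 frames/s
print(int(-x))           # -250: int() truncates toward zero
print(math.floor(-x))    # -251: floor() rounds toward negative infinity
print(-int(-x))          # 250 == int(x); negating changed nothing
print(-math.floor(-x))   # 251 == math.ceil(x): the actual ceiling trick

If rounding the frame count up was the intent, `math.ceil(self.duration * FRAMES_PER_SECOND)` says so directly.
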
@@ -714,6 +788,23 @@ def transcribe(
    model: "Whisper",
    audio: Union[str, np.ndarray, torch.Tensor],
    **kw):
    """
    Transcribe an audio file using Whisper

    Parameters
    ----------
    model: Whisper
        The Whisper model instance

    audio: Union[str, np.ndarray, torch.Tensor]
        The path to the audio file to open, or the audio waveform

    Returns
    -------
    A dictionary containing the resulting text ("text") and segment-level
    details ("segments"), and the spoken language ("language"), which is
    detected when `decode_options["language"]` is None.
    """
    return ProgressTranscriber(model, **kw)(audio_tensor(audio))
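
Usage sketch for the wrapper above, assuming this fork keeps upstream whisper's `load_model` entry point and segment schema; the file name and option values are illustrative:

import whisper

model = whisper.load_model("base")
result = transcribe(
    model,
    "audio.mp3",                 # or a waveform ndarray/tensor
    verbose=False,               # False: progress bar instead of printed text
    temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    initial_prompt="Glossary: Whisper, PyTorch, mel spectrogram",
    clip_timestamps="0,30",      # only transcribe the first 30 seconds
)
print(result["language"])        # detected when no language is forced
print(result["text"])
for segment in result["segments"]:
    print(f"[{segment['start']:7.2f} {segment['end']:7.2f}] {segment['text']}")
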
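And, per the `word_timestamps` documentation above, reading word-level times, assuming segments carry a "words" list as in upstream whisper:

result = transcribe(model, "audio.mp3", word_timestamps=True,
                    hallucination_silence_threshold=2.0)
for segment in result["segments"]:
    for word in segment.get("words", []):
        print(f"{word['start']:7.2f} {word['end']:7.2f} {word['word']}")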