mirror of https://github.com/openai/whisper.git
synced 2025-11-24 14:35:57 +00:00
add parameter documentation back in

This commit is contained in:
parent b4fd954955
commit e0704ddeba
@@ -273,6 +273,74 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
             clip_timestamps: Union[str, List[float]] = "0",
             hallucination_silence_threshold: Optional[float] = None,
             **decode_options):
+        """
+        Transcribe an audio file using Whisper
+
+        Parameters
+        ----------
+        model: Whisper
+            The Whisper model instance
+
+        verbose: bool
+            Whether to display the text being decoded to the console. If True,
+            displays all the details; if False, displays minimal details; if
+            None, does not display anything.
+
+        temperature: Union[float, Tuple[float, ...]]
+            Temperature for sampling. It can be a tuple of temperatures, which
+            will be successively used upon failures according to either
+            `compression_ratio_threshold` or `logprob_threshold`.
+
+        compression_ratio_threshold: float
+            If the gzip compression ratio is above this value, treat as failed
+
+        logprob_threshold: float
+            If the average log probability over sampled tokens is below this
+            value, treat as failed
+
+        no_speech_threshold: float
+            If the no_speech probability is higher than this value AND the
+            average log probability over sampled tokens is below
+            `logprob_threshold`, consider the segment as silent
+
+        condition_on_previous_text: bool
+            If True, the previous output of the model is provided as a prompt
+            for the next window; disabling may make the text inconsistent across
+            windows, but the model becomes less prone to getting stuck in a
+            failure loop, such as repetition looping or timestamps going out of
+            sync.
+
+        word_timestamps: bool
+            Extract word-level timestamps using the cross-attention pattern and
+            dynamic time warping, and include the timestamps for each word in
+            each segment.
+
+        prepend_punctuations: str
+            If word_timestamps is True, merge these punctuation symbols with the
+            next word
+
+        append_punctuations: str
+            If word_timestamps is True, merge these punctuation symbols with the
+            previous word
+
+        initial_prompt: Optional[str]
+            Optional text to provide as a prompt for the first window. This can
+            be used to provide, or "prompt-engineer", a context for
+            transcription, e.g. custom vocabularies or proper nouns, to make it
+            more likely to predict those words correctly.
+
+        decode_options: dict
+            Keyword arguments to construct `DecodingOptions` instances
+
+        clip_timestamps: Union[str, List[float]]
+            Comma-separated list of start,end,start,end,... timestamps (in
+            seconds) of clips to process. The last end timestamp defaults to
+            the end of the file.
+
+        hallucination_silence_threshold: Optional[float]
+            When word_timestamps is True, skip silent periods longer than this
+            threshold (in seconds) when a possible hallucination is detected
+        """
         self.model = model
         self.verbose = verbose
         self.temperature = temperature
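
The temperature fallback described in the docstring is easy to misread, so here is a minimal standalone sketch of the retry rule it specifies. The `DecodeAttempt` type and the default thresholds (2.4 and -1.0, upstream Whisper's defaults) are assumptions for illustration, not this commit's code:

# A minimal sketch (assumed names, not this repo's code) of the fallback rule:
# retry decoding at the next temperature in the tuple whenever a result trips
# either threshold.
from dataclasses import dataclass

@dataclass
class DecodeAttempt:          # hypothetical stand-in for a decoding result
    compression_ratio: float  # gzip compression ratio of the decoded text
    avg_logprob: float        # average log probability over sampled tokens

def needs_fallback(attempt, compression_ratio_threshold=2.4,
                   logprob_threshold=-1.0):
    if compression_ratio_threshold is not None \
            and attempt.compression_ratio > compression_ratio_threshold:
        return True   # too repetitive: treat as failed
    if logprob_threshold is not None \
            and attempt.avg_logprob < logprob_threshold:
        return True   # model too unsure: treat as failed
    return False

# e.g. with temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0), each decode that
# fails the checks above is retried at the next, higher temperature.
assert needs_fallback(DecodeAttempt(compression_ratio=3.1, avg_logprob=-0.4))
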
@@ -523,6 +591,10 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
         self.latest, self.frame_offset = mel, offset
         content_frames = mel.shape[-1] - N_FRAMES + offset
         content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
+        # NOTE: This loop is obscurely flattened to make the diff readable.
+        # A later commit should turn this into a simpler nested loop.
+        # for seek_clip_start, seek_clip_end in seek_clips:
+        #     while seek < seek_clip_end:
         while self.clip_idx < len(self.seek_clips):
             seek_clip_start, seek_clip_end = self.seek_clips[self.clip_idx]
             seek_clip_end = content_frames if seek_clip_end is None else \
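
For readers untangling the flattened loop, here is a hypothetical sketch of the nested form the NOTE gestures at. The clip names follow the diff (including the `None` end meaning "to the end of the content"), but the generator and its fixed advance step are illustrative only; the real loop advances by the decoded segment:

# Hypothetical nested form of the flattened clip loop (illustrative only):
# iterate clips in order, and within each clip advance a frame cursor until
# the clip's end; a None end means "to the end of the content".
def iter_seeks(seek_clips, content_frames, window=3000):
    seek = 0
    for seek_clip_start, seek_clip_end in seek_clips:
        seek = max(seek, seek_clip_start)
        end = content_frames if seek_clip_end is None else seek_clip_end
        while seek < end:
            yield seek        # one 30-second window is decoded here
            seek += window    # stand-in: real code advances past the segment

print(list(iter_seeks([(0, 4500), (9000, None)], content_frames=12000)))
# -> [0, 3000, 9000]
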
@@ -685,6 +757,8 @@ class ProgressTranscriber(MinimalTranscriber):
         if self._pbar is None:
             n = self.latest.shape[-1] if self.duration is None \
                     else -int(self.duration * -FRAMES_PER_SECOND)
+            # show the progress bar when verbose is False
+            # (if True, transcribed text will be printed)
             self._pbar = tqdm.tqdm(
                 total=n, unit="frames", disable=self.verbose is not False)
             self._pbar.__enter__()
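
The `disable` expression reads backwards at first glance, so a standalone sketch of just that logic may help; it only assumes tqdm's public `disable` keyword, which hides the bar when truthy:

import tqdm

# `disable=verbose is not False` hides the bar unless verbose is exactly
# False: verbose=True prints the transcribed text instead, and verbose=None
# stays silent, so only verbose=False gets a progress bar.
for verbose in (True, None, False):
    with tqdm.tqdm(total=100, unit="frames",
                   disable=verbose is not False) as pbar:
        pbar.update(100)      # only the verbose=False pass renders a bar
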
@@ -714,6 +788,23 @@ def transcribe(
         model: "Whisper",
         audio: Union[str, np.ndarray, torch.Tensor],
         **kw):
+    """
+    Transcribe an audio file using Whisper
+
+    Parameters
+    ----------
+    model: Whisper
+        The Whisper model instance
+
+    audio: Union[str, np.ndarray, torch.Tensor]
+        The path to the audio file to open, or the audio waveform
+
+    Returns
+    -------
+    A dictionary containing the resulting text ("text") and segment-level
+    details ("segments"), and the spoken language ("language"), which is
+    detected when `decode_options["language"]` is None.
+    """
     return ProgressTranscriber(model, **kw)(audio_tensor(audio))
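
Finally, a hedged usage sketch of the documented entry point; it assumes this fork keeps upstream Whisper's module layout (`whisper.load_model` and a module-level `transcribe`), and the file name is a placeholder:

import whisper

model = whisper.load_model("base")            # any model size works here
result = whisper.transcribe(model, "audio.mp3", word_timestamps=True)

print(result["language"])                     # detected when not specified
print(result["text"])                         # full transcription
for segment in result["segments"]:            # segment-level details
    print(f"[{segment['start']:.2f} -> {segment['end']:.2f}] {segment['text']}")
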