mirror of https://github.com/openai/whisper.git
synced 2025-11-24 14:35:57 +00:00
add parameter documentation back in

This commit is contained in:
parent b4fd954955
commit e0704ddeba
@@ -273,6 +273,74 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
             clip_timestamps: Union[str, List[float]] = "0",
             hallucination_silence_threshold: Optional[float] = None,
             **decode_options):
+        """
+        Transcribe an audio file using Whisper
+
+        Parameters
+        ----------
+        model: Whisper
+            The Whisper model instance
+
+        verbose: bool
+            Whether to display the text being decoded to the console. If True,
+            displays all the details; if False, displays minimal details; if
+            None, does not display anything.
+
+        temperature: Union[float, Tuple[float, ...]]
+            Temperature for sampling. It can be a tuple of temperatures, which
+            will be successively used upon failures according to either
+            `compression_ratio_threshold` or `logprob_threshold`.
+
+        compression_ratio_threshold: float
+            If the gzip compression ratio is above this value, treat as failed
+
+        logprob_threshold: float
+            If the average log probability over sampled tokens is below this
+            value, treat as failed
+
+        no_speech_threshold: float
+            If the no_speech probability is higher than this value AND the
+            average log probability over sampled tokens is below
+            `logprob_threshold`, consider the segment as silent
+
+        condition_on_previous_text: bool
+            If True, the previous output of the model is provided as a prompt
+            for the next window; disabling may make the text inconsistent across
+            windows, but the model becomes less prone to getting stuck in a
+            failure loop, such as repetition looping or timestamps going out of
+            sync.
+
+        word_timestamps: bool
+            Extract word-level timestamps using the cross-attention pattern and
+            dynamic time warping, and include the timestamps for each word in
+            each segment.
+
+        prepend_punctuations: str
+            If word_timestamps is True, merge these punctuation symbols with the
+            next word
+
+        append_punctuations: str
+            If word_timestamps is True, merge these punctuation symbols with the
+            previous word
+
+        initial_prompt: Optional[str]
+            Optional text to provide as a prompt for the first window. This can
+            be used to provide, or "prompt-engineer", a context for
+            transcription, e.g. custom vocabularies or proper nouns, to make it
+            more likely to predict those words correctly.
+
+        decode_options: dict
+            Keyword arguments to construct `DecodingOptions` instances
+
+        clip_timestamps: Union[str, List[float]]
+            Comma-separated list of start,end,start,end,... timestamps (in
+            seconds) of clips to process. The last end timestamp defaults to
+            the end of the file.
+
+        hallucination_silence_threshold: Optional[float]
+            When word_timestamps is True, skip silent periods longer than this
+            threshold (in seconds) when a possible hallucination is detected
+        """
         self.model = model
         self.verbose = verbose
         self.temperature = temperature
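
The temperature fallback described in the docstring is easy to misread, so here is a minimal standalone sketch of the retry rule it specifies. The `DecodeAttempt` type and the default thresholds (2.4 and -1.0, upstream Whisper's defaults) are assumptions for illustration, not this commit's code:

# A minimal sketch (assumed names, not this repo's code) of the fallback rule:
# retry decoding at the next temperature in the tuple whenever a result trips
# either threshold.
from dataclasses import dataclass

@dataclass
class DecodeAttempt:          # hypothetical stand-in for a decoding result
    compression_ratio: float  # gzip compression ratio of the decoded text
    avg_logprob: float        # average log probability over sampled tokens

def needs_fallback(attempt, compression_ratio_threshold=2.4,
                   logprob_threshold=-1.0):
    if compression_ratio_threshold is not None \
            and attempt.compression_ratio > compression_ratio_threshold:
        return True   # too repetitive: treat as failed
    if logprob_threshold is not None \
            and attempt.avg_logprob < logprob_threshold:
        return True   # model too unsure: treat as failed
    return False

# e.g. with temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0), each decode that
# fails the checks above is retried at the next, higher temperature.
assert needs_fallback(DecodeAttempt(compression_ratio=3.1, avg_logprob=-0.4))
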
@@ -523,6 +591,10 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
         self.latest, self.frame_offset = mel, offset
         content_frames = mel.shape[-1] - N_FRAMES + offset
         content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
+        # NOTE: This loop is obscurely flattened to make the diff readable.
+        # A later commit should turn this into a simpler nested loop.
+        # for seek_clip_start, seek_clip_end in seek_clips:
+        #     while seek < seek_clip_end:
         while self.clip_idx < len(self.seek_clips):
             seek_clip_start, seek_clip_end = self.seek_clips[self.clip_idx]
             seek_clip_end = content_frames if seek_clip_end is None else \
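
For readers untangling the flattened loop, here is a hypothetical sketch of the nested form the NOTE gestures at. The clip names follow the diff (including the `None` end meaning "to the end of the content"), but the generator and its fixed advance step are illustrative only; the real loop advances by the decoded segment:

# Hypothetical nested form of the flattened clip loop (illustrative only):
# iterate clips in order, and within each clip advance a frame cursor until
# the clip's end; a None end means "to the end of the content".
def iter_seeks(seek_clips, content_frames, window=3000):
    seek = 0
    for seek_clip_start, seek_clip_end in seek_clips:
        seek = max(seek, seek_clip_start)
        end = content_frames if seek_clip_end is None else seek_clip_end
        while seek < end:
            yield seek        # one 30-second window is decoded here
            seek += window    # stand-in: real code advances past the segment

print(list(iter_seeks([(0, 4500), (9000, None)], content_frames=12000)))
# -> [0, 3000, 9000]
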
@@ -685,6 +757,8 @@ class ProgressTranscriber(MinimalTranscriber):
         if self._pbar is None:
             n = self.latest.shape[-1] if self.duration is None \
                     else -int(self.duration * -FRAMES_PER_SECOND)
+            # show the progress bar when verbose is False
+            # (if True, transcribed text will be printed)
             self._pbar = tqdm.tqdm(
                 total=n, unit="frames", disable=self.verbose is not False)
             self._pbar.__enter__()
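
The `disable` expression reads backwards at first glance, so a standalone sketch of just that logic may help; it only assumes tqdm's public `disable` keyword, which hides the bar when truthy:

import tqdm

# `disable=verbose is not False` hides the bar unless verbose is exactly
# False: verbose=True prints the transcribed text instead, and verbose=None
# stays silent, so only verbose=False gets a progress bar.
for verbose in (True, None, False):
    with tqdm.tqdm(total=100, unit="frames",
                   disable=verbose is not False) as pbar:
        pbar.update(100)      # only the verbose=False pass renders a bar
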
@@ -714,6 +788,23 @@ def transcribe(
         model: "Whisper",
         audio: Union[str, np.ndarray, torch.Tensor],
         **kw):
+    """
+    Transcribe an audio file using Whisper
+
+    Parameters
+    ----------
+    model: Whisper
+        The Whisper model instance
+
+    audio: Union[str, np.ndarray, torch.Tensor]
+        The path to the audio file to open, or the audio waveform
+
+    Returns
+    -------
+    A dictionary containing the resulting text ("text") and segment-level
+    details ("segments"), and the spoken language ("language"), which is
+    detected when `decode_options["language"]` is None.
+    """
     return ProgressTranscriber(model, **kw)(audio_tensor(audio))
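
Finally, a hedged usage sketch of the documented entry point; it assumes this fork keeps upstream Whisper's module layout (`whisper.load_model` and a module-level `transcribe`), and the file name is a placeholder:

import whisper

model = whisper.load_model("base")            # any model size works here
result = whisper.transcribe(model, "audio.mp3", word_timestamps=True)

print(result["language"])                     # detected when not specified
print(result["text"])                         # full transcription
for segment in result["segments"]:            # segment-level details
    print(f"[{segment['start']:.2f} -> {segment['end']:.2f}] {segment['text']}")
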