mirror of https://github.com/openai/whisper.git
synced 2025-11-24 06:26:03 +00:00
add parameter documentation back in
This commit is contained in:
parent
b4fd954955
commit
e0704ddeba
@@ -273,6 +273,74 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
            clip_timestamps: Union[str, List[float]] = "0",
            hallucination_silence_threshold: Optional[float] = None,
            **decode_options):
        """
        Transcribe an audio file using Whisper

        Parameters
        ----------
        model: Whisper
            The Whisper model instance

        verbose: bool
            Whether to display the text being decoded to the console. If True,
            displays all the details; if False, displays minimal details; if
            None, does not display anything.

        temperature: Union[float, Tuple[float, ...]]
            Temperature for sampling. It can be a tuple of temperatures, which
            will be successively used upon failures according to either
            `compression_ratio_threshold` or `logprob_threshold`.

        compression_ratio_threshold: float
            If the gzip compression ratio is above this value, treat the
            decoding as failed

        logprob_threshold: float
            If the average log probability over sampled tokens is below this
            value, treat the decoding as failed

        no_speech_threshold: float
            If the no_speech probability is higher than this value AND the
            average log probability over sampled tokens is below
            `logprob_threshold`, consider the segment as silent

        condition_on_previous_text: bool
            If True, the previous output of the model is provided as a prompt
            for the next window; disabling may make the text inconsistent
            across windows, but the model becomes less prone to getting stuck
            in a failure loop, such as repetition looping or timestamps going
            out of sync.

        word_timestamps: bool
            Extract word-level timestamps using the cross-attention pattern
            and dynamic time warping, and include the timestamps for each word
            in each segment.

        prepend_punctuations: str
            If word_timestamps is True, merge these punctuation symbols with
            the next word

        append_punctuations: str
            If word_timestamps is True, merge these punctuation symbols with
            the previous word

        initial_prompt: Optional[str]
            Optional text to provide as a prompt for the first window. This
            can be used to provide, or "prompt-engineer", a context for
            transcription, e.g. custom vocabularies or proper nouns, to make
            it more likely to predict those words correctly.

        decode_options: dict
            Keyword arguments to construct `DecodingOptions` instances

        clip_timestamps: Union[str, List[float]]
            Comma-separated list of start,end,start,end,... timestamps (in
            seconds) of clips to process. The last end timestamp defaults to
            the end of the file.

        hallucination_silence_threshold: Optional[float]
            When word_timestamps is True, skip silent periods longer than this
            threshold (in seconds) when a possible hallucination is detected
        """
        self.model = model
        self.verbose = verbose
        self.temperature = temperature
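
Note: the interplay of `temperature`, `compression_ratio_threshold`, and `logprob_threshold` documented above amounts to a retry loop. Below is a minimal sketch under upstream whisper's defaults; `decode_once` is a hypothetical stand-in for a single `DecodingOptions`-driven decode pass, not this module's API:

def decode_once(segment, temperature):
    raise NotImplementedError  # hypothetical: one decode pass at a fixed temperature

def decode_with_fallback(segment, temperatures=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
                         compression_ratio_threshold=2.4,
                         logprob_threshold=-1.0):
    result = None
    for t in temperatures:
        result = decode_once(segment, temperature=t)
        failed = (
            (compression_ratio_threshold is not None
             and result.compression_ratio > compression_ratio_threshold)
            or (logprob_threshold is not None
                and result.avg_logprob < logprob_threshold))
        if not failed:  # good enough: stop escalating the temperature
            return result
    return result  # every temperature failed; keep the last attempt
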
@@ -523,6 +591,10 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
        self.latest, self.frame_offset = mel, offset
        content_frames = mel.shape[-1] - N_FRAMES + offset
        content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
        # NOTE: This loop is obscurely flattened to make the diff readable.
        # A later commit should turn this into a simpler nested loop:
        #     for seek_clip_start, seek_clip_end in seek_clips:
        #         while seek < seek_clip_end:
        while self.clip_idx < len(self.seek_clips):
            seek_clip_start, seek_clip_end = self.seek_clips[self.clip_idx]
            seek_clip_end = content_frames if seek_clip_end is None else \
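
Note: the `seek_clips` consumed by this loop come from the `clip_timestamps` parameter documented above. A sketch of the derivation as upstream whisper performs it; this fork evidently defers the open-ended final end (hence the `seek_clip_end is None` check above), so the exact parsing here may differ:

FRAMES_PER_SECOND = 100  # mel frames per second in upstream whisper

def parse_clip_timestamps(clip_timestamps, content_frames):
    # Pair "start,end,start,end,..." values into (start, end) frame offsets;
    # an unpaired trailing start is closed with the end of the file.
    if isinstance(clip_timestamps, str):
        clip_timestamps = ([float(ts) for ts in clip_timestamps.split(",")]
                           if clip_timestamps else [])
    seek_points = [round(ts * FRAMES_PER_SECOND) for ts in clip_timestamps]
    if not seek_points:
        seek_points.append(0)
    if len(seek_points) % 2 == 1:
        seek_points.append(content_frames)
    return list(zip(seek_points[::2], seek_points[1::2]))

assert parse_clip_timestamps("0,30,60", content_frames=9000) == [(0, 3000), (6000, 9000)]
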
@@ -685,6 +757,8 @@ class ProgressTranscriber(MinimalTranscriber):
        if self._pbar is None:
            n = self.latest.shape[-1] if self.duration is None \
                else -int(self.duration * -FRAMES_PER_SECOND)
            # show the progress bar when verbose is False
            # (if True, transcribed text will be printed)
            self._pbar = tqdm.tqdm(
                total=n, unit="frames", disable=self.verbose is not False)
            self._pbar.__enter__()
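
Note on `-int(self.duration * -FRAMES_PER_SECOND)`: this reads like the classic negate-and-floor ceiling trick, but Python's int() truncates toward zero, so the negation is a no-op for any product. A plain-Python demonstration (the behavior is certain; that a ceiling was intended is an assumption):

import math

x = 250.5                # e.g. 2.505 s at 100 frames/s
print(int(-x))           # -250: int() truncates toward zero
print(math.floor(-x))    # -251: floor() rounds toward negative infinity
print(-int(-x))          # 250 == int(x); negating changed nothing
print(-math.floor(-x))   # 251 == math.ceil(x): the actual ceiling trick

If rounding the frame count up was the intent, `math.ceil(self.duration * FRAMES_PER_SECOND)` says so directly.
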
@@ -714,6 +788,23 @@ def transcribe(
    model: "Whisper",
    audio: Union[str, np.ndarray, torch.Tensor],
    **kw):
    """
    Transcribe an audio file using Whisper

    Parameters
    ----------
    model: Whisper
        The Whisper model instance

    audio: Union[str, np.ndarray, torch.Tensor]
        The path to the audio file to open, or the audio waveform

    Returns
    -------
    A dictionary containing the resulting text ("text") and segment-level
    details ("segments"), and the spoken language ("language"), which is
    detected when `decode_options["language"]` is None.
    """
    return ProgressTranscriber(model, **kw)(audio_tensor(audio))
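
Usage sketch for the wrapper above, assuming this fork keeps upstream whisper's `load_model` entry point and segment schema; the file name and option values are illustrative:

import whisper

model = whisper.load_model("base")
result = transcribe(
    model,
    "audio.mp3",                 # or a waveform ndarray/tensor
    verbose=False,               # False: progress bar instead of printed text
    temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    initial_prompt="Glossary: Whisper, PyTorch, mel spectrogram",
    clip_timestamps="0,30",      # only transcribe the first 30 seconds
)
print(result["language"])        # detected when no language is forced
print(result["text"])
for segment in result["segments"]:
    print(f"[{segment['start']:7.2f} {segment['end']:7.2f}] {segment['text']}")
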
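And, per the `word_timestamps` documentation above, reading word-level times, assuming segments carry a "words" list as in upstream whisper:

result = transcribe(model, "audio.mp3", word_timestamps=True,
                    hallucination_silence_threshold=2.0)
for segment in result["segments"]:
    for word in segment.get("words", []):
        print(f"{word['start']:7.2f} {word['end']:7.2f} {word['word']}")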