From e0704ddeba84a475ff09674fea043bfe8eb25b48 Mon Sep 17 00:00:00 2001
From: Kent Slaney
Date: Sun, 14 Jul 2024 16:24:14 -0600
Subject: [PATCH] add parameter documentation back in

---
 whisper/transcribe.py | 91 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index 17ec3cc..0964c8a 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -273,6 +273,74 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
             clip_timestamps: Union[str, List[float]] = "0",
             hallucination_silence_threshold: Optional[float] = None,
             **decode_options):
+        """
+        Transcribe an audio file using Whisper
+
+        Parameters
+        ----------
+        model: Whisper
+            The Whisper model instance
+
+        verbose: bool
+            Whether to display the text being decoded to the console. If True,
+            displays all the details; if False, displays minimal details; if
+            None, does not display anything
+
+        temperature: Union[float, Tuple[float, ...]]
+            Temperature for sampling. It can be a tuple of temperatures, which
+            will be successively used upon failures according to either
+            `compression_ratio_threshold` or `logprob_threshold`.
+
+        compression_ratio_threshold: float
+            If the gzip compression ratio is above this value, treat as failed
+
+        logprob_threshold: float
+            If the average log probability over sampled tokens is below this
+            value, treat as failed
+
+        no_speech_threshold: float
+            If the no_speech probability is higher than this value AND the
+            average log probability over sampled tokens is below
+            `logprob_threshold`, consider the segment as silent
+
+        condition_on_previous_text: bool
+            If True, the previous output of the model is provided as a prompt
+            for the next window; disabling may make the text inconsistent
+            across windows, but the model becomes less prone to getting stuck
+            in a failure loop, such as repetition looping or timestamps going
+            out of sync.
+
+        word_timestamps: bool
+            Extract word-level timestamps using the cross-attention pattern
+            and dynamic time warping, and include the timestamps for each word
+            in each segment.
+
+        prepend_punctuations: str
+            If word_timestamps is True, merge these punctuation symbols with
+            the next word
+
+        append_punctuations: str
+            If word_timestamps is True, merge these punctuation symbols with
+            the previous word
+
+        initial_prompt: Optional[str]
+            Optional text to provide as a prompt for the first window. This
+            can be used to provide, or "prompt-engineer", a context for
+            transcription, e.g. custom vocabularies or proper nouns, to make
+            it more likely to predict those words correctly.
+
+        clip_timestamps: Union[str, List[float]]
+            Comma-separated list of start,end,start,end,... timestamps (in
+            seconds) of clips to process. The last end timestamp defaults to
+            the end of the file.
+
+        hallucination_silence_threshold: Optional[float]
+            When word_timestamps is True, skip silent periods longer than this
+            threshold (in seconds) when a possible hallucination is detected
+
+        decode_options: dict
+            Keyword arguments used to construct `DecodingOptions` instances
+        """
         self.model = model
         self.verbose = verbose
         self.temperature = temperature
@@ -523,6 +591,10 @@ class Transcriber(metaclass=PassthroughPropertyDefaults):
         self.latest, self.frame_offset = mel, offset
         content_frames = mel.shape[-1] - N_FRAMES + offset
         content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
+        # NOTE: this loop is deliberately flattened to keep the diff against
+        # upstream readable; a later commit should restore the nested form:
+        # for seek_clip_start, seek_clip_end in seek_clips:
+        #     while seek < seek_clip_end:
         while self.clip_idx < len(self.seek_clips):
             seek_clip_start, seek_clip_end = self.seek_clips[self.clip_idx]
             seek_clip_end = content_frames if seek_clip_end is None else \
@@ -685,6 +757,8 @@ class ProgressTranscriber(MinimalTranscriber):
         if self._pbar is None:
             n = self.latest.shape[-1] if self.duration is None \
                     else -int(self.duration * -FRAMES_PER_SECOND)
+            # show the progress bar only when verbose is False
+            # (if verbose is True, the transcribed text is printed instead)
             self._pbar = tqdm.tqdm(
                     total=n, unit="frames", disable=self.verbose is not False)
             self._pbar.__enter__()
@@ -714,6 +788,23 @@ def transcribe(
         model: "Whisper",
         audio: Union[str, np.ndarray, torch.Tensor],
         **kw):
+    """
+    Transcribe an audio file using Whisper
+
+    Parameters
+    ----------
+    model: Whisper
+        The Whisper model instance
+
+    audio: Union[str, np.ndarray, torch.Tensor]
+        The path to the audio file to open, or the audio waveform
+
+    Returns
+    -------
+    A dictionary containing the resulting text ("text"), the segment-level
+    details ("segments"), and the spoken language ("language"), which is
+    detected when `decode_options["language"]` is None.
+    """
     return ProgressTranscriber(model, **kw)(audio_tensor(audio))
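
For context on the NOTE in the second hunk: a minimal sketch of the nested
form a later commit could restore, assuming `seek` is advanced by the
window-processing body; `process_window` is a hypothetical stand-in for that
body, not a name from this codebase:

    # hypothetical restructuring of the flattened while-loop over
    # self.clip_idx into the nested form the comment sketches
    for seek_clip_start, seek_clip_end in self.seek_clips:
        seek = max(seek, seek_clip_start)
        while seek < seek_clip_end:
            # process one window and advance the read position;
            # process_window is hypothetical, not part of this patch
            seek = self.process_window(seek, seek_clip_end)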
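
As a usage note, a minimal sketch of how the documented parameters reach the
new entry point, assuming the keyword arguments flow through `**kw` into
`Transcriber.__init__` as the diff suggests; the model size, file name, and
parameter values are illustrative only:

    import whisper
    from whisper.transcribe import transcribe

    model = whisper.load_model("base")
    result = transcribe(
        model, "audio.mp3",
        verbose=False,                # False: show the progress bar, not text
        temperature=(0.0, 0.2, 0.4),  # fallback temperatures on failed decodes
        word_timestamps=True,         # per-word timing via dynamic time warping
        clip_timestamps="0,30",       # transcribe only the first 30 seconds
    )
    print(result["language"], result["text"])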