Update transcribe.py

2025-11-24 14:35:57 +00:00 · 2024-11-01 20:33:52 +03:00 · 2024-11-01 20:33:52 +03:00 · 482a5b89d8
commit 482a5b89d8
parent 80ddd07c28
1 changed files with 18 additions and 5 deletions
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@ -76,8 +76,8 @@ def transcribe(
    compression_ratio_threshold: float
        If the gzip compression ratio is above this value, treat as failed

-    compression_ratio_hallucination_threshold: float
-        If the gzip compression ratio is above this value after all attempts to decode, treat as a halucination and skip
+    compression_ratio_hallucination_threshold: float
+        If the gzip compression ratio is above this value after all attempts to decode, treat as a hallucination and skip

    logprob_threshold: float
        If the average log probability over sampled tokens is below this value, treat as failed
@ -106,6 +106,11 @@ def transcribe(
        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
        to make it more likely to predict those word correctly.

+    carry_initial_prompt: bool
+        If carry_initial_prompt is True, `initial_prompt` is prepended to the prompt of each internal
+        `decode()` call. If the prompt budget cannot also hold all of the previous-text tokens,
+        the oldest of those tokens are dropped (sliced from the left) to make space.
+
    decode_options: dict
        Keyword arguments to construct `DecodingOptions` instances

@ -221,8 +226,6 @@ def transcribe(
            ):
            # Discard the segment
                continue  # Skip to the next segment
-
-
            if not needs_fallback:
                break

@ -240,9 +243,11 @@ def transcribe(
    all_segments = []
    prompt_reset_since = 0

+    remaining_prompt_length = model.dims.n_text_ctx // 2 - 1
    if initial_prompt is not None:
        initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
        all_tokens.extend(initial_prompt_tokens)
+        remaining_prompt_length -= len(initial_prompt_tokens)
    else:
        initial_prompt_tokens = []

@ -288,7 +293,13 @@ def transcribe(
            segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
            mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)

-            decode_options["prompt"] = all_tokens[prompt_reset_since:]
+            if carry_initial_prompt:
+                nignored = max(len(initial_prompt_tokens), prompt_reset_since)
+                remaining_prompt = all_tokens[nignored:][-remaining_prompt_length:]
+                decode_options["prompt"] = initial_prompt_tokens + remaining_prompt
+            else:
+                decode_options["prompt"] = all_tokens[prompt_reset_since:]
+
            result: DecodingResult = decode_with_fallback(mel_segment)
            tokens = torch.tensor(result.tokens)

@ -542,6 +553,8 @@ def cli():

    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
+    parser.add_argument("--carry_initial_prompt", type=str2bool, default=False, help="if True, prepend initial_prompt to every internal decode() call. May reduce the effectiveness of condition_on_previous_text")
+
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")