Merge f38acdff61da797d394a98dbc436587fbc10c663 into c0d2f624c09dc18e709e37c2ad90c039a4eb72a2

2025-11-23 22:15:58 +00:00 · 2025-10-02 23:41:37 +02:00 · 2025-10-02 23:41:37 +02:00 · 88792db279
commit 88792db279
parent c0d2f624c0 f38acdff61
2 changed files with 10 additions and 2 deletions
--- a/whisper/timing.py
+++ b/whisper/timing.py
@ -2,7 +2,7 @@ import itertools
 import subprocess
 import warnings
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional, Callable

 import numba
 import numpy as np
@ -286,6 +286,7 @@ def add_word_timestamps(
    prepend_punctuations: str = "\"'“¿([{-",
    append_punctuations: str = "\"'.。,，!！?？:：”)]}、",
    last_speech_timestamp: float,
+    word_stream_callback: Optional[Callable] = None,
    **kwargs,
 ):
    if len(segments) == 0:
@ -329,6 +330,8 @@ def add_word_timestamps(
            timing = alignment[word_index]

            if timing.word:
+                if word_stream_callback is not None:
+                    word_stream_callback(timing)
                words.append(
                    dict(
                        word=timing.word,
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@ -2,7 +2,7 @@ import argparse
 import os
 import traceback
 import warnings
-from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union, Callable

 import numpy as np
 import torch
@ -40,6 +40,7 @@ def transcribe(
    audio: Union[str, np.ndarray, torch.Tensor],
    *,
    verbose: Optional[bool] = None,
+    word_stream_callback: Optional[Callable] = None,
    temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    compression_ratio_threshold: Optional[float] = 2.4,
    logprob_threshold: Optional[float] = -1.0,
@ -69,6 +70,9 @@ def transcribe(
        Whether to display the text being decoded to the console. If True, displays all the details,
        If False, displays minimal details. If None, does not display anything

+    word_stream_callback: Optional[Callable]
+        If given, called with each aligned word's timing as soon as it is ready, while later audio is still pending.
+
    temperature: Union[float, Tuple[float, ...]]
        Temperature for sampling. It can be a tuple of temperatures, which will be successively used
        upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
@ -408,6 +412,7 @@ def transcribe(
                    prepend_punctuations=prepend_punctuations,
                    append_punctuations=append_punctuations,
                    last_speech_timestamp=last_speech_timestamp,
+                    word_stream_callback=word_stream_callback
                )

                if not single_timestamp_ending: