Merge 5f850028a771439cd61a6c6f290f34f0334d2c66 into c0d2f624c09dc18e709e37c2ad90c039a4eb72a2

This commit is contained in:
Shashank Prasanna 2025-07-16 23:15:52 -04:00 committed by GitHub
commit c0287e9184
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 35 additions and 2 deletions

BIN
tests/fdr.mp3 Normal file

Binary file not shown.

View File

@ -0,0 +1,27 @@
import os
import pytest
import torch
import whisper
def test_progress_callback():
    """Transcribe a short fixture clip and check that the progress
    callback is invoked at least once and finishes at exactly 100.0."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model("tiny").to(device)
    audio_path = os.path.join(os.path.dirname(__file__), "fdr.mp3")

    reported = []

    model.transcribe(
        audio_path,
        language="en",
        # verbose=False only controls console output; the callback fires regardless
        verbose=False,
        progress_callback=reported.append,
    )

    print(reported)
    assert len(reported) > 0
    assert reported[-1] == 100.0

View File

@ -40,6 +40,7 @@ def transcribe(
audio: Union[str, np.ndarray, torch.Tensor], audio: Union[str, np.ndarray, torch.Tensor],
*, *,
verbose: Optional[bool] = None, verbose: Optional[bool] = None,
progress_callback: Optional[callable] = None,
temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
compression_ratio_threshold: Optional[float] = 2.4, compression_ratio_threshold: Optional[float] = 2.4,
logprob_threshold: Optional[float] = -1.0, logprob_threshold: Optional[float] = -1.0,
@ -138,6 +139,7 @@ def transcribe(
# Pad 30-seconds of silence to the input audio, for slicing # Pad 30-seconds of silence to the input audio, for slicing
mel = log_mel_spectrogram(audio, model.dims.n_mels, padding=N_SAMPLES) mel = log_mel_spectrogram(audio, model.dims.n_mels, padding=N_SAMPLES)
content_frames = mel.shape[-1] - N_FRAMES content_frames = mel.shape[-1] - N_FRAMES
curr_frames = 0
content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE) content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
if decode_options.get("language", None) is None: if decode_options.get("language", None) is None:
@ -262,7 +264,7 @@ def transcribe(
# show the progress bar when verbose is False (if True, transcribed text will be printed) # show the progress bar when verbose is False (if True, transcribed text will be printed)
with tqdm.tqdm( with tqdm.tqdm(
total=content_frames, unit="frames", disable=verbose is not False total=content_frames, unit="frame", disable=verbose is not False
) as pbar: ) as pbar:
last_speech_timestamp = 0.0 last_speech_timestamp = 0.0
# NOTE: This loop is obscurely flattened to make the diff readable. # NOTE: This loop is obscurely flattened to make the diff readable.
@ -505,7 +507,11 @@ def transcribe(
prompt_reset_since = len(all_tokens) prompt_reset_since = len(all_tokens)
# update progress bar # update progress bar
pbar.update(min(content_frames, seek) - previous_seek) frames_processed = min(content_frames, seek) - previous_seek
if progress_callback is not None:
curr_frames = frames_processed + curr_frames
progress_callback(curr_frames / content_frames * 100)
pbar.update(frames_processed)
return dict( return dict(
text=tokenizer.decode(all_tokens[len(initial_prompt_tokens) :]), text=tokenizer.decode(all_tokens[len(initial_prompt_tokens) :]),