From 32d55d5d76c9ecbe2dfa3e6735896c648156ab63 Mon Sep 17 00:00:00 2001
From: Jianan Xing <1633398+xingjianan@users.noreply.github.com>
Date: Tue, 10 Sep 2024 09:53:08 -0700
Subject: [PATCH 01/18] Relax triton requirements for compatibility with
 pytorch 2.4 and newer (#2307)

* Relax triton requirements for compatibility with pytorch 2.4 and newer

Similar to https://github.com/openai/whisper/pull/1802, but now that pytorch has been upgraded to 2.4, it requires triton==3.0.0. I am not sure whether it makes sense to remove the upper-bound version constraint entirely.

* Update requirements.txt
---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 62f5f9d..8ee5920 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,4 @@ torch
 tqdm
 more-itertools
 tiktoken
-triton>=2.0.0,<3;platform_machine=="x86_64" and sys_platform=="linux" or sys_platform=="linux2"
+triton>=2.0.0;platform_machine=="x86_64" and sys_platform=="linux" or sys_platform=="linux2"
diff --git a/setup.py b/setup.py
index 183b527..73c4eb8 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ def read_version(fname="whisper/version.py"):
 
 requirements = []
 if sys.platform.startswith("linux") and platform.machine() == "x86_64":
-    requirements.append("triton>=2.0.0,<3")
+    requirements.append("triton>=2.0.0")
 
 setup(
     name="openai-whisper",

From 279133e3107392276dc509148da1f41bfb532c7e Mon Sep 17 00:00:00 2001
From: Jong Wook Kim <jongwook@openai.com>
Date: Tue, 10 Sep 2024 10:43:21 -0700
Subject: [PATCH 02/18] pinning numpy<2 in tests (#2332)

* pinning numpy<2 in tests

* pip install together

* pip install together
---
 .github/workflows/test.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index dffc17c..1eaf505 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -49,8 +49,7 @@ jobs:
     steps:
       - uses: conda-incubator/setup-miniconda@v2
       - run: conda install -n test ffmpeg python=${{ matrix.python-version }}
-      - run: pip3 install torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu
       - uses: actions/checkout@v3
       - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH
-      - run: pip install .["dev"]
+      - run: pip3 install .["dev"] 'numpy<2' torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple
       - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda'

From 423492dda7806206abe56bdfe427c1096473a020 Mon Sep 17 00:00:00 2001
From: Jong Wook Kim <jongwook@nyu.edu>
Date: Fri, 27 Sep 2024 16:43:58 -0700
Subject: [PATCH 03/18] Release 20240927

---
 CHANGELOG.md       | 7 +++++++
 whisper/version.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5895541..3f09538 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # CHANGELOG
 
+## [v20240927](https://github.com/openai/whisper/releases/tag/v20240927)
+
+* pinning numpy<2 in tests ([#2332](https://github.com/openai/whisper/pull/2332))
+* Relax triton requirements for compatibility with pytorch 2.4 and newer ([#2307](https://github.com/openai/whisper/pull/2307))
+* Skip silence around hallucinations ([#1838](https://github.com/openai/whisper/pull/1838))
+* Fix triton env marker ([#1887](https://github.com/openai/whisper/pull/1887))
+
 ## [v20231117](https://github.com/openai/whisper/releases/tag/v20231117)
 
 * Relax triton requirements for compatibility with pytorch 2.1 and newer ([#1802](https://github.com/openai/whisper/pull/1802))
diff --git a/whisper/version.py b/whisper/version.py
index c96dd9c..2242d25 100644
--- a/whisper/version.py
+++ b/whisper/version.py
@@ -1 +1 @@
-__version__ = "20231117"
+__version__ = "20240927"

From 27f971320a50e65fd510b88be04219a6ade31f9b Mon Sep 17 00:00:00 2001
From: Jong Wook Kim <jongwook@openai.com>
Date: Mon, 30 Sep 2024 10:27:14 -0700
Subject: [PATCH 04/18] using sdpa if available (#2359)

* using sdpa if available

* Update model.py
---
 whisper/model.py  | 51 +++++++++++++++++++++++++++++++++++++----------
 whisper/timing.py |  4 +++-
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/whisper/model.py b/whisper/model.py
index a678283..e537447 100644
--- a/whisper/model.py
+++ b/whisper/model.py
@@ -1,7 +1,8 @@
 import base64
 import gzip
+from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Dict, Iterable, Optional
+from typing import Dict, Iterable, Optional, Tuple
 
 import numpy as np
 import torch
@@ -12,6 +13,14 @@ from .decoding import decode as decode_function
 from .decoding import detect_language as detect_language_function
 from .transcribe import transcribe as transcribe_function
 
+try:
+    from torch.nn.functional import scaled_dot_product_attention
+
+    SDPA_AVAILABLE = True
+except (ImportError, RuntimeError, OSError):
+    scaled_dot_product_attention = None
+    SDPA_AVAILABLE = False
+
 
 @dataclass
 class ModelDimensions:
@@ -59,7 +68,19 @@ def sinusoids(length, channels, max_timescale=10000):
     return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
 
 
+@contextmanager
+def disable_sdpa():
+    prev_state = MultiHeadAttention.use_sdpa
+    try:
+        MultiHeadAttention.use_sdpa = False
+        yield
+    finally:
+        MultiHeadAttention.use_sdpa = prev_state
+
+
 class MultiHeadAttention(nn.Module):
+    use_sdpa = True
+
     def __init__(self, n_state: int, n_head: int):
         super().__init__()
         self.n_head = n_head
@@ -92,20 +113,30 @@ class MultiHeadAttention(nn.Module):
 
     def qkv_attention(
         self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None
-    ):
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         n_batch, n_ctx, n_state = q.shape
         scale = (n_state // self.n_head) ** -0.25
-        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale
-        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale
+        q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
+        k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
         v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3)
 
-        qk = q @ k
-        if mask is not None:
-            qk = qk + mask[:n_ctx, :n_ctx]
-        qk = qk.float()
+        if SDPA_AVAILABLE and MultiHeadAttention.use_sdpa:
+            a = scaled_dot_product_attention(
+                q, k, v, is_causal=mask is not None and n_ctx > 1
+            )
+            out = a.permute(0, 2, 1, 3).flatten(start_dim=2)
+            qk = None
+        else:
+            qk = (q * scale) @ (k * scale).transpose(-1, -2)
+            if mask is not None:
+                qk = qk + mask[:n_ctx, :n_ctx]
+            qk = qk.float()
 
-        w = F.softmax(qk, dim=-1).to(q.dtype)
-        return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()
+            w = F.softmax(qk, dim=-1).to(q.dtype)
+            out = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
+            qk = qk.detach()
+
+        return out, qk
 
 
 class ResidualAttentionBlock(nn.Module):
diff --git a/whisper/timing.py b/whisper/timing.py
index b695ead..e563414 100644
--- a/whisper/timing.py
+++ b/whisper/timing.py
@@ -191,7 +191,9 @@ def find_alignment(
         for i, block in enumerate(model.decoder.blocks)
     ]
 
-    with torch.no_grad():
+    from .model import disable_sdpa
+
+    with torch.no_grad(), disable_sdpa():
         logits = model(mel.unsqueeze(0), tokens.unsqueeze(0))[0]
         sampled_logits = logits[len(tokenizer.sot_sequence) :, : tokenizer.eot]
         token_probs = sampled_logits.softmax(dim=-1)

From b66b46f32dd3934edd3e79b2821357f52d388501 Mon Sep 17 00:00:00 2001
From: Jong Wook Kim <jongwook@openai.com>
Date: Mon, 30 Sep 2024 10:33:56 -0700
Subject: [PATCH 05/18] test on python/pytorch versions up to 3.12 and 2.4.1
 (#2360)

---
 .github/workflows/test.yml | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 1eaf505..a1cc48d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -41,11 +41,19 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8', '3.9', '3.10', '3.11']
-        pytorch-version: [1.13.1, 2.0.0]
-        exclude:
-          - python-version: '3.11'
+        include:
+          - python-version: '3.8'
             pytorch-version: 1.13.1
+          - python-version: '3.8'
+            pytorch-version: 2.0.1
+          - python-version: '3.9'
+            pytorch-version: 2.1.2
+          - python-version: '3.10'
+            pytorch-version: 2.2.2
+          - python-version: '3.11'
+            pytorch-version: 2.3.1
+          - python-version: '3.12'
+            pytorch-version: 2.4.1
     steps:
       - uses: conda-incubator/setup-miniconda@v2
       - run: conda install -n test ffmpeg python=${{ matrix.python-version }}

From 25e5c364e0a21ddefee46adb674c591f1ba610ba Mon Sep 17 00:00:00 2001
From: Jong Wook Kim <jongwook@openai.com>
Date: Mon, 30 Sep 2024 10:59:51 -0700
Subject: [PATCH 06/18] large-v3-turbo model (#2361)

---
 README.md             | 20 ++++++++++++--------
 model-card.md         |  4 +++-
 whisper/__init__.py   |  4 ++++
 whisper/transcribe.py |  2 +-
 4 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index afca9c9..910b7db 100644
--- a/README.md
+++ b/README.md
@@ -57,17 +57,21 @@ pip install setuptools-rust
 
 ## Available models and languages
 
-There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. Below are the names of the available models and their approximate memory requirements and inference speed relative to the large model; actual speed may vary depending on many factors including the available hardware.
+There are six model sizes, four with English-only versions, offering speed and accuracy tradeoffs.
+Below are the names of the available models and their approximate memory requirements and inference speed relative to the large model.
+The relative speeds below are measured by transcribing English speech on an A100, and the real-world speed may vary significantly depending on many factors including the language, the speaking speed, and the available hardware.
 
 |  Size  | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed |
 |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:|
-|  tiny  |    39 M    |     `tiny.en`      |       `tiny`       |     ~1 GB     |      ~32x      |
-|  base  |    74 M    |     `base.en`      |       `base`       |     ~1 GB     |      ~16x      |
-| small  |   244 M    |     `small.en`     |      `small`       |     ~2 GB     |      ~6x       |
+|  tiny  |    39 M    |     `tiny.en`      |       `tiny`       |     ~1 GB     |      ~10x      |
+|  base  |    74 M    |     `base.en`      |       `base`       |     ~1 GB     |      ~7x       |
+| small  |   244 M    |     `small.en`     |      `small`       |     ~2 GB     |      ~4x       |
 | medium |   769 M    |    `medium.en`     |      `medium`      |     ~5 GB     |      ~2x       |
 | large  |   1550 M   |        N/A         |      `large`       |    ~10 GB     |       1x       |
+| turbo  |   809 M    |        N/A         |      `turbo`       |     ~6 GB     |      ~8x       |
 
 The `.en` models for English-only applications tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.
+Additionally, the `turbo` model is an optimized version of `large-v3` that offers faster transcription speed with a minimal degradation in accuracy.
 
 Whisper's performance varies widely depending on the language. The figure below shows a performance breakdown of `large-v3` and `large-v2` models by language, using WERs (word error rates) or CER (character error rates, shown in *Italic*) evaluated on the Common Voice 15 and Fleurs datasets. Additional WER/CER metrics corresponding to the other models and datasets can be found in Appendix D.1, D.2, and D.4 of [the paper](https://arxiv.org/abs/2212.04356), as well as the BLEU (Bilingual Evaluation Understudy) scores for translation in Appendix D.3.
 
@@ -77,9 +81,9 @@ Whisper's performance varies widely depending on the language. The figure below
 
 ## Command-line usage
 
-The following command will transcribe speech in audio files, using the `medium` model:
+The following command will transcribe speech in audio files, using the `turbo` model:
 
-    whisper audio.flac audio.mp3 audio.wav --model medium
+    whisper audio.flac audio.mp3 audio.wav --model turbo
 
 The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
 
@@ -103,7 +107,7 @@ Transcription can also be performed within Python:
 ```python
 import whisper
 
-model = whisper.load_model("base")
+model = whisper.load_model("turbo")
 result = model.transcribe("audio.mp3")
 print(result["text"])
 ```
@@ -115,7 +119,7 @@ Below is an example usage of `whisper.detect_language()` and `whisper.decode()`
 ```python
 import whisper
 
-model = whisper.load_model("base")
+model = whisper.load_model("turbo")
 
 # load audio and pad/trim it to fit 30 seconds
 audio = whisper.load_audio("audio.mp3")
diff --git a/model-card.md b/model-card.md
index 3c041a1..291bc4b 100644
--- a/model-card.md
+++ b/model-card.md
@@ -16,13 +16,15 @@ The Whisper models are trained for speech recognition and translation tasks, cap
 | small  |   244 M    |         ✓          |         ✓          |
 | medium |   769 M    |         ✓          |         ✓          |
 | large  |   1550 M   |                    |         ✓          |
+| turbo  |   809 M    |                    |         ✓          |
 
 In December 2022, we [released an improved large model named `large-v2`](https://github.com/openai/whisper/discussions/661), and `large-v3` in November 2023.
+Additionally, in September 2024 we added a `turbo` model, which is optimized for inference speed.
 
 
 ### Release date
 
-September 2022 (original series), December 2022 (`large-v2`), and November 2023 (`large-v3`)
+September 2022 (original series), December 2022 (`large-v2`), November 2023 (`large-v3`), and September 2024 (`large-v3-turbo`)
 
 ### Model type
 
diff --git a/whisper/__init__.py b/whisper/__init__.py
index d7fbba3..e210718 100644
--- a/whisper/__init__.py
+++ b/whisper/__init__.py
@@ -27,6 +27,8 @@ _MODELS = {
     "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
     "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
     "large": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
+    "large-v3-turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt",
+    "turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt",
 }
 
 # base85-encoded (n_layers, n_heads) boolean arrays indicating the cross-attention heads that are
@@ -44,6 +46,8 @@ _ALIGNMENT_HEADS = {
     "large-v2": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj",
     "large-v3": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
     "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
+    "large-v3-turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`",
+    "turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`",
 }
 
 
diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index 1c075a2..8e1240b 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -511,7 +511,7 @@ def cli():
     # fmt: off
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
-    parser.add_argument("--model", default="small", type=valid_model_name, help="name of the Whisper model to use")
+    parser.add_argument("--model", default="turbo", type=valid_model_name, help="name of the Whisper model to use")
     parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
     parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
     parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")

From 260bbcfcb3cd17a6952f1a51d516e4b2f0e2559a Mon Sep 17 00:00:00 2001
From: Jong Wook Kim <jongwook@openai.com>
Date: Mon, 30 Sep 2024 11:18:17 -0700
Subject: [PATCH 07/18] allowing numpy 2 in tests (#2362)

* allowing numpy 2 in tests

* allowing numpy 2 in tests
---
 .github/workflows/test.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a1cc48d..88131f5 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -44,20 +44,26 @@ jobs:
         include:
           - python-version: '3.8'
             pytorch-version: 1.13.1
+            numpy-requirement: "'numpy<2'"
           - python-version: '3.8'
             pytorch-version: 2.0.1
+            numpy-requirement: "'numpy<2'"
           - python-version: '3.9'
             pytorch-version: 2.1.2
+            numpy-requirement: "'numpy<2'"
           - python-version: '3.10'
             pytorch-version: 2.2.2
+            numpy-requirement: "'numpy<2'"
           - python-version: '3.11'
             pytorch-version: 2.3.1
+            numpy-requirement: "'numpy'"
           - python-version: '3.12'
             pytorch-version: 2.4.1
+            numpy-requirement: "'numpy'"
     steps:
       - uses: conda-incubator/setup-miniconda@v2
       - run: conda install -n test ffmpeg python=${{ matrix.python-version }}
       - uses: actions/checkout@v3
       - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH
-      - run: pip3 install .["dev"] 'numpy<2' torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple
+      - run: pip3 install .["dev"] ${{ matrix.numpy-requirement }} torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple
       - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda'

From 25639fc17ddc013d56c594bfbf7644f2185fad84 Mon Sep 17 00:00:00 2001
From: Jong Wook Kim <jongwook@nyu.edu>
Date: Mon, 30 Sep 2024 11:20:53 -0700
Subject: [PATCH 08/18] Release 20240930

---
 CHANGELOG.md       | 7 +++++++
 whisper/version.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3f09538..7152899 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # CHANGELOG
 
+## [v20240930](https://github.com/openai/whisper/releases/tag/v20240930)
+
+* allowing numpy 2 in tests ([#2362](https://github.com/openai/whisper/pull/2362))
+* large-v3-turbo model ([#2361](https://github.com/openai/whisper/pull/2361))
+* test on python/pytorch versions up to 3.12 and 2.4.1 ([#2360](https://github.com/openai/whisper/pull/2360))
+* using sdpa if available ([#2359](https://github.com/openai/whisper/pull/2359))
+
 ## [v20240927](https://github.com/openai/whisper/releases/tag/v20240927)
 
 * pinning numpy<2 in tests ([#2332](https://github.com/openai/whisper/pull/2332))
diff --git a/whisper/version.py b/whisper/version.py
index 2242d25..b4b3350 100644
--- a/whisper/version.py
+++ b/whisper/version.py
@@ -1 +1 @@
-__version__ = "20240927"
+__version__ = "20240930"

From cdb81479623391f0651f4f9175ad986e85777f31 Mon Sep 17 00:00:00 2001
From: Jong Wook Kim <jongwook@openai.com>
Date: Fri, 25 Oct 2024 17:30:02 -0700
Subject: [PATCH 09/18] more pytorch versions in tests (#2408)

---
 .github/workflows/test.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 88131f5..84b81cc 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -42,6 +42,9 @@ jobs:
     strategy:
       matrix:
         include:
+          - python-version: '3.8'
+            pytorch-version: 1.10.1
+            numpy-requirement: "'numpy<2'"
           - python-version: '3.8'
             pytorch-version: 1.13.1
             numpy-requirement: "'numpy<2'"
@@ -60,6 +63,9 @@ jobs:
           - python-version: '3.12'
             pytorch-version: 2.4.1
             numpy-requirement: "'numpy'"
+          - python-version: '3.12'
+            pytorch-version: 2.5.0
+            numpy-requirement: "'numpy'"
     steps:
       - uses: conda-incubator/setup-miniconda@v2
       - run: conda install -n test ffmpeg python=${{ matrix.python-version }}

From 5979f03701209bb035a0a466f14131aeb1116cbb Mon Sep 17 00:00:00 2001
From: kittsil <ddt07a@acu.edu>
Date: Sat, 26 Oct 2024 09:17:31 -0500
Subject: [PATCH 10/18] Add option to carry initial_prompt with the sliding
 window (#2343)

* Add option to carry initial_prompt with the sliding window

Add an option `carry_initial_prompt = False` to `whisper.transcribe()`.
When set to `True`, `initial_prompt` is prepended to each internal `decode()` call's `prompt`.
If there is not enough context space at the start of the prompt, the prompt is left-sliced to make space.

* Prevent redundant initial_prompt_tokens

* Revert unnecessary .gitignore change

---------

Co-authored-by: Kittsil <kittsil@gmail.com>
Co-authored-by: Jong Wook Kim <jongwook@openai.com>
---
 whisper/transcribe.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index 8e1240b..8eb6a71 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -46,6 +46,7 @@ def transcribe(
     no_speech_threshold: Optional[float] = 0.6,
     condition_on_previous_text: bool = True,
     initial_prompt: Optional[str] = None,
+    carry_initial_prompt: bool = False,
     word_timestamps: bool = False,
     prepend_punctuations: str = "\"'“¿([{-",
     append_punctuations: str = "\"'.。,，!！?？:：”)]}、",
@@ -102,6 +103,11 @@ def transcribe(
         "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
         to make it more likely to predict those word correctly.
 
+    carry_initial_prompt: bool
+        If carry_initial_prompt is True, `initial_prompt` is prepended to the prompt of each internal
+        `decode()` call. If there is not enough context space at the start of the prompt, it is
+        left-sliced to make space.
+
     decode_options: dict
         Keyword arguments to construct `DecodingOptions` instances
 
@@ -227,9 +233,11 @@ def transcribe(
     all_segments = []
     prompt_reset_since = 0
 
+    remaining_prompt_length = model.dims.n_text_ctx // 2 - 1
     if initial_prompt is not None:
         initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
         all_tokens.extend(initial_prompt_tokens)
+        remaining_prompt_length -= len(initial_prompt_tokens)
     else:
         initial_prompt_tokens = []
 
@@ -275,7 +283,13 @@ def transcribe(
             segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
             mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)
 
-            decode_options["prompt"] = all_tokens[prompt_reset_since:]
+            if carry_initial_prompt:
+                nignored = max(len(initial_prompt_tokens), prompt_reset_since)
+                remaining_prompt = all_tokens[nignored:][-remaining_prompt_length:]
+                decode_options["prompt"] = initial_prompt_tokens + remaining_prompt
+            else:
+                decode_options["prompt"] = all_tokens[prompt_reset_since:]
+
             result: DecodingResult = decode_with_fallback(mel_segment)
             tokens = torch.tensor(result.tokens)
 
@@ -529,6 +543,8 @@ def cli():
 
     parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
     parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
+    parser.add_argument("--carry_initial_prompt", type=str2bool, default=False, help="if True, prepend initial_prompt to every internal decode() call. May reduce the effectiveness of condition_on_previous_text")
+
     parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
     parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
 

From 271445b2f24f00f8175c4fb7ae91876f7451dfc1 Mon Sep 17 00:00:00 2001
From: BotMaster3000 <jdgebauer@gmx.de>
Date: Mon, 4 Nov 2024 08:00:30 +0100
Subject: [PATCH 11/18] Update README.md (#2379)

Default now uses Turbo instead of Small
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 910b7db..1a661d7 100644
--- a/README.md
+++ b/README.md
@@ -85,7 +85,7 @@ The following command will transcribe speech in audio files, using the `turbo` m
 
     whisper audio.flac audio.mp3 audio.wav --model turbo
 
-The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
+The default setting (which selects the `turbo` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
 
     whisper japanese.wav --language Japanese
 

From 173ff7dd1d9fb1c4fddea0d41d704cfefeb8908c Mon Sep 17 00:00:00 2001
From: f1sh <71207078+YuZekai@users.noreply.github.com>
Date: Wed, 13 Nov 2024 08:35:54 +0800
Subject: [PATCH 12/18] fix typo data/README.md (#2433)

---
 data/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/data/README.md b/data/README.md
index 3b4aea1..fcb3200 100644
--- a/data/README.md
+++ b/data/README.md
@@ -45,7 +45,7 @@ We downloaded the [CHiME-5 dataset](https://spandh.dcs.shef.ac.uk//chime_challen
 
 ### AMI-IHM, AMI-SDM1
 
-We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 ad 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
+We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following stages 0 and 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b).
 
 
 ## Long-form English-only datasets

From fc5ded7d9045c693692f13853857c3f8baea3a7b Mon Sep 17 00:00:00 2001
From: Lowell Vaughn <lowell@vaughnresearch.com>
Date: Tue, 26 Nov 2024 09:37:01 -0800
Subject: [PATCH 13/18] Updating README and doc strings to reflect that n_mels
 can now be 128 (#2049)

---
 README.md        | 2 +-
 whisper/audio.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 1a661d7..696869c 100644
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ audio = whisper.load_audio("audio.mp3")
 audio = whisper.pad_or_trim(audio)
 
 # make log-Mel spectrogram and move to the same device as the model
-mel = whisper.log_mel_spectrogram(audio).to(model.device)
+mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
 
 # detect the spoken language
 _, probs = model.detect_language(mel)
diff --git a/whisper/audio.py b/whisper/audio.py
index cf6c66a..826250f 100644
--- a/whisper/audio.py
+++ b/whisper/audio.py
@@ -122,7 +122,7 @@ def log_mel_spectrogram(
         The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
 
     n_mels: int
-        The number of Mel-frequency filters, only 80 is supported
+        The number of Mel-frequency filters, only 80 and 128 are supported
 
     padding: int
         Number of zero samples to pad to the right
@@ -132,7 +132,7 @@ def log_mel_spectrogram(
 
     Returns
     -------
-    torch.Tensor, shape = (80, n_frames)
+    torch.Tensor, shape = (n_mels, n_frames)
         A Tensor that contains the Mel spectrogram
     """
     if not torch.is_tensor(audio):

From 90db0de1896c23cbfaf0c58bc2d30665f709f170 Mon Sep 17 00:00:00 2001
From: Purfview <69023953+Purfview@users.noreply.github.com>
Date: Sun, 1 Dec 2024 05:47:01 +0000
Subject: [PATCH 14/18] Bugfix: Illogical "Avoid computing higher temperatures
 on no_speech" (#1903)

* Bugfix: Illogical "Avoid computing higher temperatures on no_speech"

Bugfix for https://github.com/openai/whisper/pull/1279

Currently a segment is treated as "silence" even when decoding has failed due to `compression_ratio_threshold`, although further down in the code that case is no longer considered "silence".

A segment should be treated as "silence" only when decoding has failed due to `logprob_threshold`.

Like described there:
https://github.com/openai/whisper/blob/8bc8860694949db53c42ba47ddc23786c2e02a8b/whisper/transcribe.py#L421

And in code there:
https://github.com/openai/whisper/blob/8bc8860694949db53c42ba47ddc23786c2e02a8b/whisper/transcribe.py#L243-L251

* Fix if "logprob_threshold=None"

---------

Co-authored-by: Jong Wook Kim <jongwook@openai.com>
---
 whisper/transcribe.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index 8eb6a71..0a4cc36 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -214,6 +214,8 @@ def transcribe(
             if (
                 no_speech_threshold is not None
                 and decode_result.no_speech_prob > no_speech_threshold
+                and logprob_threshold is not None
+                and decode_result.avg_logprob < logprob_threshold
             ):
                 needs_fallback = False  # silence
             if not needs_fallback:

From 6c1d8f1ea10b85ec0a0ed584edb5ad9c8efc3195 Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Sat, 4 Jan 2025 09:47:12 +0100
Subject: [PATCH 15/18] Upgrade GitHub Actions (#2430)

---
 .github/workflows/test.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 84b81cc..106c66b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -11,10 +11,10 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Fetch base branch
         run: git fetch origin ${{ github.base_ref }}
-      - uses: actions/setup-python@v4
+      - uses: actions/setup-python@v5
         with:
           python-version: "3.8"
           architecture: x64
@@ -23,7 +23,7 @@ jobs:
         run: |
           echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
       - name: pip/pre-commit cache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: |
             ${{ steps.pip-cache.outputs.dir }}
@@ -67,9 +67,9 @@ jobs:
             pytorch-version: 2.5.0
             numpy-requirement: "'numpy'"
     steps:
-      - uses: conda-incubator/setup-miniconda@v2
+      - uses: conda-incubator/setup-miniconda@v3
       - run: conda install -n test ffmpeg python=${{ matrix.python-version }}
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH
       - run: pip3 install .["dev"] ${{ matrix.numpy-requirement }} torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple
       - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda'

From 26a7cacc83c2cfbbf743022da8331b29702ceedc Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Sat, 4 Jan 2025 10:02:18 +0100
Subject: [PATCH 16/18] pre-commit autoupdate && pre-commit run --all-files
 (#2484)

* pre-commit autoupdate && pre-commit run --all-files

* Black formatter needs a current version of Python
---
 .github/workflows/test.yml   |  4 ++--
 .pre-commit-config.yaml      |  8 ++++----
 whisper/normalizers/basic.py | 22 +++++++++++++---------
 whisper/utils.py             |  8 +++++---
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 106c66b..16c7ff7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,7 +16,7 @@ jobs:
         run: git fetch origin ${{ github.base_ref }}
       - uses: actions/setup-python@v5
         with:
-          python-version: "3.8"
+          python-version: "3.9"
           architecture: x64
       - name: Get pip cache dir
         id: pip-cache
@@ -33,7 +33,7 @@ jobs:
             ${{ runner.os }}-pip-pre-commit
       - name: pre-commit
         run: |
-          pip install -U pre-commit
+          pip install --upgrade pre-commit
           pre-commit install --install-hooks
           pre-commit run --all-files
   whisper-test:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3f5a74b..48df249 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.0.1
+    rev: v5.0.0
     hooks:
       - id: check-json
       - id: end-of-file-fixer
@@ -11,17 +11,17 @@ repos:
       - id: check-added-large-files
         args: [--maxkb=4096]
   - repo: https://github.com/psf/black
-    rev: 23.7.0
+    rev: 24.10.0
     hooks:
       - id: black
   - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
+    rev: 5.13.2
     hooks:
       - id: isort
         name: isort (python)
         args: ["--profile", "black", "-l", "88", "--trailing-comma", "--multi-line", "3"]
   - repo: https://github.com/pycqa/flake8.git
-    rev: 6.0.0
+    rev: 7.1.1
     hooks:
       - id: flake8
         types: [python]
diff --git a/whisper/normalizers/basic.py b/whisper/normalizers/basic.py
index a824032..8690ae7 100644
--- a/whisper/normalizers/basic.py
+++ b/whisper/normalizers/basic.py
@@ -30,15 +30,19 @@ def remove_symbols_and_diacritics(s: str, keep=""):
     and drop any diacritics (category 'Mn' and some manual mappings)
     """
     return "".join(
-        c
-        if c in keep
-        else ADDITIONAL_DIACRITICS[c]
-        if c in ADDITIONAL_DIACRITICS
-        else ""
-        if unicodedata.category(c) == "Mn"
-        else " "
-        if unicodedata.category(c)[0] in "MSP"
-        else c
+        (
+            c
+            if c in keep
+            else (
+                ADDITIONAL_DIACRITICS[c]
+                if c in ADDITIONAL_DIACRITICS
+                else (
+                    ""
+                    if unicodedata.category(c) == "Mn"
+                    else " " if unicodedata.category(c)[0] in "MSP" else c
+                )
+            )
+        )
         for c in unicodedata.normalize("NFKD", s)
     )
 
diff --git a/whisper/utils.py b/whisper/utils.py
index 9b9b138..13792f7 100644
--- a/whisper/utils.py
+++ b/whisper/utils.py
@@ -209,9 +209,11 @@ class SubtitlesWriter(ResultWriter):
 
                         yield start, end, "".join(
                             [
-                                re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
-                                if j == i
-                                else word
+                                (
+                                    re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
+                                    if j == i
+                                    else word
+                                )
                                 for j, word in enumerate(all_words)
                             ]
                         )

From dd4d010d2c585bc70aeddd166cd3e26b0bb62f31 Mon Sep 17 00:00:00 2001
From: Christian Clauss <cclauss@me.com>
Date: Sat, 4 Jan 2025 10:38:35 +0100
Subject: [PATCH 17/18] PEP 621: Migrate from setup.py to pyproject.toml
 (#2435)

---
 pyproject.toml | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 setup.py       | 42 ------------------------------------------
 2 files changed, 47 insertions(+), 43 deletions(-)
 delete mode 100644 setup.py

diff --git a/pyproject.toml b/pyproject.toml
index 84637eb..21b90e7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,50 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+
+requires = [ "setuptools>=61.2" ]
+
+[project]
+name = "openai-whisper"
+description = "Robust Speech Recognition via Large-Scale Weak Supervision"
+readme.content-type = "text/markdown"
+readme.file = "README.md"
+license = { text = "MIT" }
+authors = [ { name = "OpenAI" } ]
+requires-python = ">=3.8"
+classifiers = [
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+]
+dynamic = [ "version" ]
+dependencies = [
+  "more-itertools",
+  "numba",
+  "numpy",
+  "tiktoken",
+  "torch",
+  "tqdm",
+  "triton>=2; (platform_machine=='x86_64' and sys_platform=='linux') or sys_platform=='linux2'",
+]
+optional-dependencies.dev = [ "black", "flake8", "isort", "pytest", "scipy" ]
+urls = { Homepage = "https://github.com/openai/whisper" }
+scripts.whisper = "whisper.transcribe:cli"
+
+[tool.setuptools]
+py-modules = [ "whisper" ]
+include-package-data = true
+
+[tool.setuptools.dynamic]
+version = { attr = "whisper.version.__version__" }
+
+[tool.setuptools.packages.find]
+exclude = [ "tests*" ]
+namespaces = false
+
 [tool.black]
 
 [tool.isort]
@@ -5,4 +52,3 @@ profile = "black"
 include_trailing_comma = true
 line_length = 88
 multi_line_output = 3
-
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 73c4eb8..0000000
--- a/setup.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import platform
-import sys
-from pathlib import Path
-
-import pkg_resources
-from setuptools import find_packages, setup
-
-
-def read_version(fname="whisper/version.py"):
-    exec(compile(open(fname, encoding="utf-8").read(), fname, "exec"))
-    return locals()["__version__"]
-
-
-requirements = []
-if sys.platform.startswith("linux") and platform.machine() == "x86_64":
-    requirements.append("triton>=2.0.0")
-
-setup(
-    name="openai-whisper",
-    py_modules=["whisper"],
-    version=read_version(),
-    description="Robust Speech Recognition via Large-Scale Weak Supervision",
-    long_description=open("README.md", encoding="utf-8").read(),
-    long_description_content_type="text/markdown",
-    readme="README.md",
-    python_requires=">=3.8",
-    author="OpenAI",
-    url="https://github.com/openai/whisper",
-    license="MIT",
-    packages=find_packages(exclude=["tests*"]),
-    install_requires=[
-        str(r)
-        for r in pkg_resources.parse_requirements(
-            Path(__file__).with_name("requirements.txt").open()
-        )
-    ],
-    entry_points={
-        "console_scripts": ["whisper=whisper.transcribe:cli"],
-    },
-    include_package_data=True,
-    extras_require={"dev": ["pytest", "scipy", "black", "flake8", "isort"]},
-)

From 517a43ecd132a2089d85f4ebc044728a71d49f6e Mon Sep 17 00:00:00 2001
From: Jong Wook Kim <jongwook@openai.com>
Date: Sat, 4 Jan 2025 12:56:16 -0800
Subject: [PATCH 18/18] Update python-publish.yml

using `-m build --sdist` instead of `setup.py sdist`
---
 .github/workflows/python-publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 4b91a2a..c868068 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -33,5 +33,5 @@ jobs:
         TWINE_USERNAME: __token__
         TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
       run: |
-        python setup.py sdist
+        python -m build --sdist
         twine upload dist/*