From 32d55d5d76c9ecbe2dfa3e6735896c648156ab63 Mon Sep 17 00:00:00 2001 From: Jianan Xing <1633398+xingjianan@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:53:08 -0700 Subject: [PATCH 01/18] Relax triton requirements for compatibility with pytorch 2.4 and newer (#2307) * Relax triton requirements for compatibility with pytorch 2.4 and newer Similar to https://github.com/openai/whisper/pull/1802, but now when pytorch upgrades to 2.4, it requires triton==3.0.0. I am not sure if it makes sense to remove the upper bound version constraints * Update requirements.txt --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 62f5f9d..8ee5920 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,4 @@ torch tqdm more-itertools tiktoken -triton>=2.0.0,<3;platform_machine=="x86_64" and sys_platform=="linux" or sys_platform=="linux2" +triton>=2.0.0;platform_machine=="x86_64" and sys_platform=="linux" or sys_platform=="linux2" diff --git a/setup.py b/setup.py index 183b527..73c4eb8 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def read_version(fname="whisper/version.py"): requirements = [] if sys.platform.startswith("linux") and platform.machine() == "x86_64": - requirements.append("triton>=2.0.0,<3") + requirements.append("triton>=2.0.0") setup( name="openai-whisper", From 279133e3107392276dc509148da1f41bfb532c7e Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Tue, 10 Sep 2024 10:43:21 -0700 Subject: [PATCH 02/18] pinning numpy<2 in tests (#2332) * pinning numpy<2 in tests * pip install together * pip install together --- .github/workflows/test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index dffc17c..1eaf505 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -49,8 +49,7 @@ jobs: steps: - uses: conda-incubator/setup-miniconda@v2 - run: conda install -n test ffmpeg python=${{ matrix.python-version }} - - run: pip3 install torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu - uses: actions/checkout@v3 - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH - - run: pip install .["dev"] + - run: pip3 install .["dev"] 'numpy<2' torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda' From 423492dda7806206abe56bdfe427c1096473a020 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Fri, 27 Sep 2024 16:43:58 -0700 Subject: [PATCH 03/18] Release 20240927 --- CHANGELOG.md | 7 +++++++ whisper/version.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5895541..3f09538 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # CHANGELOG +## [v20240927](https://github.com/openai/whisper/releases/tag/v20240927) + +* pinning numpy<2 in tests ([#2332](https://github.com/openai/whisper/pull/2332)) +* Relax triton requirements for compatibility with pytorch 2.4 and newer ([#2307](https://github.com/openai/whisper/pull/2307)) +* Skip silence around hallucinations ([#1838](https://github.com/openai/whisper/pull/1838)) +* Fix triton env marker ([#1887](https://github.com/openai/whisper/pull/1887)) + ## [v20231117](https://github.com/openai/whisper/releases/tag/v20231117) * Relax triton requirements for compatibility with pytorch 2.1 and newer ([#1802](https://github.com/openai/whisper/pull/1802)) diff --git a/whisper/version.py b/whisper/version.py index c96dd9c..2242d25 100644 --- a/whisper/version.py +++ b/whisper/version.py @@ -1 +1 @@ -__version__ = "20231117" +__version__ = "20240927" From 27f971320a50e65fd510b88be04219a6ade31f9b Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 30 Sep 2024 10:27:14 -0700 Subject: [PATCH 04/18] using sdpa if available (#2359) * using sdpa if available * Update model.py --- whisper/model.py | 51 +++++++++++++++++++++++++++++++++++++---------- whisper/timing.py | 4 +++- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/whisper/model.py b/whisper/model.py index a678283..e537447 100644 --- a/whisper/model.py +++ b/whisper/model.py @@ -1,7 +1,8 @@ import base64 import gzip +from contextlib import contextmanager from dataclasses import dataclass -from typing import Dict, Iterable, Optional +from typing import Dict, Iterable, Optional, Tuple import numpy as np import torch @@ -12,6 +13,14 @@ from .decoding import decode as decode_function from .decoding import detect_language as detect_language_function from .transcribe import transcribe as transcribe_function +try: + from torch.nn.functional import scaled_dot_product_attention + + SDPA_AVAILABLE = True +except (ImportError, RuntimeError, OSError): + scaled_dot_product_attention = None + SDPA_AVAILABLE = False + @dataclass class ModelDimensions: @@ -59,7 +68,19 @@ def sinusoids(length, channels, max_timescale=10000): return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1) +@contextmanager +def disable_sdpa(): + prev_state = MultiHeadAttention.use_sdpa + try: + MultiHeadAttention.use_sdpa = False + yield + finally: + MultiHeadAttention.use_sdpa = prev_state + + class MultiHeadAttention(nn.Module): + use_sdpa = True + def __init__(self, n_state: int, n_head: int): super().__init__() self.n_head = n_head @@ -92,20 +113,30 @@ class MultiHeadAttention(nn.Module): def qkv_attention( self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None - ): + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: n_batch, n_ctx, n_state = q.shape scale = (n_state // self.n_head) ** -0.25 - q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale - k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale + q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) - qk = q @ k - if mask is not None: - qk = qk + mask[:n_ctx, :n_ctx] - qk = qk.float() + if SDPA_AVAILABLE and MultiHeadAttention.use_sdpa: + a = scaled_dot_product_attention( + q, k, v, is_causal=mask is not None and n_ctx > 1 + ) + out = a.permute(0, 2, 1, 3).flatten(start_dim=2) + qk = None + else: + qk = (q * scale) @ (k * scale).transpose(-1, -2) + if mask is not None: + qk = qk + mask[:n_ctx, :n_ctx] + qk = qk.float() - w = F.softmax(qk, dim=-1).to(q.dtype) - return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach() + w = F.softmax(qk, dim=-1).to(q.dtype) + out = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) + qk = qk.detach() + + return out, qk class ResidualAttentionBlock(nn.Module): diff --git a/whisper/timing.py b/whisper/timing.py index b695ead..e563414 100644 --- a/whisper/timing.py +++ b/whisper/timing.py @@ -191,7 +191,9 @@ def find_alignment( for i, block in enumerate(model.decoder.blocks) ] - with torch.no_grad(): + from .model import disable_sdpa + + with torch.no_grad(), disable_sdpa(): logits = model(mel.unsqueeze(0), tokens.unsqueeze(0))[0] sampled_logits = logits[len(tokenizer.sot_sequence) :, : tokenizer.eot] token_probs = sampled_logits.softmax(dim=-1) From b66b46f32dd3934edd3e79b2821357f52d388501 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 30 Sep 2024 10:33:56 -0700 Subject: [PATCH 05/18] test on python/pytorch versions up to 3.12 and 2.4.1 (#2360) --- .github/workflows/test.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1eaf505..a1cc48d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,11 +41,19 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: [1.13.1, 2.0.0] - exclude: - - python-version: '3.11' + include: + - python-version: '3.8' pytorch-version: 1.13.1 + - python-version: '3.8' + pytorch-version: 2.0.1 + - python-version: '3.9' + pytorch-version: 2.1.2 + - python-version: '3.10' + pytorch-version: 2.2.2 + - python-version: '3.11' + pytorch-version: 2.3.1 + - python-version: '3.12' + pytorch-version: 2.4.1 steps: - uses: conda-incubator/setup-miniconda@v2 - run: conda install -n test ffmpeg python=${{ matrix.python-version }} From 25e5c364e0a21ddefee46adb674c591f1ba610ba Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 30 Sep 2024 10:59:51 -0700 Subject: [PATCH 06/18] large-v3-turbo model (#2361) --- README.md | 20 ++++++++++++-------- model-card.md | 4 +++- whisper/__init__.py | 4 ++++ whisper/transcribe.py | 2 +- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index afca9c9..910b7db 100644 --- a/README.md +++ b/README.md @@ -57,17 +57,21 @@ pip install setuptools-rust ## Available models and languages -There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. Below are the names of the available models and their approximate memory requirements and inference speed relative to the large model; actual speed may vary depending on many factors including the available hardware. +There are six model sizes, four with English-only versions, offering speed and accuracy tradeoffs. +Below are the names of the available models and their approximate memory requirements and inference speed relative to the large model. +The relative speeds below are measured by transcribing English speech on a A100, and the real-world speed may vary significantly depending on many factors including the language, the speaking speed, and the available hardware. | Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed | |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:| -| tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x | -| base | 74 M | `base.en` | `base` | ~1 GB | ~16x | -| small | 244 M | `small.en` | `small` | ~2 GB | ~6x | +| tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~10x | +| base | 74 M | `base.en` | `base` | ~1 GB | ~7x | +| small | 244 M | `small.en` | `small` | ~2 GB | ~4x | | medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x | | large | 1550 M | N/A | `large` | ~10 GB | 1x | +| turbo | 809 M | N/A | `turbo` | ~6 GB | ~8x | The `.en` models for English-only applications tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models. +Additionally, the `turbo` model is an optimized version of `large-v3` that offers faster transcription speed with a minimal degradation in accuracy. Whisper's performance varies widely depending on the language. The figure below shows a performance breakdown of `large-v3` and `large-v2` models by language, using WERs (word error rates) or CER (character error rates, shown in *Italic*) evaluated on the Common Voice 15 and Fleurs datasets. Additional WER/CER metrics corresponding to the other models and datasets can be found in Appendix D.1, D.2, and D.4 of [the paper](https://arxiv.org/abs/2212.04356), as well as the BLEU (Bilingual Evaluation Understudy) scores for translation in Appendix D.3. @@ -77,9 +81,9 @@ Whisper's performance varies widely depending on the language. The figure below ## Command-line usage -The following command will transcribe speech in audio files, using the `medium` model: +The following command will transcribe speech in audio files, using the `turbo` model: - whisper audio.flac audio.mp3 audio.wav --model medium + whisper audio.flac audio.mp3 audio.wav --model turbo The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option: @@ -103,7 +107,7 @@ Transcription can also be performed within Python: ```python import whisper -model = whisper.load_model("base") +model = whisper.load_model("turbo") result = model.transcribe("audio.mp3") print(result["text"]) ``` @@ -115,7 +119,7 @@ Below is an example usage of `whisper.detect_language()` and `whisper.decode()` ```python import whisper -model = whisper.load_model("base") +model = whisper.load_model("turbo") # load audio and pad/trim it to fit 30 seconds audio = whisper.load_audio("audio.mp3") diff --git a/model-card.md b/model-card.md index 3c041a1..291bc4b 100644 --- a/model-card.md +++ b/model-card.md @@ -16,13 +16,15 @@ The Whisper models are trained for speech recognition and translation tasks, cap | small | 244 M | ✓ | ✓ | | medium | 769 M | ✓ | ✓ | | large | 1550 M | | ✓ | +| turbo | 798 M | | ✓ | In December 2022, we [released an improved large model named `large-v2`](https://github.com/openai/whisper/discussions/661), and `large-v3` in November 2023. +Additionally, we've added a `turbo` model in September 2024 which is optimized for inference speed. ### Release date -September 2022 (original series), December 2022 (`large-v2`), and November 2023 (`large-v3`) +September 2022 (original series), December 2022 (`large-v2`), November 2023 (`large-v3`), September 2024 (`large-v3-turbo`) ### Model type diff --git a/whisper/__init__.py b/whisper/__init__.py index d7fbba3..e210718 100644 --- a/whisper/__init__.py +++ b/whisper/__init__.py @@ -27,6 +27,8 @@ _MODELS = { "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt", "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt", "large": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt", + "large-v3-turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt", + "turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt", } # base85-encoded (n_layers, n_heads) boolean arrays indicating the cross-attention heads that are @@ -44,6 +46,8 @@ _ALIGNMENT_HEADS = { "large-v2": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj", "large-v3": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00", "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00", + "large-v3-turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`", + "turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`", } diff --git a/whisper/transcribe.py b/whisper/transcribe.py index 1c075a2..8e1240b 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -511,7 +511,7 @@ def cli(): # fmt: off parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe") - parser.add_argument("--model", default="small", type=valid_model_name, help="name of the Whisper model to use") + parser.add_argument("--model", default="turbo", type=valid_model_name, help="name of the Whisper model to use") parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default") parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference") parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs") From 260bbcfcb3cd17a6952f1a51d516e4b2f0e2559a Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 30 Sep 2024 11:18:17 -0700 Subject: [PATCH 07/18] allowing numpy 2 in tests (#2362) * allowing numpy 2 in tests * allowing numpy 2 in tests --- .github/workflows/test.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a1cc48d..88131f5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -44,20 +44,26 @@ jobs: include: - python-version: '3.8' pytorch-version: 1.13.1 + numpy-requirement: "'numpy<2'" - python-version: '3.8' pytorch-version: 2.0.1 + numpy-requirement: "'numpy<2'" - python-version: '3.9' pytorch-version: 2.1.2 + numpy-requirement: "'numpy<2'" - python-version: '3.10' pytorch-version: 2.2.2 + numpy-requirement: "'numpy<2'" - python-version: '3.11' pytorch-version: 2.3.1 + numpy-requirement: "'numpy'" - python-version: '3.12' pytorch-version: 2.4.1 + numpy-requirement: "'numpy'" steps: - uses: conda-incubator/setup-miniconda@v2 - run: conda install -n test ffmpeg python=${{ matrix.python-version }} - uses: actions/checkout@v3 - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH - - run: pip3 install .["dev"] 'numpy<2' torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple + - run: pip3 install .["dev"] ${{ matrix.numpy-requirement }} torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda' From 25639fc17ddc013d56c594bfbf7644f2185fad84 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 30 Sep 2024 11:20:53 -0700 Subject: [PATCH 08/18] Release 20240930 --- CHANGELOG.md | 7 +++++++ whisper/version.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f09538..7152899 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # CHANGELOG +## [v20240930](https://github.com/openai/whisper/releases/tag/v20240930) + +* allowing numpy 2 in tests ([#2362](https://github.com/openai/whisper/pull/2362)) +* large-v3-turbo model ([#2361](https://github.com/openai/whisper/pull/2361)) +* test on python/pytorch versions up to 3.12 and 2.4.1 ([#2360](https://github.com/openai/whisper/pull/2360)) +* using sdpa if available ([#2359](https://github.com/openai/whisper/pull/2359)) + ## [v20240927](https://github.com/openai/whisper/releases/tag/v20240927) * pinning numpy<2 in tests ([#2332](https://github.com/openai/whisper/pull/2332)) diff --git a/whisper/version.py b/whisper/version.py index 2242d25..b4b3350 100644 --- a/whisper/version.py +++ b/whisper/version.py @@ -1 +1 @@ -__version__ = "20240927" +__version__ = "20240930" From cdb81479623391f0651f4f9175ad986e85777f31 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Fri, 25 Oct 2024 17:30:02 -0700 Subject: [PATCH 09/18] more pytorch versions in tests (#2408) --- .github/workflows/test.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 88131f5..84b81cc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -42,6 +42,9 @@ jobs: strategy: matrix: include: + - python-version: '3.8' + pytorch-version: 1.10.1 + numpy-requirement: "'numpy<2'" - python-version: '3.8' pytorch-version: 1.13.1 numpy-requirement: "'numpy<2'" @@ -60,6 +63,9 @@ jobs: - python-version: '3.12' pytorch-version: 2.4.1 numpy-requirement: "'numpy'" + - python-version: '3.12' + pytorch-version: 2.5.0 + numpy-requirement: "'numpy'" steps: - uses: conda-incubator/setup-miniconda@v2 - run: conda install -n test ffmpeg python=${{ matrix.python-version }} From 5979f03701209bb035a0a466f14131aeb1116cbb Mon Sep 17 00:00:00 2001 From: kittsil Date: Sat, 26 Oct 2024 09:17:31 -0500 Subject: [PATCH 10/18] Add option to carry initial_prompt with the sliding window (#2343) * Add option to carry initial_prompt with the sliding window Add an option `carry_initial_prompt = False` to `whisper.transcribe()`. When set to `True`, `initial_prompt` is prepended to each internal `decode()` call's `prompt`. If there is not enough context space at the start of the prompt, the prompt is left-sliced to make space. * Prevent redundant initial_prompt_tokens * Revert unnecessary .gitignore change --------- Co-authored-by: Kittsil Co-authored-by: Jong Wook Kim --- whisper/transcribe.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/whisper/transcribe.py b/whisper/transcribe.py index 8e1240b..8eb6a71 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -46,6 +46,7 @@ def transcribe( no_speech_threshold: Optional[float] = 0.6, condition_on_previous_text: bool = True, initial_prompt: Optional[str] = None, + carry_initial_prompt: bool = False, word_timestamps: bool = False, prepend_punctuations: str = "\"'“¿([{-", append_punctuations: str = "\"'.。,,!!??::”)]}、", @@ -102,6 +103,11 @@ def transcribe( "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those word correctly. + carry_initial_prompt: bool + If carry_initial_prompt is True, `initial_prompt` is prepended to the prompt of each internal + `decode()` call. If there is not enough context space at the start of the prompt, it is + left-sliced to make space. + decode_options: dict Keyword arguments to construct `DecodingOptions` instances @@ -227,9 +233,11 @@ def transcribe( all_segments = [] prompt_reset_since = 0 + remaining_prompt_length = model.dims.n_text_ctx // 2 - 1 if initial_prompt is not None: initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip()) all_tokens.extend(initial_prompt_tokens) + remaining_prompt_length -= len(initial_prompt_tokens) else: initial_prompt_tokens = [] @@ -275,7 +283,13 @@ def transcribe( segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype) - decode_options["prompt"] = all_tokens[prompt_reset_since:] + if carry_initial_prompt: + nignored = max(len(initial_prompt_tokens), prompt_reset_since) + remaining_prompt = all_tokens[nignored:][-remaining_prompt_length:] + decode_options["prompt"] = initial_prompt_tokens + remaining_prompt + else: + decode_options["prompt"] = all_tokens[prompt_reset_since:] + result: DecodingResult = decode_with_fallback(mel_segment) tokens = torch.tensor(result.tokens) @@ -529,6 +543,8 @@ def cli(): parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations") parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.") + parser.add_argument("--carry_initial_prompt", type=str2bool, default=False, help="if True, prepend initial_prompt to every internal decode() call. May reduce the effectiveness of condition_on_previous_text") + parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop") parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default") From 271445b2f24f00f8175c4fb7ae91876f7451dfc1 Mon Sep 17 00:00:00 2001 From: BotMaster3000 Date: Mon, 4 Nov 2024 08:00:30 +0100 Subject: [PATCH 11/18] Update README.md (#2379) Default now uses Turbo instead of Small --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 910b7db..1a661d7 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ The following command will transcribe speech in audio files, using the `turbo` m whisper audio.flac audio.mp3 audio.wav --model turbo -The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option: +The default setting (which selects the `turbo` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option: whisper japanese.wav --language Japanese From 173ff7dd1d9fb1c4fddea0d41d704cfefeb8908c Mon Sep 17 00:00:00 2001 From: f1sh <71207078+YuZekai@users.noreply.github.com> Date: Wed, 13 Nov 2024 08:35:54 +0800 Subject: [PATCH 12/18] fix typo data/README.md (#2433) --- data/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/README.md b/data/README.md index 3b4aea1..fcb3200 100644 --- a/data/README.md +++ b/data/README.md @@ -45,7 +45,7 @@ We downloaded the [CHiME-5 dataset](https://spandh.dcs.shef.ac.uk//chime_challen ### AMI-IHM, AMI-SDM1 -We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 ad 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b). +We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 and 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b). ## Long-form English-only datasets From fc5ded7d9045c693692f13853857c3f8baea3a7b Mon Sep 17 00:00:00 2001 From: Lowell Vaughn Date: Tue, 26 Nov 2024 09:37:01 -0800 Subject: [PATCH 13/18] Updating README and doc strings to reflect that n_mels can now be 128 (#2049) --- README.md | 2 +- whisper/audio.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1a661d7..696869c 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ audio = whisper.load_audio("audio.mp3") audio = whisper.pad_or_trim(audio) # make log-Mel spectrogram and move to the same device as the model -mel = whisper.log_mel_spectrogram(audio).to(model.device) +mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device) # detect the spoken language _, probs = model.detect_language(mel) diff --git a/whisper/audio.py b/whisper/audio.py index cf6c66a..826250f 100644 --- a/whisper/audio.py +++ b/whisper/audio.py @@ -122,7 +122,7 @@ def log_mel_spectrogram( The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz n_mels: int - The number of Mel-frequency filters, only 80 is supported + The number of Mel-frequency filters, only 80 and 128 are supported padding: int Number of zero samples to pad to the right @@ -132,7 +132,7 @@ def log_mel_spectrogram( Returns ------- - torch.Tensor, shape = (80, n_frames) + torch.Tensor, shape = (n_mels, n_frames) A Tensor that contains the Mel spectrogram """ if not torch.is_tensor(audio): From 90db0de1896c23cbfaf0c58bc2d30665f709f170 Mon Sep 17 00:00:00 2001 From: Purfview <69023953+Purfview@users.noreply.github.com> Date: Sun, 1 Dec 2024 05:47:01 +0000 Subject: [PATCH 14/18] Bugfix: Illogical "Avoid computing higher temperatures on no_speech" (#1903) * Bugfix: Illogical "Avoid computing higher temperatures on no_speech" Bugfix for https://github.com/openai/whisper/pull/1279 It's "silence" when decoding has failed due to `compression_ratio_threshold` too, when further down the code it's not "silence" anymore. "Silence" should be only when decoding has failed due to `logprob_threshold`. Like described there: https://github.com/openai/whisper/blob/8bc8860694949db53c42ba47ddc23786c2e02a8b/whisper/transcribe.py#L421 And in code there: https://github.com/openai/whisper/blob/8bc8860694949db53c42ba47ddc23786c2e02a8b/whisper/transcribe.py#L243-L251 * Fix if "logprob_threshold=None" --------- Co-authored-by: Jong Wook Kim --- whisper/transcribe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/whisper/transcribe.py b/whisper/transcribe.py index 8eb6a71..0a4cc36 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -214,6 +214,8 @@ def transcribe( if ( no_speech_threshold is not None and decode_result.no_speech_prob > no_speech_threshold + and logprob_threshold is not None + and decode_result.avg_logprob < logprob_threshold ): needs_fallback = False # silence if not needs_fallback: From 6c1d8f1ea10b85ec0a0ed584edb5ad9c8efc3195 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sat, 4 Jan 2025 09:47:12 +0100 Subject: [PATCH 15/18] Upgrade GitHub Actions (#2430) --- .github/workflows/test.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 84b81cc..106c66b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,10 +11,10 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Fetch base branch run: git fetch origin ${{ github.base_ref }} - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: "3.8" architecture: x64 @@ -23,7 +23,7 @@ jobs: run: | echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT - name: pip/pre-commit cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ${{ steps.pip-cache.outputs.dir }} @@ -67,9 +67,9 @@ jobs: pytorch-version: 2.5.0 numpy-requirement: "'numpy'" steps: - - uses: conda-incubator/setup-miniconda@v2 + - uses: conda-incubator/setup-miniconda@v3 - run: conda install -n test ffmpeg python=${{ matrix.python-version }} - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH - run: pip3 install .["dev"] ${{ matrix.numpy-requirement }} torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda' From 26a7cacc83c2cfbbf743022da8331b29702ceedc Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sat, 4 Jan 2025 10:02:18 +0100 Subject: [PATCH 16/18] pre-commit autoupdate && pre-commit run --all-files (#2484) * pre-commit autoupdate && pre-commit run --all-files * Black formatter needs a current version of Python --- .github/workflows/test.yml | 4 ++-- .pre-commit-config.yaml | 8 ++++---- whisper/normalizers/basic.py | 22 +++++++++++++--------- whisper/utils.py | 8 +++++--- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 106c66b..16c7ff7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,7 @@ jobs: run: git fetch origin ${{ github.base_ref }} - uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.9" architecture: x64 - name: Get pip cache dir id: pip-cache @@ -33,7 +33,7 @@ jobs: ${{ runner.os }}-pip-pre-commit - name: pre-commit run: | - pip install -U pre-commit + pip install --upgrade pre-commit pre-commit install --install-hooks pre-commit run --all-files whisper-test: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3f5a74b..48df249 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v5.0.0 hooks: - id: check-json - id: end-of-file-fixer @@ -11,17 +11,17 @@ repos: - id: check-added-large-files args: [--maxkb=4096] - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 24.10.0 hooks: - id: black - repo: https://github.com/pycqa/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort name: isort (python) args: ["--profile", "black", "-l", "88", "--trailing-comma", "--multi-line", "3"] - repo: https://github.com/pycqa/flake8.git - rev: 6.0.0 + rev: 7.1.1 hooks: - id: flake8 types: [python] diff --git a/whisper/normalizers/basic.py b/whisper/normalizers/basic.py index a824032..8690ae7 100644 --- a/whisper/normalizers/basic.py +++ b/whisper/normalizers/basic.py @@ -30,15 +30,19 @@ def remove_symbols_and_diacritics(s: str, keep=""): and drop any diacritics (category 'Mn' and some manual mappings) """ return "".join( - c - if c in keep - else ADDITIONAL_DIACRITICS[c] - if c in ADDITIONAL_DIACRITICS - else "" - if unicodedata.category(c) == "Mn" - else " " - if unicodedata.category(c)[0] in "MSP" - else c + ( + c + if c in keep + else ( + ADDITIONAL_DIACRITICS[c] + if c in ADDITIONAL_DIACRITICS + else ( + "" + if unicodedata.category(c) == "Mn" + else " " if unicodedata.category(c)[0] in "MSP" else c + ) + ) + ) for c in unicodedata.normalize("NFKD", s) ) diff --git a/whisper/utils.py b/whisper/utils.py index 9b9b138..13792f7 100644 --- a/whisper/utils.py +++ b/whisper/utils.py @@ -209,9 +209,11 @@ class SubtitlesWriter(ResultWriter): yield start, end, "".join( [ - re.sub(r"^(\s*)(.*)$", r"\1\2", word) - if j == i - else word + ( + re.sub(r"^(\s*)(.*)$", r"\1\2", word) + if j == i + else word + ) for j, word in enumerate(all_words) ] ) From dd4d010d2c585bc70aeddd166cd3e26b0bb62f31 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sat, 4 Jan 2025 10:38:35 +0100 Subject: [PATCH 17/18] PEP 621: Migrate from setup.py to pyproject.toml (#2435) --- pyproject.toml | 48 +++++++++++++++++++++++++++++++++++++++++++++++- setup.py | 42 ------------------------------------------ 2 files changed, 47 insertions(+), 43 deletions(-) delete mode 100644 setup.py diff --git a/pyproject.toml b/pyproject.toml index 84637eb..21b90e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,50 @@ +[build-system] +build-backend = "setuptools.build_meta" + +requires = [ "setuptools>=61.2" ] + +[project] +name = "openai-whisper" +description = "Robust Speech Recognition via Large-Scale Weak Supervision" +readme.content-type = "text/markdown" +readme.file = "README.md" +license = { text = "MIT" } +authors = [ { name = "OpenAI" } ] +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dynamic = [ "version" ] +dependencies = [ + "more-itertools", + "numba", + "numpy", + "tiktoken", + "torch", + "tqdm", + "triton>=2; (platform_machine=='x86_64' and sys_platform=='linux') or sys_platform=='linux2'", +] +optional-dependencies.dev = [ "black", "flake8", "isort", "pytest", "scipy" ] +urls = { Homepage = "https://github.com/openai/whisper" } +scripts.whisper = "whisper.transcribe:cli" + +[tool.setuptools] +py-modules = [ "whisper" ] +include-package-data = true + +[tool.setuptools.dynamic] +version = { attr = "whisper.version.__version__" } + +[tool.setuptools.packages.find] +exclude = [ "tests*" ] +namespaces = false + [tool.black] [tool.isort] @@ -5,4 +52,3 @@ profile = "black" include_trailing_comma = true line_length = 88 multi_line_output = 3 - diff --git a/setup.py b/setup.py deleted file mode 100644 index 73c4eb8..0000000 --- a/setup.py +++ /dev/null @@ -1,42 +0,0 @@ -import platform -import sys -from pathlib import Path - -import pkg_resources -from setuptools import find_packages, setup - - -def read_version(fname="whisper/version.py"): - exec(compile(open(fname, encoding="utf-8").read(), fname, "exec")) - return locals()["__version__"] - - -requirements = [] -if sys.platform.startswith("linux") and platform.machine() == "x86_64": - requirements.append("triton>=2.0.0") - -setup( - name="openai-whisper", - py_modules=["whisper"], - version=read_version(), - description="Robust Speech Recognition via Large-Scale Weak Supervision", - long_description=open("README.md", encoding="utf-8").read(), - long_description_content_type="text/markdown", - readme="README.md", - python_requires=">=3.8", - author="OpenAI", - url="https://github.com/openai/whisper", - license="MIT", - packages=find_packages(exclude=["tests*"]), - install_requires=[ - str(r) - for r in pkg_resources.parse_requirements( - Path(__file__).with_name("requirements.txt").open() - ) - ], - entry_points={ - "console_scripts": ["whisper=whisper.transcribe:cli"], - }, - include_package_data=True, - extras_require={"dev": ["pytest", "scipy", "black", "flake8", "isort"]}, -) From 517a43ecd132a2089d85f4ebc044728a71d49f6e Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Sat, 4 Jan 2025 12:56:16 -0800 Subject: [PATCH 18/18] Update python-publish.yml using `-m build --sdist` instead of `setup.py sdist` --- .github/workflows/python-publish.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 4b91a2a..c868068 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -33,5 +33,5 @@ jobs: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} run: | - python setup.py sdist + python -m build --sdist twine upload dist/*