From 79c43e48592bd5cd9d893236a617d2ddeee8d878 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Tue, 14 Mar 2023 15:47:58 -0400 Subject: [PATCH 01/45] abort find_alignment on empty input (#1090) --- whisper/timing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/whisper/timing.py b/whisper/timing.py index 1f8f4cf..f5d07e5 100644 --- a/whisper/timing.py +++ b/whisper/timing.py @@ -170,6 +170,9 @@ def find_alignment( medfilt_width: int = 7, qk_scale: float = 1.0, ) -> List[WordTiming]: + if len(text_tokens) == 0: + return [] + tokens = torch.tensor( [ *tokenizer.sot_sequence, From 6dea21fd7f7253bfe450f1e2512a0fe47ee2d258 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Wed, 15 Mar 2023 00:39:05 -0700 Subject: [PATCH 02/45] Release 20230314 --- CHANGELOG.md | 8 ++++++++ whisper/version.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51697a0..a77d966 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # CHANGELOG +## [v20230314](https://github.com/openai/whisper/releases/tag/v20230314) + +* abort find_alignment on empty input ([#1090](https://github.com/openai/whisper/pull/1090)) +* Fix truncated words list when the replacement character is decoded ([#1089](https://github.com/openai/whisper/pull/1089)) +* fix github language stats getting dominated by jupyter notebook ([#1076](https://github.com/openai/whisper/pull/1076)) +* Fix alignment between the segments and the list of words ([#1087](https://github.com/openai/whisper/pull/1087)) +* Use tiktoken ([#1044](https://github.com/openai/whisper/pull/1044)) + ## [v20230308](https://github.com/openai/whisper/releases/tag/v20230308) * kwargs in decode() for convenience ([#1061](https://github.com/openai/whisper/pull/1061)) diff --git a/whisper/version.py b/whisper/version.py index a0e37f9..572259a 100644 --- a/whisper/version.py +++ b/whisper/version.py @@ -1 +1 @@ -__version__ = "20230308" +__version__ = "20230314" From b5851c6c40e753606765ac45b85b298e3ae9e00d Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Wed, 29 Mar 2023 16:12:36 -0400 Subject: [PATCH 03/45] Update tokenizer.py (#1163) --- whisper/tokenizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/whisper/tokenizer.py b/whisper/tokenizer.py index 236f65e..4030e15 100644 --- a/whisper/tokenizer.py +++ b/whisper/tokenizer.py @@ -6,7 +6,6 @@ from functools import cached_property, lru_cache from typing import Dict, List, Optional, Tuple import tiktoken -from tiktoken_ext.openai_public import gpt2 LANGUAGES = { "en": "english", @@ -352,7 +351,7 @@ def get_encoding(name: str = "gpt2"): return tiktoken.Encoding( name=os.path.basename(vocab_path), explicit_n_vocab=n_vocab, - pat_str=gpt2()["pat_str"], + pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", mergeable_ranks=ranks, special_tokens=special_tokens, ) From a151816b6b123824c1630e9aed48b04fb90cf4cd Mon Sep 17 00:00:00 2001 From: "K.B.Dharun Krishna" Date: Tue, 11 Apr 2023 02:24:09 +0530 Subject: [PATCH 04/45] python-publish.yml: bump actions version to fix node warning (#1211) --- .github/workflows/python-publish.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 116c859..4b91a2a 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -8,14 +8,14 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - 
uses: actions-ecosystem/action-regex-match@v2 id: regex-match with: text: ${{ github.event.head_commit.message }} regex: '^Release ([^ ]+)' - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: '3.8' - name: Install dependencies From 255887f219e6b632bc1a6aac1caf28eecfca1bac Mon Sep 17 00:00:00 2001 From: ryanheise Date: Tue, 11 Apr 2023 10:23:53 +1000 Subject: [PATCH 05/45] Squash long words at window and sentence boundaries. (#1114) * Squash long words at window and sentence boundaries. * Formatting requirements. * Fix squashing logic to point to correct words. --------- Co-authored-by: Jong Wook Kim --- whisper/timing.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/whisper/timing.py b/whisper/timing.py index f5d07e5..055ccc0 100644 --- a/whisper/timing.py +++ b/whisper/timing.py @@ -225,17 +225,26 @@ def find_alignment( for i, j in zip(word_boundaries[:-1], word_boundaries[1:]) ] - # hack: ensure the first and second word is not longer than twice the median word duration. + # hack: truncate long words at the start of a window and the start of a sentence. # a better segmentation algorithm based on VAD should be able to replace this. word_durations = end_times - start_times word_durations = word_durations[word_durations.nonzero()] if len(word_durations) > 0: median_duration = np.median(word_durations) max_duration = median_duration * 2 - if len(word_durations) >= 2 and word_durations[1] > max_duration: - boundary = max(end_times[2] / 2, end_times[2] - max_duration) - end_times[0] = start_times[1] = boundary - if len(word_durations) >= 1 and end_times[0] - start_times[0] > max_duration: + sentence_end_marks = ".。!!??" + # ensure words at sentence boundaries are not longer than twice the median word duration. + for i in range(1, len(start_times)): + if end_times[i] - start_times[i] > max_duration: + if words[i] in sentence_end_marks: + end_times[i] = start_times[i] + max_duration + elif words[i - 1] in sentence_end_marks: + start_times[i] = end_times[i] - max_duration + # ensure the first and second word is not longer than twice the median word duration. + if len(start_times) > 0 and end_times[0] - start_times[0] > max_duration: + if len(start_times) > 1 and end_times[1] - start_times[1] > max_duration: + boundary = max(end_times[1] / 2, end_times[1] - max_duration) + end_times[0] = start_times[1] = boundary start_times[0] = max(0, end_times[0] - max_duration) return [ @@ -327,8 +336,17 @@ def add_word_timestamps( word_index += 1 if len(words) > 0: - # adjust the segment-level timestamps based on the word-level timestamps segment["start"] = words[0]["start"] - segment["end"] = words[-1]["end"] + # hack: prefer the segment-level end timestamp if the last word is too long. + # a better segmentation algorithm based on VAD should be able to replace this. 
+ if ( + segment["end"] > words[-1]["start"] + and segment["end"] + 0.5 < words[-1]["end"] + ): + # adjust the word-level timestamps based on the segment-level timestamps + words[-1]["end"] = segment["end"] + else: + # adjust the segment-level timestamps based on the word-level timestamps + segment["end"] = words[-1]["end"] segment["words"] = words From 43940fc9780cb91c4f94899755b4648e19d7b977 Mon Sep 17 00:00:00 2001 From: ryanheise Date: Tue, 11 Apr 2023 10:28:35 +1000 Subject: [PATCH 06/45] Implement max line width and max line count, and make word highlighting optional (#1184) * Add highlight_words, max_line_width, max_line_count * Refactor subtitle generator --------- Co-authored-by: Jong Wook Kim --- whisper/transcribe.py | 13 ++++- whisper/utils.py | 129 +++++++++++++++++++++++++++++------------- 2 files changed, 103 insertions(+), 39 deletions(-) diff --git a/whisper/transcribe.py b/whisper/transcribe.py index ed6d820..84feb12 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -401,6 +401,9 @@ def cli(): parser.add_argument("--word_timestamps", type=str2bool, default=False, help="(experimental) extract word-level timestamps and refine the results based on them") parser.add_argument("--prepend_punctuations", type=str, default="\"\'“¿([{-", help="if word_timestamps is True, merge these punctuation symbols with the next word") parser.add_argument("--append_punctuations", type=str, default="\"\'.。,,!!??::”)]}、", help="if word_timestamps is True, merge these punctuation symbols with the previous word") + parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt") + parser.add_argument("--max_line_width", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of characters in a line before breaking the line") + parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment") parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS") # fmt: on @@ -433,9 +436,17 @@ def cli(): model = load_model(model_name, device=device, download_root=model_dir) writer = get_writer(output_format, output_dir) + word_options = ["highlight_words", "max_line_count", "max_line_width"] + if not args["word_timestamps"]: + for option in word_options: + if args[option]: + parser.error(f"--{option} requires --word_timestamps True") + if args["max_line_count"] and not args["max_line_width"]: + warnings.warn("--max_line_count has no effect without --max_line_width") + writer_args = {arg: args.pop(arg) for arg in word_options} for audio_path in args.pop("audio"): result = transcribe(model, audio_path, temperature=temperature, **args) - writer(result, audio_path) + writer(result, audio_path, writer_args) if __name__ == "__main__": diff --git a/whisper/utils.py b/whisper/utils.py index 490bdd1..ba5a10c 100644 --- a/whisper/utils.py +++ b/whisper/utils.py @@ -1,8 +1,9 @@ import json import os +import re import sys import zlib -from typing import Callable, TextIO +from typing import Callable, Optional, TextIO system_encoding = sys.getdefaultencoding() @@ -73,7 +74,7 @@ class ResultWriter: def __init__(self, output_dir: str): self.output_dir = output_dir - def __call__(self, result: dict, audio_path: str): + def __call__(self, result: dict, audio_path: str, options: dict): 
audio_basename = os.path.basename(audio_path) audio_basename = os.path.splitext(audio_basename)[0] output_path = os.path.join( @@ -81,16 +82,16 @@ class ResultWriter: ) with open(output_path, "w", encoding="utf-8") as f: - self.write_result(result, file=f) + self.write_result(result, file=f, options=options) - def write_result(self, result: dict, file: TextIO): + def write_result(self, result: dict, file: TextIO, options: dict): raise NotImplementedError class WriteTXT(ResultWriter): extension: str = "txt" - def write_result(self, result: dict, file: TextIO): + def write_result(self, result: dict, file: TextIO, options: dict): for segment in result["segments"]: print(segment["text"].strip(), file=file, flush=True) @@ -99,33 +100,81 @@ class SubtitlesWriter(ResultWriter): always_include_hours: bool decimal_marker: str - def iterate_result(self, result: dict): - for segment in result["segments"]: - segment_start = self.format_timestamp(segment["start"]) - segment_end = self.format_timestamp(segment["end"]) - segment_text = segment["text"].strip().replace("-->", "->") + def iterate_result(self, result: dict, options: dict): + raw_max_line_width: Optional[int] = options["max_line_width"] + max_line_count: Optional[int] = options["max_line_count"] + highlight_words: bool = options["highlight_words"] + max_line_width = 1000 if raw_max_line_width is None else raw_max_line_width + preserve_segments = max_line_count is None or raw_max_line_width is None - if word_timings := segment.get("words", None): - all_words = [timing["word"] for timing in word_timings] - all_words[0] = all_words[0].strip() # remove the leading space, if any - last = segment_start - for i, this_word in enumerate(word_timings): - start = self.format_timestamp(this_word["start"]) - end = self.format_timestamp(this_word["end"]) - if last != start: - yield last, start, segment_text + def iterate_subtitles(): + line_len = 0 + line_count = 1 + # the next subtitle to yield (a list of word timings with whitespace) + subtitle: list[dict] = [] + last = result["segments"][0]["words"][0]["start"] + for segment in result["segments"]: + for i, original_timing in enumerate(segment["words"]): + timing = original_timing.copy() + long_pause = not preserve_segments and timing["start"] - last > 3.0 + has_room = line_len + len(timing["word"]) <= max_line_width + seg_break = i == 0 and len(subtitle) > 0 and preserve_segments + if line_len > 0 and has_room and not long_pause and not seg_break: + # line continuation + line_len += len(timing["word"]) + else: + # new line + timing["word"] = timing["word"].strip() + if ( + len(subtitle) > 0 + and max_line_count is not None + and (long_pause or line_count >= max_line_count) + or seg_break + ): + # subtitle break + yield subtitle + subtitle = [] + line_count = 1 + elif line_len > 0: + # line break + line_count += 1 + timing["word"] = "\n" + timing["word"] + line_len = len(timing["word"].strip()) + subtitle.append(timing) + last = timing["start"] + if len(subtitle) > 0: + yield subtitle - yield start, end, "".join( - [ - f"{word}" if j == i else word - for j, word in enumerate(all_words) - ] - ) - last = end + if "words" in result["segments"][0]: + for subtitle in iterate_subtitles(): + subtitle_start = self.format_timestamp(subtitle[0]["start"]) + subtitle_end = self.format_timestamp(subtitle[-1]["end"]) + subtitle_text = "".join([word["word"] for word in subtitle]) + if highlight_words: + last = subtitle_start + all_words = [timing["word"] for timing in subtitle] + for i, this_word in 
enumerate(subtitle): + start = self.format_timestamp(this_word["start"]) + end = self.format_timestamp(this_word["end"]) + if last != start: + yield last, start, subtitle_text - if last != segment_end: - yield last, segment_end, segment_text - else: + yield start, end, "".join( + [ + re.sub(r"^(\s*)(.*)$", r"\1\2", word) + if j == i + else word + for j, word in enumerate(all_words) + ] + ) + last = end + else: + yield subtitle_start, subtitle_end, subtitle_text + else: + for segment in result["segments"]: + segment_start = self.format_timestamp(segment["start"]) + segment_end = self.format_timestamp(segment["end"]) + segment_text = segment["text"].strip().replace("-->", "->") yield segment_start, segment_end, segment_text def format_timestamp(self, seconds: float): @@ -141,9 +190,9 @@ class WriteVTT(SubtitlesWriter): always_include_hours: bool = False decimal_marker: str = "." - def write_result(self, result: dict, file: TextIO): + def write_result(self, result: dict, file: TextIO, options: dict): print("WEBVTT\n", file=file) - for start, end, text in self.iterate_result(result): + for start, end, text in self.iterate_result(result, options): print(f"{start} --> {end}\n{text}\n", file=file, flush=True) @@ -152,8 +201,10 @@ class WriteSRT(SubtitlesWriter): always_include_hours: bool = True decimal_marker: str = "," - def write_result(self, result: dict, file: TextIO): - for i, (start, end, text) in enumerate(self.iterate_result(result), start=1): + def write_result(self, result: dict, file: TextIO, options: dict): + for i, (start, end, text) in enumerate( + self.iterate_result(result, options), start=1 + ): print(f"{i}\n{start} --> {end}\n{text}\n", file=file, flush=True) @@ -169,7 +220,7 @@ class WriteTSV(ResultWriter): extension: str = "tsv" - def write_result(self, result: dict, file: TextIO): + def write_result(self, result: dict, file: TextIO, options: dict): print("start", "end", "text", sep="\t", file=file) for segment in result["segments"]: print(round(1000 * segment["start"]), file=file, end="\t") @@ -180,11 +231,13 @@ class WriteTSV(ResultWriter): class WriteJSON(ResultWriter): extension: str = "json" - def write_result(self, result: dict, file: TextIO): + def write_result(self, result: dict, file: TextIO, options: dict): json.dump(result, file) -def get_writer(output_format: str, output_dir: str) -> Callable[[dict, TextIO], None]: +def get_writer( + output_format: str, output_dir: str +) -> Callable[[dict, TextIO, dict], None]: writers = { "txt": WriteTXT, "vtt": WriteVTT, @@ -196,9 +249,9 @@ def get_writer(output_format: str, output_dir: str) -> Callable[[dict, TextIO], if output_format == "all": all_writers = [writer(output_dir) for writer in writers.values()] - def write_all(result: dict, file: TextIO): + def write_all(result: dict, file: TextIO, options: dict): for writer in all_writers: - writer(result, file) + writer(result, file, options) return write_all From 76c901ab8d4558992c44138479c4d69eb52fadcb Mon Sep 17 00:00:00 2001 From: Arseniy Bushyn Date: Tue, 11 Apr 2023 03:39:17 +0300 Subject: [PATCH 07/45] Update README.md to reference tiktoken (#1105) Co-authored-by: Jong Wook Kim --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index eba82ce..64e2d84 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ A Transformer sequence-to-sequence model is trained on various speech processing ## Setup -We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected 
to be compatible with Python 3.8-3.10 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) for their fast tokenizer implementation and [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) for reading audio files. You can download and install (or update to) the latest release of Whisper with the following command:
+We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.10 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for their fast tokenizer implementation and [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) for reading audio files. You can download and install (or update to) the latest release of Whisper with the following command:

     pip install -U openai-whisper

@@ -48,7 +48,7 @@ choco install ffmpeg
 scoop install ffmpeg
 ```

-You may need [`rust`](http://rust-lang.org) installed as well, in case [tokenizers](https://pypi.org/project/tokenizers/) does not provide a pre-built wheel for your platform. If you see installation errors during the `pip install` command above, please follow the [Getting started page](https://www.rust-lang.org/learn/get-started) to install Rust development environment. Additionally, you may need to configure the `PATH` environment variable, e.g. `export PATH="$HOME/.cargo/bin:$PATH"`. If the installation fails with `No module named 'setuptools_rust'`, you need to install `setuptools_rust`, e.g. by running:
+You may need [`rust`](http://rust-lang.org) installed as well, in case [tiktoken](https://github.com/openai/tiktoken) does not provide a pre-built wheel for your platform. If you see installation errors during the `pip install` command above, please follow the [Getting started page](https://www.rust-lang.org/learn/get-started) to install Rust development environment. Additionally, you may need to configure the `PATH` environment variable, e.g. `export PATH="$HOME/.cargo/bin:$PATH"`. If the installation fails with `No module named 'setuptools_rust'`, you need to install `setuptools_rust`, e.g. by running:

 ```bash
 pip install setuptools-rust

From b0022b3283232b2b9f19262360cd80ec9975aeb4 Mon Sep 17 00:00:00 2001
From: "Fernando O. Gallego"
Date: Wed, 12 Apr 2023 00:06:03 +0200
Subject: [PATCH 08/45] Update decoding.py (#1155)

* Update decoding.py

Following the suggestions of @Jeronymous in https://github.com/openai/whisper/pull/914 and https://github.com/openai/whisper/discussions/924, it solves the problem of endless loops.

* Removed blank line and whitespaces in empty lines.
* Suggested changes according to the linter --------- Co-authored-by: Jong Wook Kim --- whisper/decoding.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/whisper/decoding.py b/whisper/decoding.py index 81cd845..2592ba9 100644 --- a/whisper/decoding.py +++ b/whisper/decoding.py @@ -471,6 +471,13 @@ class ApplyTimestampRules(LogitFilter): # timestamps shouldn't decrease; forbid timestamp tokens smaller than the last logits[k, self.tokenizer.timestamp_begin : timestamps[-1]] = -np.inf + # to force that timestamps are strictly increasing + if last_was_timestamp and not penultimate_was_timestamp: + timestamp_last = timestamps[-1] + else: + timestamp_last = timestamps[-1] + 1 + logits[k, self.tokenizer.timestamp_begin : timestamp_last] = -np.inf + if tokens.shape[1] == self.sample_begin: # suppress generating non-timestamp tokens at the beginning logits[:, : self.tokenizer.timestamp_begin] = -np.inf From c09a7ae299c4c34c5839a76380ae407e7d785914 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Tue, 11 Apr 2023 18:13:13 -0400 Subject: [PATCH 09/45] Update decoding.py (#1219) --- whisper/decoding.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/whisper/decoding.py b/whisper/decoding.py index 2592ba9..457ee7c 100644 --- a/whisper/decoding.py +++ b/whisper/decoding.py @@ -469,9 +469,7 @@ class ApplyTimestampRules(LogitFilter): ] if timestamps.numel() > 0: # timestamps shouldn't decrease; forbid timestamp tokens smaller than the last - logits[k, self.tokenizer.timestamp_begin : timestamps[-1]] = -np.inf - - # to force that timestamps are strictly increasing + # also force each segment to have a nonzero length, to prevent infinite looping if last_was_timestamp and not penultimate_was_timestamp: timestamp_last = timestamps[-1] else: From e69930cb9c92c6d821225ccf9cc2fb0029f07635 Mon Sep 17 00:00:00 2001 From: Johnny Date: Thu, 4 May 2023 19:42:09 +0200 Subject: [PATCH 10/45] Python 3.11 (#1171) * python 3.11 * python 3.11 * fix * fix * fix * revert changes * Update requirements.txt * Trying pip3 install instead * Excluding cp39 - torch 1.10.2 * Removing 1.10.2 from test --------- Co-authored-by: Jong Wook Kim --- .github/workflows/test.yml | 13 +++++++------ README.md | 4 +++- requirements.txt | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bca49b5..3796a39 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,15 +11,16 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10'] - pytorch-version: [1.10.2, 1.13.1] + python-version: ['3.8', '3.9', '3.10', '3.11'] + pytorch-version: [1.13.1, 2.0.0] exclude: - - python-version: '3.10' - pytorch-version: 1.10.2 + - python-version: '3.11' + pytorch-version: 1.13.1 steps: - uses: conda-incubator/setup-miniconda@v2 - - run: conda install -n test ffmpeg python=${{ matrix.python-version }} pytorch=${{ matrix.pytorch-version }} cpuonly -c pytorch - - uses: actions/checkout@v2 + - run: conda install -n test ffmpeg python=${{ matrix.python-version }} + - run: pip3 install torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu + - uses: actions/checkout@v3 - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH - run: pip install .["dev"] - run: black --check --diff -t py38 --include '(\.pyi?)$' . 
diff --git a/README.md b/README.md
index 64e2d84..648d0c1 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,9 @@ A Transformer sequence-to-sequence model is trained on various speech processing

 ## Setup

-We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.10 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for their fast tokenizer implementation and [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) for reading audio files. You can download and install (or update to) the latest release of Whisper with the following command:
+
+We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.11 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for their fast tokenizer implementation and [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) for reading audio files. You can download and install (or update to) the latest release of Whisper with the following command:
+

     pip install -U openai-whisper

diff --git a/requirements.txt b/requirements.txt
index 09ad0e3..995977a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,5 +3,5 @@ numpy
 torch
 tqdm
 more-itertools
-tiktoken==0.3.1
+tiktoken==0.3.3
 ffmpeg-python==0.2.0

From 8035e9ef4890a06824a9a56a54d0feea9588f23b Mon Sep 17 00:00:00 2001
From: petterreinholdtsen
Date: Thu, 4 May 2023 19:53:59 +0200
Subject: [PATCH 11/45] Drop ffmpeg-python dependency and call ffmpeg directly. (#1242)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Drop ffmpeg-python dependency and call ffmpeg directly.

The last ffmpeg-python module release was in 2019[1], upstream seems to be unavailable[2] and the project's development seems to have stagnated[3].

As the features it provides are trivial to replace using the Python native subprocess module, drop the dependency.

[1]
[2]
[3]

* Rewrote to use subprocess.run() instead of subprocess.Popen().
* formatting changes
* formatting update
* isort fix
* Error checking
* isort 🤦🏻
* flake8 fix
* minor spelling changes

---------

Co-authored-by: Jong Wook Kim
---
 README.md        |  4 +---
 requirements.txt |  1 -
 whisper/audio.py | 28 +++++++++++++++++++---------
 3 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 648d0c1..b4d3998 100644
--- a/README.md
+++ b/README.md
@@ -17,9 +17,7 @@ A Transformer sequence-to-sequence model is trained on various speech processing

 ## Setup

-
-We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.11 and recent PyTorch versions.
The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for their fast tokenizer implementation and [ffmpeg-python](https://github.com/kkroening/ffmpeg-python) for reading audio files. You can download and install (or update to) the latest release of Whisper with the following command:
-
+We used Python 3.9.9 and [PyTorch](https://pytorch.org/) 1.10.1 to train and test our models, but the codebase is expected to be compatible with Python 3.8-3.11 and recent PyTorch versions. The codebase also depends on a few Python packages, most notably [OpenAI's tiktoken](https://github.com/openai/tiktoken) for their fast tokenizer implementation. You can download and install (or update to) the latest release of Whisper with the following command:

     pip install -U openai-whisper

diff --git a/requirements.txt b/requirements.txt
index 995977a..3c11ac3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,3 @@ torch
 tqdm
 more-itertools
 tiktoken==0.3.3
-ffmpeg-python==0.2.0

diff --git a/whisper/audio.py b/whisper/audio.py
index 513ab7c..4f5b6e0 100644
--- a/whisper/audio.py
+++ b/whisper/audio.py
@@ -1,8 +1,8 @@
 import os
 from functools import lru_cache
+from subprocess import CalledProcessError, run
 from typing import Optional, Union

-import ffmpeg
 import numpy as np
 import torch
 import torch.nn.functional as F
@@ -39,15 +39,25 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
     -------
     A NumPy array containing the audio waveform, in float32 dtype.
     """
+
+    # This launches a subprocess to decode audio while down-mixing
+    # and resampling as necessary. Requires the ffmpeg CLI in PATH.
+    # fmt: off
+    cmd = [
+        "ffmpeg",
+        "-nostdin",
+        "-threads", "0",
+        "-i", file,
+        "-f", "s16le",
+        "-ac", "1",
+        "-acodec", "pcm_s16le",
+        "-ar", str(sr),
+        "-"
+    ]
+    # fmt: on
     try:
-        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
-        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
-        out, _ = (
-            ffmpeg.input(file, threads=0)
-            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
-            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
-        )
-    except ffmpeg.Error as e:
+        out = run(cmd, capture_output=True, check=True).stdout
+    except CalledProcessError as e:
         raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

     return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0

From 55237228425e39828bbb964fd7bf774c9962eb67 Mon Sep 17 00:00:00 2001
From: petterreinholdtsen
Date: Thu, 4 May 2023 19:58:56 +0200
Subject: [PATCH 12/45] Dropped unused execute bit from mel_filters.npz. (#1254)

---
 whisper/assets/mel_filters.npz | Bin
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100755 => 100644 whisper/assets/mel_filters.npz

diff --git a/whisper/assets/mel_filters.npz b/whisper/assets/mel_filters.npz
old mode 100755
new mode 100644

From e334ff141d5444fbf6904edaaf408e5b0b416fe8 Mon Sep 17 00:00:00 2001
From: Théo BOYER
Date: Fri, 5 May 2023 02:02:36 +0200
Subject: [PATCH 13/45] Avoid computing higher temperatures on no_speech segments (#1279)

* Avoid computing higher temperatures on no_speech

In decode_with_fallback, we compute higher temperatures in the case where compression_ratio is too high or avg_logprob is too low.
But as the computation of no_speech_prob doesn't depend on sampling, we can avoid computing higher temperatures if we detect in the first one that the no_speech condition is fulfilled.

* Update transcribe.py

---------

Co-authored-by: Jong Wook Kim
---
 whisper/transcribe.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index 84feb12..cba59ec 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -174,7 +174,11 @@ def transcribe(
                 and decode_result.avg_logprob < logprob_threshold
             ):
                 needs_fallback = True  # average log probability is too low
-
+            if (
+                no_speech_threshold is not None
+                and decode_result.no_speech_prob > no_speech_threshold
+            ):
+                needs_fallback = False  # silence
             if not needs_fallback:
                 break

From b1c0815c7959fed6e9e1840799c0a7eae8db095c Mon Sep 17 00:00:00 2001
From: Brett Balquist <113616657+brett-b112@users.noreply.github.com>
Date: Fri, 5 May 2023 01:47:45 -0500
Subject: [PATCH 14/45] Updated README.md to provide more insight on BLEU and specific appendices (#1236)

* Updated README.md to provide more insight on BLEU and specific appendices in the research paper

* Update README.md

---------

Co-authored-by: Jong Wook Kim
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b4d3998..2053257 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ There are five model sizes, four with English-only versions, offering speed and

 The `.en` models for English-only applications tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models.

-Whisper's performance varies widely depending on the language. The figure below shows a WER (Word Error Rate) breakdown by languages of the Fleurs dataset using the `large-v2` model. More WER and BLEU scores corresponding to the other models and datasets can be found in Appendix D in [the paper](https://arxiv.org/abs/2212.04356). The smaller, the better.
+Whisper's performance varies widely depending on the language. The figure below shows a WER (Word Error Rate) breakdown by languages of the Fleurs dataset using the `large-v2` model (the smaller the numbers, the better the performance). Additional WER scores corresponding to the other models and datasets can be found in Appendix D.1, D.2, and D.4. Meanwhile, more BLEU (Bilingual Evaluation Understudy) scores can be found in Appendix D.3. Both are found in [the paper](https://arxiv.org/abs/2212.04356).
![WER breakdown by language](https://raw.githubusercontent.com/openai/whisper/main/language-breakdown.svg)

From 7ca9fbea86987a11266c0a6205bc284c089e5fbc Mon Sep 17 00:00:00 2001
From: Paul Willot
Date: Fri, 5 May 2023 08:48:06 +0200
Subject: [PATCH 15/45] Fix numba deprecation notice (#1233)

From numba 0.57, a warning is raised if `nopython` is not supplied:
https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
---
 whisper/timing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whisper/timing.py b/whisper/timing.py
index 055ccc0..1a73eaa 100644
--- a/whisper/timing.py
+++ b/whisper/timing.py
@@ -54,7 +54,7 @@ def median_filter(x: torch.Tensor, filter_width: int):
     return result

-@numba.jit
+@numba.jit(nopython=True)
 def backtrace(trace: np.ndarray):
     i = trace.shape[0] - 1
     j = trace.shape[1] - 1

From 248b6cb124225dd263bb9bd32d060b6517e067f8 Mon Sep 17 00:00:00 2001
From: Valentin Berkes <16121857+funboarder13920@users.noreply.github.com>
Date: Fri, 5 May 2023 09:31:35 +0200
Subject: [PATCH 16/45] fix condition_on_previous_text (#1224)

prompt_reset_since is set before all_tokens is extended, and hence does not have the expected effect.
---
 whisper/transcribe.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index cba59ec..ff73a55 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -312,10 +312,6 @@ def transcribe(
         )
         seek += segment_size

-        if not condition_on_previous_text or result.temperature > 0.5:
-            # do not feed the prompt tokens if a high temperature was used
-            prompt_reset_since = len(all_tokens)
-
         if word_timestamps:
             add_word_timestamps(
                 segments=current_segments,
@@ -361,6 +357,10 @@ def transcribe(
             [token for segment in current_segments for token in segment["tokens"]]
         )

+        if not condition_on_previous_text or result.temperature > 0.5:
+            # do not feed the prompt tokens if a high temperature was used
+            prompt_reset_since = len(all_tokens)
+
         # update progress bar
         pbar.update(min(content_frames, seek) - previous_seek)

From f572f2161ba831bae131364c3bffdead7af6d210 Mon Sep 17 00:00:00 2001
From: ryanheise
Date: Fri, 30 Jun 2023 09:51:24 +1000
Subject: [PATCH 17/45] Improve timestamp heuristics. (#1461)

* Improve timestamp heuristics.

* Track pauses with last_speech_timestamp
---
 whisper/timing.py     | 84 ++++++++++++++++++++++++++++---------------
 whisper/transcribe.py |  4 +++
 2 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/whisper/timing.py b/whisper/timing.py
index 1a73eaa..207d877 100644
--- a/whisper/timing.py
+++ b/whisper/timing.py
@@ -225,28 +225,6 @@ def find_alignment(
         for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
     ]

-    # hack: truncate long words at the start of a window and the start of a sentence.
-    # a better segmentation algorithm based on VAD should be able to replace this.
-    word_durations = end_times - start_times
-    word_durations = word_durations[word_durations.nonzero()]
-    if len(word_durations) > 0:
-        median_duration = np.median(word_durations)
-        max_duration = median_duration * 2
-        sentence_end_marks = ".。!!??"
-        # ensure words at sentence boundaries are not longer than twice the median word duration.
- for i in range(1, len(start_times)): - if end_times[i] - start_times[i] > max_duration: - if words[i] in sentence_end_marks: - end_times[i] = start_times[i] + max_duration - elif words[i - 1] in sentence_end_marks: - start_times[i] = end_times[i] - max_duration - # ensure the first and second word is not longer than twice the median word duration. - if len(start_times) > 0 and end_times[0] - start_times[0] > max_duration: - if len(start_times) > 1 and end_times[1] - start_times[1] > max_duration: - boundary = max(end_times[1] / 2, end_times[1] - max_duration) - end_times[0] = start_times[1] = boundary - start_times[0] = max(0, end_times[0] - max_duration) - return [ WordTiming(word, tokens, start, end, probability) for word, tokens, start, end, probability in zip( @@ -298,6 +276,7 @@ def add_word_timestamps( num_frames: int, prepend_punctuations: str = "\"'“¿([{-", append_punctuations: str = "\"'.。,,!!??::”)]}、", + last_speech_timestamp: float, **kwargs, ): if len(segments) == 0: @@ -310,6 +289,25 @@ def add_word_timestamps( text_tokens = list(itertools.chain.from_iterable(text_tokens_per_segment)) alignment = find_alignment(model, tokenizer, text_tokens, mel, num_frames, **kwargs) + word_durations = np.array([t.end - t.start for t in alignment]) + word_durations = word_durations[word_durations.nonzero()] + median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0 + max_duration = median_duration * 2 + + # hack: truncate long words at sentence boundaries. + # a better segmentation algorithm based on VAD should be able to replace this. + if len(word_durations) > 0: + median_duration = np.median(word_durations) + max_duration = median_duration * 2 + sentence_end_marks = ".。!!??" + # ensure words at sentence boundaries are not longer than twice the median word duration. + for i in range(1, len(alignment)): + if alignment[i].end - alignment[i].start > max_duration: + if alignment[i].word in sentence_end_marks: + alignment[i].end = alignment[i].start + max_duration + elif alignment[i - 1].word in sentence_end_marks: + alignment[i].start = alignment[i].end - max_duration + merge_punctuations(alignment, prepend_punctuations, append_punctuations) time_offset = segments[0]["seek"] * HOP_LENGTH / SAMPLE_RATE @@ -335,18 +333,48 @@ def add_word_timestamps( saved_tokens += len(timing.tokens) word_index += 1 + # hack: truncate long words at segment boundaries. + # a better segmentation algorithm based on VAD should be able to replace this. if len(words) > 0: - segment["start"] = words[0]["start"] - # hack: prefer the segment-level end timestamp if the last word is too long. - # a better segmentation algorithm based on VAD should be able to replace this. + # ensure the first and second word after a pause is not longer than + # twice the median word duration. + if words[0]["end"] - last_speech_timestamp > median_duration * 4 and ( + words[0]["end"] - words[0]["start"] > max_duration + or ( + len(words) > 1 + and words[1]["end"] - words[0]["start"] > max_duration * 2 + ) + ): + if ( + len(words) > 1 + and words[1]["end"] - words[1]["start"] > max_duration + ): + boundary = max(words[1]["end"] / 2, words[1]["end"] - max_duration) + words[0]["end"] = words[1]["start"] = boundary + words[0]["start"] = max(0, words[0]["end"] - max_duration) + + # prefer the segment-level start timestamp if the first word is too long. 
+ if ( + segment["start"] < words[0]["end"] + and segment["start"] - 0.5 > words[0]["start"] + ): + words[0]["start"] = max( + 0, min(words[0]["end"] - median_duration, segment["start"]) + ) + else: + segment["start"] = words[0]["start"] + + # prefer the segment-level end timestamp if the last word is too long. if ( segment["end"] > words[-1]["start"] and segment["end"] + 0.5 < words[-1]["end"] ): - # adjust the word-level timestamps based on the segment-level timestamps - words[-1]["end"] = segment["end"] + words[-1]["end"] = max( + words[-1]["start"] + median_duration, segment["end"] + ) else: - # adjust the segment-level timestamps based on the word-level timestamps segment["end"] = words[-1]["end"] + last_speech_timestamp = segment["end"] + segment["words"] = words diff --git a/whisper/transcribe.py b/whisper/transcribe.py index ff73a55..6e43a22 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -222,6 +222,7 @@ def transcribe( with tqdm.tqdm( total=content_frames, unit="frames", disable=verbose is not False ) as pbar: + last_speech_timestamp = 0.0 while seek < content_frames: time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE) mel_segment = mel[:, seek : seek + N_FRAMES] @@ -321,10 +322,13 @@ def transcribe( num_frames=segment_size, prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, + last_speech_timestamp=last_speech_timestamp, ) word_end_timestamps = [ w["end"] for s in current_segments for w in s["words"] ] + if len(word_end_timestamps) > 0: + last_speech_timestamp = word_end_timestamps[-1] if not single_timestamp_ending and len(word_end_timestamps) > 0: seek_shift = round( (word_end_timestamps[-1] - time_offset) * FRAMES_PER_SECOND From b91c907694f96a3fb9da03d4bbdc83fbcd3a40a4 Mon Sep 17 00:00:00 2001 From: WangChou Lu Date: Fri, 7 Jul 2023 03:48:08 +0800 Subject: [PATCH 18/45] Avoid rearranging all caches (#1483) * avoid rearranging all kv_caches * avoid calculating the same kv_cache from cross attn * Update decoding.py * linter fix --------- Co-authored-by: Jong Wook Kim --- whisper/decoding.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/whisper/decoding.py b/whisper/decoding.py index 457ee7c..ecd98a4 100644 --- a/whisper/decoding.py +++ b/whisper/decoding.py @@ -146,6 +146,10 @@ class PyTorchInference(Inference): self.kv_cache = {} self.hooks = [] + key_modules = [block.attn.key for block in self.model.decoder.blocks] + value_modules = [block.attn.value for block in self.model.decoder.blocks] + self.kv_modules = key_modules + value_modules + def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor: if not self.kv_cache: self.kv_cache, self.hooks = self.model.install_kv_cache_hooks() @@ -164,9 +168,10 @@ class PyTorchInference(Inference): self.hooks = [] def rearrange_kv_cache(self, source_indices): - for module, tensor in self.kv_cache.items(): - # update the key/value cache to contain the selected sequences - self.kv_cache[module] = tensor[source_indices].detach() + if source_indices != list(range(len(source_indices))): + for module in self.kv_modules: + # update the key/value cache to contain the selected sequences + self.kv_cache[module] = self.kv_cache[module][source_indices].detach() class SequenceRanker: @@ -668,7 +673,6 @@ class DecodingTask: return languages, lang_probs def _main_loop(self, audio_features: Tensor, tokens: Tensor): - assert audio_features.shape[0] == tokens.shape[0] n_batch = tokens.shape[0] sum_logprobs: Tensor = torch.zeros(n_batch, 
device=audio_features.device) no_speech_probs = [np.nan] * n_batch @@ -721,8 +725,7 @@ class DecodingTask: ) ] - # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling - audio_features = audio_features.repeat_interleave(self.n_group, dim=0) + # repeat text tensors by the group size, for beam search or best-of-n sampling tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device) # call the main sampling loop From e8622f9afc4eba139bf796c210f5c01081000472 Mon Sep 17 00:00:00 2001 From: taylorchu Date: Mon, 7 Aug 2023 14:48:56 -0700 Subject: [PATCH 19/45] word timing tweaks (#1559) * word timing tweaks * comment on eot * clearer comments --- whisper/timing.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/whisper/timing.py b/whisper/timing.py index 207d877..56e84d4 100644 --- a/whisper/timing.py +++ b/whisper/timing.py @@ -214,6 +214,13 @@ def find_alignment( text_indices, time_indices = dtw(-matrix) words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot]) + if len(word_tokens) <= 1: + # return on eot only + # >>> np.pad([], (1, 0)) + # array([0.]) + # This results in crashes when we lookup jump_times with float, like + # IndexError: arrays used as indices must be of integer (or boolean) type + return [] word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0)) jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool) @@ -297,8 +304,6 @@ def add_word_timestamps( # hack: truncate long words at sentence boundaries. # a better segmentation algorithm based on VAD should be able to replace this. if len(word_durations) > 0: - median_duration = np.median(word_durations) - max_duration = median_duration * 2 sentence_end_marks = ".。!!??" # ensure words at sentence boundaries are not longer than twice the median word duration. for i in range(1, len(alignment)): From 29b7df62317180a18531e2acbc9c4f52fbf8f639 Mon Sep 17 00:00:00 2001 From: Nino Risteski <95188570+NinoRisteski@users.noreply.github.com> Date: Tue, 19 Sep 2023 00:59:49 +0200 Subject: [PATCH 20/45] Update model-card.md (#1643) fixed a few typos --- model-card.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/model-card.md b/model-card.md index 2ed85cf..b5a571a 100644 --- a/model-card.md +++ b/model-card.md @@ -37,7 +37,7 @@ Sequence-to-sequence ASR (automatic speech recognition) and speech translation m ### Evaluated Use -The primary intended users of these models are AI researchers studying robustness, generalization, capabilities, biases, and constraints of the current model. However, Whisper is also potentially quite useful as an ASR solution for developers, especially for English speech recognition. We recognize that once models are released, it is impossible to restrict access to only “intended” uses or to draw reasonable guidelines around what is or is not research. +The primary intended users of these models are AI researchers studying the robustness, generalization, capabilities, biases, and constraints of the current model. However, Whisper is also potentially quite useful as an ASR solution for developers, especially for English speech recognition. We recognize that once models are released, it is impossible to restrict access to only “intended” uses or to draw reasonable guidelines around what is or is not research. The models are primarily trained and evaluated on ASR and speech translation to English tasks. 
They show strong ASR results in ~10 languages. They may exhibit additional capabilities, particularly if fine-tuned on certain tasks like voice activity detection, speaker classification, or speaker diarization but have not been robustly evaluated in these areas. We strongly recommend that users perform robust evaluations of the models in a particular context and domain before deploying them. @@ -53,17 +53,17 @@ As discussed in [the accompanying paper](https://arxiv.org/abs/2212.04356), we s ## Performance and Limitations -Our studies show that, over many existing ASR systems, the models exhibit improved robustness to accents, background noise, technical language, as well as zero shot translation from multiple languages into English; and that accuracy on speech recognition and translation is near the state-of-the-art level. +Our studies show that, over many existing ASR systems, the models exhibit improved robustness to accents, background noise, and technical language, as well as zero-shot translation from multiple languages into English; and that accuracy on speech recognition and translation is near the state-of-the-art level. However, because the models are trained in a weakly supervised manner using large-scale noisy data, the predictions may include texts that are not actually spoken in the audio input (i.e. hallucination). We hypothesize that this happens because, given their general knowledge of language, the models combine trying to predict the next word in audio with trying to transcribe the audio itself. -Our models perform unevenly across languages, and we observe lower accuracy on low-resource and/or low-discoverability languages or languages where we have less training data. The models also exhibit disparate performance on different accents and dialects of particular languages, which may include higher word error rate across speakers of different genders, races, ages, or other demographic criteria. Our full evaluation results are presented in [the paper accompanying this release](https://arxiv.org/abs/2212.04356). +Our models perform unevenly across languages, and we observe lower accuracy on low-resource and/or low-discoverability languages or languages where we have less training data. The models also exhibit disparate performance on different accents and dialects of particular languages, which may include a higher word error rate across speakers of different genders, races, ages, or other demographic criteria. Our full evaluation results are presented in [the paper accompanying this release](https://arxiv.org/abs/2212.04356). -In addition, the sequence-to-sequence architecture of the model makes it prone to generating repetitive texts, which can be mitigated to some degree by beam search and temperature scheduling but not perfectly. Further analysis on these limitations are provided in [the paper](https://arxiv.org/abs/2212.04356). It is likely that this behavior and hallucinations may be worse on lower-resource and/or lower-discoverability languages. +In addition, the sequence-to-sequence architecture of the model makes it prone to generating repetitive texts, which can be mitigated to some degree by beam search and temperature scheduling but not perfectly. Further analysis of these limitations is provided in [the paper](https://arxiv.org/abs/2212.04356). It is likely that this behavior and hallucinations may be worse in lower-resource and/or lower-discoverability languages. 
## Broader Implications We anticipate that Whisper models’ transcription capabilities may be used for improving accessibility tools. While Whisper models cannot be used for real-time transcription out of the box – their speed and size suggest that others may be able to build applications on top of them that allow for near-real-time speech recognition and translation. The real value of beneficial applications built on top of Whisper models suggests that the disparate performance of these models may have real economic implications. -There are also potential dual use concerns that come with releasing Whisper. While we hope the technology will be used primarily for beneficial purposes, making ASR technology more accessible could enable more actors to build capable surveillance technologies or scale up existing surveillance efforts, as the speed and accuracy allow for affordable automatic transcription and translation of large volumes of audio communication. Moreover, these models may have some capabilities to recognize specific individuals out of the box, which in turn presents safety concerns related both to dual use and disparate performance. In practice, we expect that the cost of transcription is not the limiting factor of scaling up surveillance projects. +There are also potential dual-use concerns that come with releasing Whisper. While we hope the technology will be used primarily for beneficial purposes, making ASR technology more accessible could enable more actors to build capable surveillance technologies or scale up existing surveillance efforts, as the speed and accuracy allow for affordable automatic transcription and translation of large volumes of audio communication. Moreover, these models may have some capabilities to recognize specific individuals out of the box, which in turn presents safety concerns related both to dual use and disparate performance. In practice, we expect that the cost of transcription is not the limiting factor of scaling up surveillance projects. 
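The word-level timestamp machinery refined in the preceding patches (PATCH 17-19) is driven through the `word_timestamps` flag of `transcribe()`. The snippet below is a minimal sketch of that usage, not part of the patch series itself; the `"base"` model size and the `audio.mp3` filename are illustrative placeholders:

```python
import whisper

# a minimal sketch, assuming an "audio.mp3" file is available locally;
# any model size ("tiny", "base", "small", ...) works the same way
model = whisper.load_model("base")
result = model.transcribe("audio.mp3", word_timestamps=True)

for segment in result["segments"]:
    # with word_timestamps=True, each segment carries a "words" list whose
    # start/end times come from the alignment heuristics in whisper/timing.py
    for word in segment["words"]:
        print(f"[{word['start']:6.2f} -> {word['end']:6.2f}]{word['word']}")
```

Each `word` entry also carries a `probability` field populated by `find_alignment`, which the subtitle writers ignore but downstream tooling can consume.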
From 21010ef454fb25954b0914785180311fb077add9 Mon Sep 17 00:00:00 2001 From: sqhao Date: Tue, 19 Sep 2023 07:09:59 +0800 Subject: [PATCH 21/45] fix doc of TextDecoder (#1526) Signed-off-by: haoshengqiang Co-authored-by: haoshengqiang --- whisper/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whisper/model.py b/whisper/model.py index 3457fcf..6913002 100644 --- a/whisper/model.py +++ b/whisper/model.py @@ -197,7 +197,7 @@ class TextDecoder(nn.Module): """ x : torch.LongTensor, shape = (batch_size, <= n_ctx) the text tokens - xa : torch.Tensor, shape = (batch_size, n_mels, n_audio_ctx) + xa : torch.Tensor, shape = (batch_size, n_audio_ctx, n_audio_state) the encoded audio features to be attended on """ offset = next(iter(kv_cache.values())).shape[1] if kv_cache else 0 From 8b330df0961ff64499609d70f1ba7c6a8a2f9b1a Mon Sep 17 00:00:00 2001 From: Arthur Kim Date: Tue, 19 Sep 2023 08:15:33 +0900 Subject: [PATCH 22/45] Add .pre-commit-config.yaml (#1528) * Add .pre-commit-config.yaml Co-authored-by: arthur * flake8 E741 --------- Co-authored-by: Jong Wook Kim --- .github/workflows/test.yml | 33 ++++++++++++++++++++++++++++++--- .pre-commit-config.yaml | 28 ++++++++++++++++++++++++++++ whisper/timing.py | 2 +- whisper/tokenizer.py | 2 +- 4 files changed, 60 insertions(+), 5 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3796a39..0f9cfab 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -6,8 +6,38 @@ on: pull_request: branches: - main + jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Fetch base branch + run: git fetch origin ${{ github.base_ref }} + - uses: actions/setup-python@v4 + with: + python-version: "3.8" + architecture: x64 + - name: Get pip cache dir + id: pip-cache + run: | + echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT + - name: pip/pre-commit cache + uses: actions/cache@v3 + with: + path: | + ${{ steps.pip-cache.outputs.dir }} + ~/.cache/pre-commit + key: ${{ runner.os }}-pip-pre-commit-${{ hashFiles('**/.pre-commit-config.yaml') }} + restore-keys: | + ${{ runner.os }}-pip-pre-commit + - name: pre-commit + run: | + pip install -U pre-commit + pre-commit install --install-hooks + pre-commit run --from-ref=origin/${{ github.base_ref }} --to-ref=HEAD whisper-test: + needs: pre-commit runs-on: ubuntu-latest strategy: matrix: @@ -23,7 +53,4 @@ jobs: - uses: actions/checkout@v3 - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH - run: pip install .["dev"] - - run: black --check --diff -t py38 --include '(\.pyi?)$' . - - run: isort --check --diff . - - run: flake8 --ignore E203,W503,W504,E501,E731,E741 . 
- run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..3f5a74b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,28 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-json + - id: end-of-file-fixer + types: [file, python] + - id: trailing-whitespace + types: [file, python] + - id: mixed-line-ending + - id: check-added-large-files + args: [--maxkb=4096] + - repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + name: isort (python) + args: ["--profile", "black", "-l", "88", "--trailing-comma", "--multi-line", "3"] + - repo: https://github.com/pycqa/flake8.git + rev: 6.0.0 + hooks: + - id: flake8 + types: [python] + args: ["--max-line-length", "88", "--ignore", "E203,E501,W503,W504"] diff --git a/whisper/timing.py b/whisper/timing.py index 56e84d4..befcf46 100644 --- a/whisper/timing.py +++ b/whisper/timing.py @@ -202,7 +202,7 @@ def find_alignment( hook.remove() # heads * tokens * frames - weights = torch.stack([QKs[l][h] for l, h in model.alignment_heads.indices().T]) + weights = torch.stack([QKs[_l][_h] for _l, _h in model.alignment_heads.indices().T]) weights = weights[:, :, : num_frames // 2] weights = (weights * qk_scale).softmax(dim=-1) std, mean = torch.std_mean(weights, dim=-2, keepdim=True, unbiased=False) diff --git a/whisper/tokenizer.py b/whisper/tokenizer.py index 4030e15..3b23991 100644 --- a/whisper/tokenizer.py +++ b/whisper/tokenizer.py @@ -226,7 +226,7 @@ class Tokenizer: @cached_property def all_language_codes(self) -> Tuple[str]: - return tuple(self.decode([l]).strip("<|>") for l in self.all_language_tokens) + return tuple(self.decode([_l]).strip("<|>") for _l in self.all_language_tokens) @cached_property def sot_sequence_including_notimestamps(self) -> Tuple[int]: From 5f957da5ca9ab45a4ac149047e6a8004bc9d25bc Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 18 Sep 2023 16:38:17 -0700 Subject: [PATCH 23/45] Update test.yml --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0f9cfab..dffc17c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -35,7 +35,7 @@ jobs: run: | pip install -U pre-commit pre-commit install --install-hooks - pre-commit run --from-ref=origin/${{ github.base_ref }} --to-ref=HEAD + pre-commit run --all-files whisper-test: needs: pre-commit runs-on: ubuntu-latest From 0a60fcaa9b86748389a656aa013c416030287d47 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 18 Sep 2023 17:13:19 -0700 Subject: [PATCH 24/45] Release 20230918 --- CHANGELOG.md | 23 +++++++++++++++++++++++ whisper/version.py | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a77d966..50c0536 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,28 @@ # CHANGELOG +## [v20230918](https://github.com/openai/whisper/releases/tag/v20230918) + +* Add .pre-commit-config.yaml ([#1528](https://github.com/openai/whisper/pull/1528)) +* fix doc of TextDecoder ([#1526](https://github.com/openai/whisper/pull/1526)) +* Update model-card.md ([#1643](https://github.com/openai/whisper/pull/1643)) +* word timing tweaks ([#1559](https://github.com/openai/whisper/pull/1559)) +* Avoid 
rearranging all caches ([#1483](https://github.com/openai/whisper/pull/1483)) +* Improve timestamp heuristics. ([#1461](https://github.com/openai/whisper/pull/1461)) +* fix condition_on_previous_text ([#1224](https://github.com/openai/whisper/pull/1224)) +* Fix numba depreceation notice ([#1233](https://github.com/openai/whisper/pull/1233)) +* Updated README.md to provide more insight on BLEU and specific appendices ([#1236](https://github.com/openai/whisper/pull/1236)) +* Avoid computing higher temperatures on no_speech segments ([#1279](https://github.com/openai/whisper/pull/1279)) +* Dropped unused execute bit from mel_filters.npz. ([#1254](https://github.com/openai/whisper/pull/1254)) +* Drop ffmpeg-python dependency and call ffmpeg directly. ([#1242](https://github.com/openai/whisper/pull/1242)) +* Python 3.11 ([#1171](https://github.com/openai/whisper/pull/1171)) +* Update decoding.py ([#1219](https://github.com/openai/whisper/pull/1219)) +* Update decoding.py ([#1155](https://github.com/openai/whisper/pull/1155)) +* Update README.md to reference tiktoken ([#1105](https://github.com/openai/whisper/pull/1105)) +* Implement max line width and max line count, and make word highlighting optional ([#1184](https://github.com/openai/whisper/pull/1184)) +* Squash long words at window and sentence boundaries. ([#1114](https://github.com/openai/whisper/pull/1114)) +* python-publish.yml: bump actions version to fix node warning ([#1211](https://github.com/openai/whisper/pull/1211)) +* Update tokenizer.py ([#1163](https://github.com/openai/whisper/pull/1163)) + ## [v20230314](https://github.com/openai/whisper/releases/tag/v20230314) * abort find_alignment on empty input ([#1090](https://github.com/openai/whisper/pull/1090)) diff --git a/whisper/version.py b/whisper/version.py index 572259a..c43bf6f 100644 --- a/whisper/version.py +++ b/whisper/version.py @@ -1 +1 @@ -__version__ = "20230314" +__version__ = "20230918" From b38a1f20f4b23f3f3099af2c3e0ca95627276ddf Mon Sep 17 00:00:00 2001 From: Jordi Mas Date: Tue, 10 Oct 2023 19:01:01 +0200 Subject: [PATCH 25/45] Fix exception when an audio file with no speech is provided (#1396) Co-authored-by: Jong Wook Kim --- whisper/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whisper/utils.py b/whisper/utils.py index ba5a10c..22260d0 100644 --- a/whisper/utils.py +++ b/whisper/utils.py @@ -145,7 +145,7 @@ class SubtitlesWriter(ResultWriter): if len(subtitle) > 0: yield subtitle - if "words" in result["segments"][0]: + if len(result["segments"]) > 0 and "words" in result["segments"][0]: for subtitle in iterate_subtitles(): subtitle_start = self.format_timestamp(subtitle[0]["start"]) subtitle_end = self.format_timestamp(subtitle[-1]["end"]) From 6ed314fe4109eed40b79500eaf465935905e1aa7 Mon Sep 17 00:00:00 2001 From: amosal Date: Mon, 6 Nov 2023 10:49:33 +0100 Subject: [PATCH 26/45] Add new option to generate subtitles by a specific number of words (#1729) * ADD parser for new argument --max_words_count * ADD max_words_count in words_options ADD warning for max_line_width compatibility * ADD logic for max_words_count * rename to max_words_per_line * make them kwargs * allow specifying file path by --model * black formatting --------- Co-authored-by: Jong Wook Kim --- whisper/transcribe.py | 21 ++++++- whisper/utils.py | 132 ++++++++++++++++++++++++++++-------------- 2 files changed, 106 insertions(+), 47 deletions(-) diff --git a/whisper/transcribe.py b/whisper/transcribe.py index 6e43a22..509e322 100644 --- 
a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -378,10 +378,17 @@ def transcribe( def cli(): from . import available_models + def valid_model_name(name): + if name in available_models() or os.path.exists(name): + return name + raise ValueError( + f"model should be one of {available_models()} or path to a model checkpoint" + ) + # fmt: off parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe") - parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use") + parser.add_argument("--model", default="small", type=valid_model_name, help="name of the Whisper model to use") parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default") parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference") parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs") @@ -412,6 +419,7 @@ def cli(): parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt") parser.add_argument("--max_line_width", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of characters in a line before breaking the line") parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment") + parser.add_argument("--max_words_per_line", type=optional_int, default=None, help="(requires --word_timestamps True, no effect with --max_line_width) the maximum number of words in a segment") parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS") # fmt: on @@ -444,17 +452,24 @@ def cli(): model = load_model(model_name, device=device, download_root=model_dir) writer = get_writer(output_format, output_dir) - word_options = ["highlight_words", "max_line_count", "max_line_width"] + word_options = [ + "highlight_words", + "max_line_count", + "max_line_width", + "max_words_per_line", + ] if not args["word_timestamps"]: for option in word_options: if args[option]: parser.error(f"--{option} requires --word_timestamps True") if args["max_line_count"] and not args["max_line_width"]: warnings.warn("--max_line_count has no effect without --max_line_width") + if args["max_words_per_line"] and args["max_line_width"]: + warnings.warn("--max_words_per_line has no effect with --max_line_width") writer_args = {arg: args.pop(arg) for arg in word_options} for audio_path in args.pop("audio"): result = transcribe(model, audio_path, temperature=temperature, **args) - writer(result, audio_path, writer_args) + writer(result, audio_path, **writer_args) if __name__ == "__main__": diff --git a/whisper/utils.py b/whisper/utils.py index 22260d0..7a172c4 100644 --- a/whisper/utils.py +++ b/whisper/utils.py @@ -74,7 +74,9 @@ class ResultWriter: def __init__(self, output_dir: str): self.output_dir = output_dir - def __call__(self, result: dict, audio_path: str, options: dict): + def __call__( + self, result: dict, audio_path: str, options: Optional[dict] = None, **kwargs + ): audio_basename = os.path.basename(audio_path) audio_basename = os.path.splitext(audio_basename)[0] output_path = 
os.path.join( @@ -82,16 +84,20 @@ class ResultWriter: ) with open(output_path, "w", encoding="utf-8") as f: - self.write_result(result, file=f, options=options) + self.write_result(result, file=f, options=options, **kwargs) - def write_result(self, result: dict, file: TextIO, options: dict): + def write_result( + self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs + ): raise NotImplementedError class WriteTXT(ResultWriter): extension: str = "txt" - def write_result(self, result: dict, file: TextIO, options: dict): + def write_result( + self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs + ): for segment in result["segments"]: print(segment["text"].strip(), file=file, flush=True) @@ -100,12 +106,24 @@ class SubtitlesWriter(ResultWriter): always_include_hours: bool decimal_marker: str - def iterate_result(self, result: dict, options: dict): - raw_max_line_width: Optional[int] = options["max_line_width"] - max_line_count: Optional[int] = options["max_line_count"] - highlight_words: bool = options["highlight_words"] - max_line_width = 1000 if raw_max_line_width is None else raw_max_line_width - preserve_segments = max_line_count is None or raw_max_line_width is None + def iterate_result( + self, + result: dict, + options: Optional[dict] = None, + *, + max_line_width: Optional[int] = None, + max_line_count: Optional[int] = None, + highlight_words: bool = False, + max_words_per_line: Optional[int] = None, + ): + options = options or {} + max_line_width = max_line_width or options.get("max_line_width") + max_line_count = max_line_count or options.get("max_line_count") + highlight_words = highlight_words or options.get("highlight_words", False) + max_words_per_line = max_words_per_line or options.get("max_words_per_line") + preserve_segments = max_line_count is None or max_line_width is None + max_line_width = max_line_width or 1000 + max_words_per_line = max_words_per_line or 1000 def iterate_subtitles(): line_len = 0 @@ -114,34 +132,50 @@ class SubtitlesWriter(ResultWriter): subtitle: list[dict] = [] last = result["segments"][0]["words"][0]["start"] for segment in result["segments"]: - for i, original_timing in enumerate(segment["words"]): - timing = original_timing.copy() - long_pause = not preserve_segments and timing["start"] - last > 3.0 - has_room = line_len + len(timing["word"]) <= max_line_width - seg_break = i == 0 and len(subtitle) > 0 and preserve_segments - if line_len > 0 and has_room and not long_pause and not seg_break: - # line continuation - line_len += len(timing["word"]) - else: - # new line - timing["word"] = timing["word"].strip() + chunk_index = 0 + words_count = max_words_per_line + while chunk_index < len(segment["words"]): + remaining_words = len(segment["words"]) - chunk_index + if max_words_per_line > len(segment["words"]) - chunk_index: + words_count = remaining_words + for i, original_timing in enumerate( + segment["words"][chunk_index : chunk_index + words_count] + ): + timing = original_timing.copy() + long_pause = ( + not preserve_segments and timing["start"] - last > 3.0 + ) + has_room = line_len + len(timing["word"]) <= max_line_width + seg_break = i == 0 and len(subtitle) > 0 and preserve_segments if ( - len(subtitle) > 0 - and max_line_count is not None - and (long_pause or line_count >= max_line_count) - or seg_break + line_len > 0 + and has_room + and not long_pause + and not seg_break ): - # subtitle break - yield subtitle - subtitle = [] - line_count = 1 - elif line_len > 0: - # line break - line_count += 1 - 
timing["word"] = "\n" + timing["word"] - line_len = len(timing["word"].strip()) - subtitle.append(timing) - last = timing["start"] + # line continuation + line_len += len(timing["word"]) + else: + # new line + timing["word"] = timing["word"].strip() + if ( + len(subtitle) > 0 + and max_line_count is not None + and (long_pause or line_count >= max_line_count) + or seg_break + ): + # subtitle break + yield subtitle + subtitle = [] + line_count = 1 + elif line_len > 0: + # line break + line_count += 1 + timing["word"] = "\n" + timing["word"] + line_len = len(timing["word"].strip()) + subtitle.append(timing) + last = timing["start"] + chunk_index += max_words_per_line if len(subtitle) > 0: yield subtitle @@ -190,9 +224,11 @@ class WriteVTT(SubtitlesWriter): always_include_hours: bool = False decimal_marker: str = "." - def write_result(self, result: dict, file: TextIO, options: dict): + def write_result( + self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs + ): print("WEBVTT\n", file=file) - for start, end, text in self.iterate_result(result, options): + for start, end, text in self.iterate_result(result, options, **kwargs): print(f"{start} --> {end}\n{text}\n", file=file, flush=True) @@ -201,9 +237,11 @@ class WriteSRT(SubtitlesWriter): always_include_hours: bool = True decimal_marker: str = "," - def write_result(self, result: dict, file: TextIO, options: dict): + def write_result( + self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs + ): for i, (start, end, text) in enumerate( - self.iterate_result(result, options), start=1 + self.iterate_result(result, options, **kwargs), start=1 ): print(f"{i}\n{start} --> {end}\n{text}\n", file=file, flush=True) @@ -220,7 +258,9 @@ class WriteTSV(ResultWriter): extension: str = "tsv" - def write_result(self, result: dict, file: TextIO, options: dict): + def write_result( + self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs + ): print("start", "end", "text", sep="\t", file=file) for segment in result["segments"]: print(round(1000 * segment["start"]), file=file, end="\t") @@ -231,7 +271,9 @@ class WriteTSV(ResultWriter): class WriteJSON(ResultWriter): extension: str = "json" - def write_result(self, result: dict, file: TextIO, options: dict): + def write_result( + self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs + ): json.dump(result, file) @@ -249,9 +291,11 @@ def get_writer( if output_format == "all": all_writers = [writer(output_dir) for writer in writers.values()] - def write_all(result: dict, file: TextIO, options: dict): + def write_all( + result: dict, file: TextIO, options: Optional[dict] = None, **kwargs + ): for writer in all_writers: - writer(result, file, options) + writer(result, file, options, **kwargs) return write_all From b7d277acd59c19edab3c75b8bf362ddd27fddcc7 Mon Sep 17 00:00:00 2001 From: Marco Zucconelli <16992603+zuccon@users.noreply.github.com> Date: Mon, 6 Nov 2023 11:06:19 +0100 Subject: [PATCH 27/45] handling transcribe exceptions. (#1682) * handling transcribe() exceptions. 
* printing stacktrace --------- Co-authored-by: invalid Co-authored-by: Jong Wook Kim Co-authored-by: Jong Wook Kim --- whisper/transcribe.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/whisper/transcribe.py b/whisper/transcribe.py index 509e322..d5b3d43 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -1,5 +1,6 @@ import argparse import os +import traceback import warnings from typing import TYPE_CHECKING, Optional, Tuple, Union @@ -468,8 +469,12 @@ def cli(): warnings.warn("--max_words_per_line has no effect with --max_line_width") writer_args = {arg: args.pop(arg) for arg in word_options} for audio_path in args.pop("audio"): - result = transcribe(model, audio_path, temperature=temperature, **args) - writer(result, audio_path, **writer_args) + try: + result = transcribe(model, audio_path, temperature=temperature, **args) + writer(result, audio_path, **writer_args) + except Exception as e: + traceback.print_exc() + print(f"Skipping {audio_path} due to {type(e).__name__}: {str(e)}") if __name__ == "__main__": From 7dfcd56304a7025f949a3d69bd38eb0916622453 Mon Sep 17 00:00:00 2001 From: Mohamad Zamini <32536264+mzamini92@users.noreply.github.com> Date: Mon, 6 Nov 2023 03:28:51 -0700 Subject: [PATCH 28/45] allow_pickle=False while loading of mel matrix IN audio.py (#1511) * Update audio.py The `mel_filters` function is using a `np.load` function to load a pre-computed mel filterbank matrix. This function is not thread-safe, which means that if it is called from multiple threads at the same time, it may corrupt the data. To fix this, you can use the `torch.load` function instead. This function is thread-safe, so it will not corrupt the data if it is called from multiple threads at the same time. * Update audio.py updated the docstring * allow_pickle=False * newline --------- Co-authored-by: Jong Wook Kim Co-authored-by: Jong Wook Kim --- whisper/audio.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/whisper/audio.py b/whisper/audio.py index 4f5b6e0..f959e1c 100644 --- a/whisper/audio.py +++ b/whisper/audio.py @@ -101,9 +101,9 @@ def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor: ) """ assert n_mels == 80, f"Unsupported n_mels: {n_mels}" - with np.load( - os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz") - ) as f: + + filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz") + with np.load(filters_path, allow_pickle=False) as f: return torch.from_numpy(f[f"mel_{n_mels}"]).to(device) From b9f17e1f2da9fe735910c94593dba8d7c25f2a4a Mon Sep 17 00:00:00 2001 From: Philippe Hebert Date: Mon, 6 Nov 2023 05:43:07 -0500 Subject: [PATCH 29/45] docs: Disambiguation of the term "relative speed" in the README (#1751) * docs: defines relative speed in README * combined paragraphs --------- Co-authored-by: Jong Wook Kim --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 2053257..3dc26c6 100644 --- a/README.md +++ b/README.md @@ -57,8 +57,7 @@ pip install setuptools-rust ## Available models and languages -There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. Below are the names of the available models and their approximate memory requirements and relative speed. - +There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. 
Below are the names of the available models and their approximate memory requirements and inference speed relative to the large model; actual speed may vary depending on many factors including the available hardware. | Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed | |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:| From 746aaaeafa4ae746f97284ff43e3abe9af835b1d Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 6 Nov 2023 03:05:21 -0800 Subject: [PATCH 30/45] remove tiktoken pin (#1759) --- requirements.txt | 2 +- tests/test_tokenizer.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3c11ac3..a03dae8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,4 @@ numpy torch tqdm more-itertools -tiktoken==0.3.3 +tiktoken diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py index 09d0351..be424e5 100644 --- a/tests/test_tokenizer.py +++ b/tests/test_tokenizer.py @@ -1,7 +1,17 @@ +import pytest + from whisper.tokenizer import get_tokenizer -def test_tokenizer(): +@pytest.mark.parametrize("multilingual", [True, False]) +def test_tokenizer(multilingual): + tokenizer = get_tokenizer(multilingual=False) + assert tokenizer.sot in tokenizer.sot_sequence + assert len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens) + assert all(c < tokenizer.timestamp_begin for c in tokenizer.all_language_tokens) + + +def test_multilingual_tokenizer(): gpt2_tokenizer = get_tokenizer(multilingual=False) multilingual_tokenizer = get_tokenizer(multilingual=True) @@ -20,5 +30,5 @@ def test_split_on_unicode(): tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378] words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens) - assert words == [" elle", " est", " l", "'", "�", "é", "rit", "oire"] + assert words == [" elle", " est", " l", "'", "\ufffd", "é", "rit", "oire"] assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]] From f6f01c561c45ad6ab421405e18ae22fd0c698e92 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 6 Nov 2023 03:08:56 -0800 Subject: [PATCH 31/45] Release 20231105 --- CHANGELOG.md | 9 +++++++++ whisper/version.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 50c0536..ea73cf1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # CHANGELOG +## [v20231105](https://github.com/openai/whisper/releases/tag/v20231105) + +* remove tiktoken pin ([#1759](https://github.com/openai/whisper/pull/1759)) +* docs: Disambiguation of the term "relative speed" in the README ([#1751](https://github.com/openai/whisper/pull/1751)) +* allow_pickle=False while loading of mel matrix IN audio.py ([#1511](https://github.com/openai/whisper/pull/1511)) +* handling transcribe exceptions. 
([#1682](https://github.com/openai/whisper/pull/1682)) +* Add new option to generate subtitles by a specific number of words ([#1729](https://github.com/openai/whisper/pull/1729)) +* Fix exception when an audio file with no speech is provided ([#1396](https://github.com/openai/whisper/pull/1396)) + ## [v20230918](https://github.com/openai/whisper/releases/tag/v20230918) * Add .pre-commit-config.yaml ([#1528](https://github.com/openai/whisper/pull/1528)) diff --git a/whisper/version.py b/whisper/version.py index c43bf6f..cb85315 100644 --- a/whisper/version.py +++ b/whisper/version.py @@ -1 +1 @@ -__version__ = "20230918" +__version__ = "20231105" From c5d42560760a05584c1c79546a098287e5a771eb Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 6 Nov 2023 10:10:30 -0800 Subject: [PATCH 32/45] large-v3 (#1761) * mel_filters() loads 128 mel bins * can load 100-language models * large-v3 checkpoint and evals * add mandarin alias * remove unused path * flake8 fix * formatting fix --- README.md | 4 +- language-breakdown.svg | 8998 ++++++++++++++++++++++++-------- model-card.md | 4 +- tests/test_transcribe.py | 2 +- whisper/__init__.py | 6 +- whisper/assets/mel_filters.npz | Bin 2048 -> 4271 bytes whisper/audio.py | 8 +- whisper/decoding.py | 9 +- whisper/model.py | 9 +- whisper/tokenizer.py | 27 +- whisper/transcribe.py | 9 +- 11 files changed, 6993 insertions(+), 2083 deletions(-) diff --git a/README.md b/README.md index 3dc26c6..afca9c9 100644 --- a/README.md +++ b/README.md @@ -69,9 +69,9 @@ There are five model sizes, four with English-only versions, offering speed and The `.en` models for English-only applications tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models. -Whisper's performance varies widely depending on the language. The figure below shows a WER (Word Error Rate) breakdown by languages of the Fleurs dataset using the `large-v2` model (The smaller the numbers, the better the performance). Additional WER scores corresponding to the other models and datasets can be found in Appendix D.1, D.2, and D.4. Meanwhile, more BLEU (Bilingual Evaluation Understudy) scores can be found in Appendix D.3. Both are found in [the paper](https://arxiv.org/abs/2212.04356). +Whisper's performance varies widely depending on the language. The figure below shows a performance breakdown of `large-v3` and `large-v2` models by language, using WERs (word error rates) or CER (character error rates, shown in *Italic*) evaluated on the Common Voice 15 and Fleurs datasets. Additional WER/CER metrics corresponding to the other models and datasets can be found in Appendix D.1, D.2, and D.4 of [the paper](https://arxiv.org/abs/2212.04356), as well as the BLEU (Bilingual Evaluation Understudy) scores for translation in Appendix D.3. 
-![WER breakdown by language](https://raw.githubusercontent.com/openai/whisper/main/language-breakdown.svg)
+![WER breakdown by language](https://github.com/openai/whisper/assets/266841/f4619d66-1058-4005-8f67-a9d811b77c62)
diff --git a/language-breakdown.svg b/language-breakdown.svg
index 49a0653..616fd57 100644
--- a/language-breakdown.svg
+++ b/language-breakdown.svg
[SVG diff omitted: 8,998 lines changed per the diffstat above. The markup did not survive extraction; the recoverable metadata shows the figure was regenerated on 2023-11-06T12:34:22 with Matplotlib v3.7.3 (https://matplotlib.org/), replacing the 2022-12-03T03:56:51 render made with Matplotlib v3.5.1.]
diff --git a/model-card.md b/model-card.md
index b5a571a..3c041a1 100644
--- a/model-card.md
+++ b/model-card.md
@@ -17,12 +17,12 @@ The Whisper models are trained for speech recognition and translation tasks, cap
 | medium | 769 M  | ✓ | ✓ |
 | large  | 1550 M |   | ✓ |
 
-In December 2022, we [released an improved large model named `large-v2`](https://github.com/openai/whisper/discussions/661).
+In December 2022, we [released an improved large model named `large-v2`](https://github.com/openai/whisper/discussions/661), and `large-v3` in November 2023.
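For orientation on this patch as a whole: once the `_MODELS` and `_ALIGNMENT_HEADS` entries below are in place, the new checkpoint is selected by name like any other model. A minimal usage sketch — the audio filename is a placeholder, and the printed values follow from the `num_languages` property the patch adds in `whisper/model.py`:

import whisper

model = whisper.load_model("large-v3")
print(model.is_multilingual)  # True: large-v3 has n_vocab 51866, which is >= 51865
print(model.num_languages)    # 100 = 51866 - 51765 - 1 (large-v2 gives 99)
result = model.transcribe("audio.mp3")  # "audio.mp3" is a hypothetical input file
print(result["text"])
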
### Release date -September 2022 (original series) and December 2022 (`large-v2`) +September 2022 (original series), December 2022 (`large-v2`), and November 2023 (`large-v3`) ### Model type diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py index e4f8fd0..599221a 100644 --- a/tests/test_transcribe.py +++ b/tests/test_transcribe.py @@ -25,7 +25,7 @@ def test_transcribe(model_name: str): assert "your country" in transcription assert "do for you" in transcription - tokenizer = get_tokenizer(model.is_multilingual) + tokenizer = get_tokenizer(model.is_multilingual, num_languages=model.num_languages) all_tokens = [t for s in result["segments"] for t in s["tokens"]] assert tokenizer.decode(all_tokens) == result["text"] assert tokenizer.decode_with_timestamps(all_tokens).startswith("<|0.00|>") diff --git a/whisper/__init__.py b/whisper/__init__.py index 379133b..d7fbba3 100644 --- a/whisper/__init__.py +++ b/whisper/__init__.py @@ -25,7 +25,8 @@ _MODELS = { "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt", "large-v1": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt", "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt", - "large": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt", + "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt", + "large": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt", } # base85-encoded (n_layers, n_heads) boolean arrays indicating the cross-attention heads that are @@ -41,7 +42,8 @@ _ALIGNMENT_HEADS = { "medium": b"ABzY8B0Jh+0{>%R7}kK1fFL7w6%<-Pf*t^=N)Qr&0RR9", "large-v1": b"ABzY8r9j$a0{>%R7#4sLmoOs{s)o3~84-RPdcFk!JR%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj", - "large": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj", + "large-v3": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00", + "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00", } diff --git a/whisper/assets/mel_filters.npz b/whisper/assets/mel_filters.npz index 1a7839244dfb6b1cc02e4f3cfe12e4817a073bc7..28ea26909dbdfd608aef67afc4d74d7961ae4bb6 100644 GIT binary patch literal 4271 zcmZ`-cQjmYw;lx1g6JcN7QKe3LG%_Oh!VX=^k~teM-XGQ(Mu4$_Y%?jkm$lFBkB+( z3yfKIgF zxGiAhze`A@t->QRNVV!%P+W=o}VHkB) z%g>qyRHfN1IQ4-=`Y@0T9qE#o+;4E3VQ!epW1Xt=ZG`I3U|62t?<>5h*W|9VvJc`KZ+)ghnA**Z~ET21Tjf_f8oe`vy zZQNtlOx?dDhS71hnOus5cqj)hfyF@H&4y?@9z{I#&cf>A+s2~~(I>TQF}SaR3_tqa z(7&ZdN^vR*t<~?{9DEoI>0PL@Sl?wa?Z{rGX`*eEx9Nh=z*J3HZL1*Py4z$TD#+;m zSSW(kcOTe(4hqgib_W6&xx+j~-u(p)Nn6?>a%wHk=h7Ay$%lcGoo;gAY zmVV7|!Nb;w(PlH@c24{ple2Y3<*9J@jE=sfLzwu_BiAFPE$0Axp`^Nq!H}eG0?r-X zFj@Pwp^al*p>K{@_Cz`q#(N0Y=OpZy^ z{P$KjLJuk_Y%I)$mh`b{uOW5C5Xcmxk!gt_Zg zw>}6fkD4zRK9!#ems~H%U$>V;_wK38Zf-baU$S!#i;7!HWsi}GuC>%@?lMdgkUGC& zh9gC?O-5BlS2#}?7x0?eP#bOL(cqE{M%LJD$CZnplD)CgQR#KCttD=dZK+Ck5R52; z*%5hZ+SXU7)8k%Y^_1U>yI*By(INn&+ir-_4$#dUwTlMNyR@iGQIaZ+eiYqucu)CB z#i{Ru1w+aU#}DHSyzjG_9c?ToB_YjU#f;N=qel98WBIjIc1!#ePwRR+(go&-by#}@ z+M+klVke5b@lWfZ+O&|c??YvRe)&W)qAgtc>t-IZtbRTG#X}49_Q$>P%-)=0W_QY-x%DPep2Vm9#ci 
[Base85 payload of the GIT binary patch ("literal 4271" / "delta 2007") omitted: unreadable binary data for the new mel_filters.npz, further garbled in extraction. The diff header of the next file was destroyed along with it and is restored here.]
diff --git a/whisper/audio.py b/whisper/audio.py
--- a/whisper/audio.py
+++ b/whisper/audio.py
-def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor:
+def
mel_filters(device, n_mels: int) -> torch.Tensor: """ load the mel filterbank matrix for projecting STFT into a Mel spectrogram. Allows decoupling librosa dependency; saved using: @@ -98,9 +97,10 @@ def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor: np.savez_compressed( "mel_filters.npz", mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80), + mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128), ) """ - assert n_mels == 80, f"Unsupported n_mels: {n_mels}" + assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}" filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz") with np.load(filters_path, allow_pickle=False) as f: @@ -109,7 +109,7 @@ def mel_filters(device, n_mels: int = N_MELS) -> torch.Tensor: def log_mel_spectrogram( audio: Union[str, np.ndarray, torch.Tensor], - n_mels: int = N_MELS, + n_mels: int = 80, padding: int = 0, device: Optional[Union[str, torch.device]] = None, ): diff --git a/whisper/decoding.py b/whisper/decoding.py index ecd98a4..49485d0 100644 --- a/whisper/decoding.py +++ b/whisper/decoding.py @@ -32,7 +32,9 @@ def detect_language( list of dictionaries containing the probability distribution over all languages. """ if tokenizer is None: - tokenizer = get_tokenizer(model.is_multilingual) + tokenizer = get_tokenizer( + model.is_multilingual, num_languages=model.num_languages + ) if ( tokenizer.language is None or tokenizer.language_token not in tokenizer.sot_sequence @@ -514,7 +516,10 @@ class DecodingTask: language = options.language or "en" tokenizer = get_tokenizer( - model.is_multilingual, language=language, task=options.task + model.is_multilingual, + num_languages=model.num_languages, + language=language, + task=options.task, ) self.tokenizer: Tokenizer = tokenizer self.options: DecodingOptions = self._verify_options(options) diff --git a/whisper/model.py b/whisper/model.py index 6913002..a678283 100644 --- a/whisper/model.py +++ b/whisper/model.py @@ -236,7 +236,8 @@ class Whisper(nn.Module): self.dims.n_text_head, self.dims.n_text_layer, ) - # use the last half layers for alignment by default; see `set_alignment_heads()` below + # use the last half among the decoder layers for time alignment by default; + # to use a specific set of heads, see `set_alignment_heads()` below. 
all_heads = torch.zeros( self.dims.n_text_layer, self.dims.n_text_head, dtype=torch.bool ) @@ -269,7 +270,11 @@ class Whisper(nn.Module): @property def is_multilingual(self): - return self.dims.n_vocab == 51865 + return self.dims.n_vocab >= 51865 + + @property + def num_languages(self): + return self.dims.n_vocab - 51765 - int(self.is_multilingual) def install_kv_cache_hooks(self, cache: Optional[dict] = None): """ diff --git a/whisper/tokenizer.py b/whisper/tokenizer.py index 3b23991..2af8375 100644 --- a/whisper/tokenizer.py +++ b/whisper/tokenizer.py @@ -107,6 +107,7 @@ LANGUAGES = { "ba": "bashkir", "jw": "javanese", "su": "sundanese", + "yue": "cantonese", } # language code lookup by name, with a few language aliases @@ -123,6 +124,7 @@ TO_LANGUAGE_CODE = { "moldovan": "ro", "sinhalese": "si", "castilian": "es", + "mandarin": "zh", } @@ -131,6 +133,7 @@ class Tokenizer: """A thin wrapper around `tiktoken` providing quick access to special tokens""" encoding: tiktoken.Encoding + num_languages: int language: Optional[str] = None task: Optional[str] = None sot_sequence: Tuple[int] = () @@ -145,7 +148,7 @@ class Tokenizer: translate: int = self.special_tokens["<|translate|>"] transcribe: int = self.special_tokens["<|transcribe|>"] - langs = tuple(LANGUAGES.keys()) + langs = tuple(LANGUAGES.keys())[: self.num_languages] sot_sequence = [sot] if self.language is not None: sot_sequence.append(sot + 1 + langs.index(self.language)) @@ -211,10 +214,13 @@ class Tokenizer: if self.language is None: raise ValueError("This tokenizer does not have language token configured") - if token := self.special_tokens.get(f"<|{self.language}|>", None): + return self.to_language_token(self.language) + + def to_language_token(self, language): + if token := self.special_tokens.get(f"<|{language}|>", None): return token - raise KeyError(f"Language {self.language} not found in tokenizer.") + raise KeyError(f"Language {language} not found in tokenizer.") @cached_property def all_language_tokens(self) -> Tuple[int]: @@ -222,7 +228,7 @@ class Tokenizer: for token, token_id in self.special_tokens.items(): if token.strip("<|>") in LANGUAGES: result.append(token_id) - return tuple(result) + return tuple(result)[: self.num_languages] @cached_property def all_language_codes(self) -> Tuple[str]: @@ -269,7 +275,7 @@ class Tokenizer: return tuple(sorted(result)) def split_to_word_tokens(self, tokens: List[int]): - if self.language in {"zh", "ja", "th", "lo", "my"}: + if self.language in {"zh", "ja", "th", "lo", "my", "yue"}: # These languages don't typically use spaces, so it is difficult to split words # without morpheme analysis. 
Here, we instead split words at any # position where the tokens are decoded as valid unicode points @@ -322,7 +328,7 @@ class Tokenizer: @lru_cache(maxsize=None) -def get_encoding(name: str = "gpt2"): +def get_encoding(name: str = "gpt2", num_languages: int = 99): vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken") ranks = { base64.b64decode(token): int(rank) @@ -334,7 +340,7 @@ def get_encoding(name: str = "gpt2"): specials = [ "<|endoftext|>", "<|startoftranscript|>", - *[f"<|{lang}|>" for lang in LANGUAGES.keys()], + *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]], "<|translate|>", "<|transcribe|>", "<|startoflm|>", @@ -361,6 +367,7 @@ def get_encoding(name: str = "gpt2"): def get_tokenizer( multilingual: bool, *, + num_languages: int = 99, language: Optional[str] = None, task: Optional[str] = None, # Literal["transcribe", "translate", None] ) -> Tokenizer: @@ -381,6 +388,8 @@ def get_tokenizer( language = None task = None - encoding = get_encoding(name=encoding_name) + encoding = get_encoding(name=encoding_name, num_languages=num_languages) - return Tokenizer(encoding=encoding, language=language, task=task) + return Tokenizer( + encoding=encoding, num_languages=num_languages, language=language, task=task + ) diff --git a/whisper/transcribe.py b/whisper/transcribe.py index d5b3d43..e80bede 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -119,7 +119,7 @@ def transcribe( decode_options["fp16"] = False # Pad 30-seconds of silence to the input audio, for slicing - mel = log_mel_spectrogram(audio, padding=N_SAMPLES) + mel = log_mel_spectrogram(audio, model.dims.n_mels, padding=N_SAMPLES) content_frames = mel.shape[-1] - N_FRAMES if decode_options.get("language", None) is None: @@ -140,7 +140,12 @@ def transcribe( language: str = decode_options["language"] task: str = decode_options.get("task", "transcribe") - tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task) + tokenizer = get_tokenizer( + model.is_multilingual, + num_languages=model.num_languages, + language=language, + task=task, + ) if word_timestamps and task == "translate": warnings.warn("Word-level timestamps on translations may not be reliable.") From fcfeaf1b61994c071bba62da47d7846933576ac9 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 6 Nov 2023 10:14:04 -0800 Subject: [PATCH 33/45] Release 20231106 --- CHANGELOG.md | 4 ++++ whisper/version.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea73cf1..cb0908a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # CHANGELOG +## [v20231106](https://github.com/openai/whisper/releases/tag/v20231106) + +* large-v3 ([#1761](https://github.com/openai/whisper/pull/1761)) + ## [v20231105](https://github.com/openai/whisper/releases/tag/v20231105) * remove tiktoken pin ([#1759](https://github.com/openai/whisper/pull/1759)) diff --git a/whisper/version.py b/whisper/version.py index cb85315..4aaccff 100644 --- a/whisper/version.py +++ b/whisper/version.py @@ -1 +1 @@ -__version__ = "20231105" +__version__ = "20231106" From 1cea4357687b676b293cb5473e1ade25f5b1cef7 Mon Sep 17 00:00:00 2001 From: Eugene Indenbom <45334274+eindenbom@users.noreply.github.com> Date: Mon, 13 Nov 2023 19:43:42 +0200 Subject: [PATCH 34/45] Relax triton requirements for compatibility with pytorch 2.1 and newer (#1802) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1161d81..ae8589e 100644 --- 
a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def read_version(fname="whisper/version.py"): requirements = [] if sys.platform.startswith("linux") and platform.machine() == "x86_64": - requirements.append("triton==2.0.0") + requirements.append("triton>=2.0.0,<3") setup( name="openai-whisper", From e58f28804528831904c3b6f2c0e473f346223433 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Fri, 17 Nov 2023 11:59:28 -0800 Subject: [PATCH 35/45] Release 20231117 --- CHANGELOG.md | 4 ++++ whisper/version.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb0908a..5895541 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # CHANGELOG +## [v20231117](https://github.com/openai/whisper/releases/tag/v20231117) + +* Relax triton requirements for compatibility with pytorch 2.1 and newer ([#1802](https://github.com/openai/whisper/pull/1802)) + ## [v20231106](https://github.com/openai/whisper/releases/tag/v20231106) * large-v3 ([#1761](https://github.com/openai/whisper/pull/1761)) diff --git a/whisper/version.py b/whisper/version.py index 4aaccff..c96dd9c 100644 --- a/whisper/version.py +++ b/whisper/version.py @@ -1 +1 @@ -__version__ = "20231106" +__version__ = "20231117" From 8bc8860694949db53c42ba47ddc23786c2e02a8b Mon Sep 17 00:00:00 2001 From: Bob Lin Date: Mon, 11 Dec 2023 23:39:08 +0800 Subject: [PATCH 36/45] Fix triton env marker (#1887) --- requirements.txt | 1 + setup.py | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index a03dae8..62f5f9d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ torch tqdm more-itertools tiktoken +triton>=2.0.0,<3;platform_machine=="x86_64" and sys_platform=="linux" or sys_platform=="linux2" diff --git a/setup.py b/setup.py index ae8589e..183b527 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ -import os import platform import sys +from pathlib import Path import pkg_resources from setuptools import find_packages, setup @@ -28,11 +28,10 @@ setup( url="https://github.com/openai/whisper", license="MIT", packages=find_packages(exclude=["tests*"]), - install_requires=requirements - + [ + install_requires=[ str(r) for r in pkg_resources.parse_requirements( - open(os.path.join(os.path.dirname(__file__), "requirements.txt")) + Path(__file__).with_name("requirements.txt").open() ) ], entry_points={ From ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab Mon Sep 17 00:00:00 2001 From: ryanheise Date: Tue, 19 Dec 2023 07:11:16 +1100 Subject: [PATCH 37/45] Skip silence around hallucinations (#1838) * Add clip_timestamps option * Add hallucination_silence_threshold option * Fix typing for python < 3.9 --------- Co-authored-by: Jong Wook Kim --- whisper/timing.py | 1 + whisper/transcribe.py | 151 +++++++++++++++++++++++++++++++++++++----- whisper/utils.py | 20 +++++- 3 files changed, 153 insertions(+), 19 deletions(-) diff --git a/whisper/timing.py b/whisper/timing.py index befcf46..b695ead 100644 --- a/whisper/timing.py +++ b/whisper/timing.py @@ -299,6 +299,7 @@ def add_word_timestamps( word_durations = np.array([t.end - t.start for t in alignment]) word_durations = word_durations[word_durations.nonzero()] median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0 + median_duration = min(0.7, float(median_duration)) max_duration = median_duration * 2 # hack: truncate long words at sentence boundaries. 
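A note on the `whisper/timing.py` hunk above: clamping the median word duration to at most 0.7 seconds bounds `max_duration` — the cutoff beyond which words get squashed — at 1.4 seconds, even when a degenerate alignment inflates the median. A minimal sketch of the heuristic with made-up durations:

import numpy as np

word_durations = np.array([0.31, 0.28, 4.90, 0.35, 0.30])  # hypothetical values, in seconds
word_durations = word_durations[word_durations.nonzero()]
median_duration = np.median(word_durations) if len(word_durations) > 0 else 0.0
median_duration = min(0.7, float(median_duration))  # the clamp this patch adds
max_duration = median_duration * 2
print(max_duration)  # 0.62 for these inputs; never above 1.4
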
diff --git a/whisper/transcribe.py b/whisper/transcribe.py index e80bede..1c075a2 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -2,7 +2,7 @@ import argparse import os import traceback import warnings -from typing import TYPE_CHECKING, Optional, Tuple, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union import numpy as np import torch @@ -23,6 +23,7 @@ from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer from .utils import ( exact_div, format_timestamp, + get_end, get_writer, make_safe, optional_float, @@ -48,6 +49,8 @@ def transcribe( word_timestamps: bool = False, prepend_punctuations: str = "\"'“¿([{-", append_punctuations: str = "\"'.。,,!!??::”)]}、", + clip_timestamps: Union[str, List[float]] = "0", + hallucination_silence_threshold: Optional[float] = None, **decode_options, ): """ @@ -102,6 +105,14 @@ def transcribe( decode_options: dict Keyword arguments to construct `DecodingOptions` instances + clip_timestamps: Union[str, List[float]] + Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process. + The last end timestamp defaults to the end of the file. + + hallucination_silence_threshold: Optional[float] + When word_timestamps is True, skip silent periods longer than this threshold (in seconds) + when a possible hallucination is detected + Returns ------- A dictionary containing the resulting text ("text") and segment-level details ("segments"), and @@ -121,6 +132,7 @@ def transcribe( # Pad 30-seconds of silence to the input audio, for slicing mel = log_mel_spectrogram(audio, model.dims.n_mels, padding=N_SAMPLES) content_frames = mel.shape[-1] - N_FRAMES + content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE) if decode_options.get("language", None) is None: if not model.is_multilingual: @@ -147,6 +159,19 @@ def transcribe( task=task, ) + if isinstance(clip_timestamps, str): + clip_timestamps = [ + float(ts) for ts in (clip_timestamps.split(",") if clip_timestamps else []) + ] + seek_points: List[int] = [round(ts * FRAMES_PER_SECOND) for ts in clip_timestamps] + if len(seek_points) == 0: + seek_points.append(0) + if len(seek_points) % 2 == 1: + seek_points.append(content_frames) + seek_clips: List[Tuple[int, int]] = list(zip(seek_points[::2], seek_points[1::2])) + + punctuation = "\"'“¿([{-\"'.。,,!!??::”)]}、" + if word_timestamps and task == "translate": warnings.warn("Word-level timestamps on translations may not be reliable.") @@ -190,7 +215,8 @@ def transcribe( return decode_result - seek = 0 + clip_idx = 0 + seek = seek_clips[clip_idx][0] input_stride = exact_div( N_FRAMES, model.dims.n_audio_ctx ) # mel frames per output token: 2 @@ -229,10 +255,23 @@ def transcribe( total=content_frames, unit="frames", disable=verbose is not False ) as pbar: last_speech_timestamp = 0.0 - while seek < content_frames: + # NOTE: This loop is obscurely flattened to make the diff readable. + # A later commit should turn this into a simpler nested loop. 
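Before the flattened loop continues below, a worked example of the `clip_timestamps` parsing earlier in this hunk, which expands the option into `(start, end)` frame pairs. The values are illustrative; `FRAMES_PER_SECOND` is 100 in Whisper (16000 Hz sample rate over a 160-sample hop):

clip_timestamps = "10,90,300"  # hypothetical option value, in seconds
content_frames = 60000         # hypothetical 10-minute input at 100 frames/second

seek_points = [round(float(ts) * 100) for ts in clip_timestamps.split(",")]
if len(seek_points) % 2 == 1:
    seek_points.append(content_frames)  # the last end defaults to the end of the file
seek_clips = list(zip(seek_points[::2], seek_points[1::2]))
print(seek_clips)  # [(1000, 9000), (30000, 60000)]
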
+ # for seek_clip_start, seek_clip_end in seek_clips: + # while seek < seek_clip_end + while clip_idx < len(seek_clips): + seek_clip_start, seek_clip_end = seek_clips[clip_idx] + if seek < seek_clip_start: + seek = seek_clip_start + if seek >= seek_clip_end: + clip_idx += 1 + if clip_idx < len(seek_clips): + seek = seek_clips[clip_idx][0] + continue time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE) - mel_segment = mel[:, seek : seek + N_FRAMES] - segment_size = min(N_FRAMES, content_frames - seek) + window_end_time = float((seek + N_FRAMES) * HOP_LENGTH / SAMPLE_RATE) + segment_size = min(N_FRAMES, content_frames - seek, seek_clip_end - seek) + mel_segment = mel[:, seek : seek + segment_size] segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype) @@ -257,6 +296,30 @@ def transcribe( previous_seek = seek current_segments = [] + # anomalous words are very long/short/improbable + def word_anomaly_score(word: dict) -> float: + probability = word.get("probability", 0.0) + duration = word["end"] - word["start"] + score = 0.0 + if probability < 0.15: + score += 1.0 + if duration < 0.133: + score += (0.133 - duration) * 15 + if duration > 2.0: + score += duration - 2.0 + return score + + def is_segment_anomaly(segment: Optional[dict]) -> bool: + if segment is None or not segment["words"]: + return False + words = [w for w in segment["words"] if w["word"] not in punctuation] + words = words[:8] + score = sum(word_anomaly_score(w) for w in words) + return score >= 3 or score + 0.01 >= len(words) + + def next_words_segment(segments: List[dict]) -> Optional[dict]: + return next((s for s in segments if s["words"]), None) + timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin) single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True] @@ -330,17 +393,71 @@ def transcribe( append_punctuations=append_punctuations, last_speech_timestamp=last_speech_timestamp, ) - word_end_timestamps = [ - w["end"] for s in current_segments for w in s["words"] - ] - if len(word_end_timestamps) > 0: - last_speech_timestamp = word_end_timestamps[-1] - if not single_timestamp_ending and len(word_end_timestamps) > 0: - seek_shift = round( - (word_end_timestamps[-1] - time_offset) * FRAMES_PER_SECOND - ) - if seek_shift > 0: - seek = previous_seek + seek_shift + + if not single_timestamp_ending: + last_word_end = get_end(current_segments) + if last_word_end is not None and last_word_end > time_offset: + seek = round(last_word_end * FRAMES_PER_SECOND) + + # skip silence before possible hallucinations + if hallucination_silence_threshold is not None: + threshold = hallucination_silence_threshold + if not single_timestamp_ending: + last_word_end = get_end(current_segments) + if last_word_end is not None and last_word_end > time_offset: + remaining_duration = window_end_time - last_word_end + if remaining_duration > threshold: + seek = round(last_word_end * FRAMES_PER_SECOND) + else: + seek = previous_seek + segment_size + + # if first segment might be a hallucination, skip leading silence + first_segment = next_words_segment(current_segments) + if first_segment is not None and is_segment_anomaly(first_segment): + gap = first_segment["start"] - time_offset + if gap > threshold: + seek = previous_seek + round(gap * FRAMES_PER_SECOND) + continue + + # skip silence before any possible hallucination that is surrounded + # by silence or more hallucinations + hal_last_end = last_speech_timestamp + for si in 
range(len(current_segments)): + segment = current_segments[si] + if not segment["words"]: + continue + if is_segment_anomaly(segment): + next_segment = next_words_segment( + current_segments[si + 1 :] + ) + if next_segment is not None: + hal_next_start = next_segment["words"][0]["start"] + else: + hal_next_start = time_offset + segment_duration + silence_before = ( + segment["start"] - hal_last_end > threshold + or segment["start"] < threshold + or segment["start"] - time_offset < 2.0 + ) + silence_after = ( + hal_next_start - segment["end"] > threshold + or is_segment_anomaly(next_segment) + or window_end_time - segment["end"] < 2.0 + ) + if silence_before and silence_after: + seek = round( + max(time_offset + 1, segment["start"]) + * FRAMES_PER_SECOND + ) + if content_duration - segment["end"] < threshold: + seek = content_frames + current_segments[si:] = [] + break + hal_last_end = segment["end"] + + last_word_end = get_end(current_segments) + if last_word_end is not None: + last_speech_timestamp = last_word_end if verbose: for segment in current_segments: @@ -427,6 +544,8 @@ def cli(): parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment") parser.add_argument("--max_words_per_line", type=optional_int, default=None, help="(requires --word_timestamps True, no effect with --max_line_width) the maximum number of words in a segment") parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS") + parser.add_argument("--clip_timestamps", type=str, default="0", help="comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process, where the last end timestamp defaults to the end of the file") + parser.add_argument("--hallucination_silence_threshold", type=optional_float, help="(requires --word_timestamps True) skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected") # fmt: on args = parser.parse_args().__dict__ diff --git a/whisper/utils.py b/whisper/utils.py index 7a172c4..9b9b138 100644 --- a/whisper/utils.py +++ b/whisper/utils.py @@ -3,7 +3,7 @@ import os import re import sys import zlib -from typing import Callable, Optional, TextIO +from typing import Callable, List, Optional, TextIO system_encoding = sys.getdefaultencoding() @@ -68,6 +68,20 @@ def format_timestamp( ) +def get_start(segments: List[dict]) -> Optional[float]: + return next( + (w["start"] for s in segments for w in s["words"]), + segments[0]["start"] if segments else None, + ) + + +def get_end(segments: List[dict]) -> Optional[float]: + return next( + (w["end"] for s in reversed(segments) for w in reversed(s["words"])), + segments[-1]["end"] if segments else None, + ) + + class ResultWriter: extension: str @@ -129,8 +143,8 @@ class SubtitlesWriter(ResultWriter): line_len = 0 line_count = 1 # the next subtitle to yield (a list of word timings with whitespace) - subtitle: list[dict] = [] - last = result["segments"][0]["words"][0]["start"] + subtitle: List[dict] = [] + last: float = get_start(result["segments"]) or 0.0 for segment in result["segments"]: chunk_index = 0 words_count = max_words_per_line From 32d55d5d76c9ecbe2dfa3e6735896c648156ab63 Mon Sep 17 00:00:00 2001 From: Jianan Xing <1633398+xingjianan@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:53:08 -0700 Subject: [PATCH 38/45] Relax triton requirements for compatibility with pytorch 2.4 
and newer (#2307)

* Relax triton requirements for compatibility with pytorch 2.4 and newer

Similar to https://github.com/openai/whisper/pull/1802, but now when pytorch upgrades to 2.4, it requires triton==3.0.0. I am not sure if it makes sense to remove the upper-bound version constraints.

* Update requirements.txt
---
 requirements.txt | 2 +-
 setup.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 62f5f9d..8ee5920 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,4 @@ torch
 tqdm
 more-itertools
 tiktoken
-triton>=2.0.0,<3;platform_machine=="x86_64" and sys_platform=="linux" or sys_platform=="linux2"
+triton>=2.0.0;platform_machine=="x86_64" and sys_platform=="linux" or sys_platform=="linux2"
diff --git a/setup.py b/setup.py
index 183b527..73c4eb8 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ def read_version(fname="whisper/version.py"):
 
 requirements = []
 if sys.platform.startswith("linux") and platform.machine() == "x86_64":
-    requirements.append("triton>=2.0.0,<3")
+    requirements.append("triton>=2.0.0")
 
 setup(
     name="openai-whisper",

From 279133e3107392276dc509148da1f41bfb532c7e Mon Sep 17 00:00:00 2001
From: Jong Wook Kim
Date: Tue, 10 Sep 2024 10:43:21 -0700
Subject: [PATCH 39/45] pinning numpy<2 in tests (#2332)

* pinning numpy<2 in tests

* pip install together

* pip install together
---
 .github/workflows/test.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index dffc17c..1eaf505 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -49,8 +49,7 @@ jobs:
     steps:
       - uses: conda-incubator/setup-miniconda@v2
       - run: conda install -n test ffmpeg python=${{ matrix.python-version }}
-      - run: pip3 install torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu
       - uses: actions/checkout@v3
       - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH
-      - run: pip install .["dev"]
+      - run: pip3 install .["dev"] 'numpy<2' torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple
       - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda'

From 423492dda7806206abe56bdfe427c1096473a020 Mon Sep 17 00:00:00 2001
From: Jong Wook Kim
Date: Fri, 27 Sep 2024 16:43:58 -0700
Subject: [PATCH 40/45] Release 20240927

---
 CHANGELOG.md | 7 +++++++
 whisper/version.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5895541..3f09538 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # CHANGELOG
 
+## [v20240927](https://github.com/openai/whisper/releases/tag/v20240927)
+
+* pinning numpy<2 in tests ([#2332](https://github.com/openai/whisper/pull/2332))
+* Relax triton requirements for compatibility with pytorch 2.4 and newer ([#2307](https://github.com/openai/whisper/pull/2307))
+* Skip silence around hallucinations ([#1838](https://github.com/openai/whisper/pull/1838))
+* Fix triton env marker ([#1887](https://github.com/openai/whisper/pull/1887))
+
 ## [v20231117](https://github.com/openai/whisper/releases/tag/v20231117)
 
 * Relax triton requirements for compatibility with pytorch 2.1 and newer ([#1802](https://github.com/openai/whisper/pull/1802))

diff --git a/whisper/version.py b/whisper/version.py
index c96dd9c..2242d25 100644
--- a/whisper/version.py
+++ b/whisper/version.py
@@ -1 +1 @@
-__version__ = 
"20231117" +__version__ = "20240927" From 27f971320a50e65fd510b88be04219a6ade31f9b Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 30 Sep 2024 10:27:14 -0700 Subject: [PATCH 41/45] using sdpa if available (#2359) * using sdpa if available * Update model.py --- whisper/model.py | 51 +++++++++++++++++++++++++++++++++++++---------- whisper/timing.py | 4 +++- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/whisper/model.py b/whisper/model.py index a678283..e537447 100644 --- a/whisper/model.py +++ b/whisper/model.py @@ -1,7 +1,8 @@ import base64 import gzip +from contextlib import contextmanager from dataclasses import dataclass -from typing import Dict, Iterable, Optional +from typing import Dict, Iterable, Optional, Tuple import numpy as np import torch @@ -12,6 +13,14 @@ from .decoding import decode as decode_function from .decoding import detect_language as detect_language_function from .transcribe import transcribe as transcribe_function +try: + from torch.nn.functional import scaled_dot_product_attention + + SDPA_AVAILABLE = True +except (ImportError, RuntimeError, OSError): + scaled_dot_product_attention = None + SDPA_AVAILABLE = False + @dataclass class ModelDimensions: @@ -59,7 +68,19 @@ def sinusoids(length, channels, max_timescale=10000): return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1) +@contextmanager +def disable_sdpa(): + prev_state = MultiHeadAttention.use_sdpa + try: + MultiHeadAttention.use_sdpa = False + yield + finally: + MultiHeadAttention.use_sdpa = prev_state + + class MultiHeadAttention(nn.Module): + use_sdpa = True + def __init__(self, n_state: int, n_head: int): super().__init__() self.n_head = n_head @@ -92,20 +113,30 @@ class MultiHeadAttention(nn.Module): def qkv_attention( self, q: Tensor, k: Tensor, v: Tensor, mask: Optional[Tensor] = None - ): + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: n_batch, n_ctx, n_state = q.shape scale = (n_state // self.n_head) ** -0.25 - q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) * scale - k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 3, 1) * scale + q = q.view(*q.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) + k = k.view(*k.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) v = v.view(*v.shape[:2], self.n_head, -1).permute(0, 2, 1, 3) - qk = q @ k - if mask is not None: - qk = qk + mask[:n_ctx, :n_ctx] - qk = qk.float() + if SDPA_AVAILABLE and MultiHeadAttention.use_sdpa: + a = scaled_dot_product_attention( + q, k, v, is_causal=mask is not None and n_ctx > 1 + ) + out = a.permute(0, 2, 1, 3).flatten(start_dim=2) + qk = None + else: + qk = (q * scale) @ (k * scale).transpose(-1, -2) + if mask is not None: + qk = qk + mask[:n_ctx, :n_ctx] + qk = qk.float() - w = F.softmax(qk, dim=-1).to(q.dtype) - return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach() + w = F.softmax(qk, dim=-1).to(q.dtype) + out = (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2) + qk = qk.detach() + + return out, qk class ResidualAttentionBlock(nn.Module): diff --git a/whisper/timing.py b/whisper/timing.py index b695ead..e563414 100644 --- a/whisper/timing.py +++ b/whisper/timing.py @@ -191,7 +191,9 @@ def find_alignment( for i, block in enumerate(model.decoder.blocks) ] - with torch.no_grad(): + from .model import disable_sdpa + + with torch.no_grad(), disable_sdpa(): logits = model(mel.unsqueeze(0), tokens.unsqueeze(0))[0] sampled_logits = logits[len(tokenizer.sot_sequence) :, : tokenizer.eot] token_probs = sampled_logits.softmax(dim=-1) From 
b66b46f32dd3934edd3e79b2821357f52d388501 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 30 Sep 2024 10:33:56 -0700 Subject: [PATCH 42/45] test on python/pytorch versions up to 3.12 and 2.4.1 (#2360) --- .github/workflows/test.yml | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1eaf505..a1cc48d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,11 +41,19 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] - pytorch-version: [1.13.1, 2.0.0] - exclude: - - python-version: '3.11' + include: + - python-version: '3.8' pytorch-version: 1.13.1 + - python-version: '3.8' + pytorch-version: 2.0.1 + - python-version: '3.9' + pytorch-version: 2.1.2 + - python-version: '3.10' + pytorch-version: 2.2.2 + - python-version: '3.11' + pytorch-version: 2.3.1 + - python-version: '3.12' + pytorch-version: 2.4.1 steps: - uses: conda-incubator/setup-miniconda@v2 - run: conda install -n test ffmpeg python=${{ matrix.python-version }} From 25e5c364e0a21ddefee46adb674c591f1ba610ba Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 30 Sep 2024 10:59:51 -0700 Subject: [PATCH 43/45] large-v3-turbo model (#2361) --- README.md | 20 ++++++++++++-------- model-card.md | 4 +++- whisper/__init__.py | 4 ++++ whisper/transcribe.py | 2 +- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index afca9c9..910b7db 100644 --- a/README.md +++ b/README.md @@ -57,17 +57,21 @@ pip install setuptools-rust ## Available models and languages -There are five model sizes, four with English-only versions, offering speed and accuracy tradeoffs. Below are the names of the available models and their approximate memory requirements and inference speed relative to the large model; actual speed may vary depending on many factors including the available hardware. +There are six model sizes, four with English-only versions, offering speed and accuracy tradeoffs. +Below are the names of the available models and their approximate memory requirements and inference speed relative to the large model. +The relative speeds below are measured by transcribing English speech on a A100, and the real-world speed may vary significantly depending on many factors including the language, the speaking speed, and the available hardware. | Size | Parameters | English-only model | Multilingual model | Required VRAM | Relative speed | |:------:|:----------:|:------------------:|:------------------:|:-------------:|:--------------:| -| tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~32x | -| base | 74 M | `base.en` | `base` | ~1 GB | ~16x | -| small | 244 M | `small.en` | `small` | ~2 GB | ~6x | +| tiny | 39 M | `tiny.en` | `tiny` | ~1 GB | ~10x | +| base | 74 M | `base.en` | `base` | ~1 GB | ~7x | +| small | 244 M | `small.en` | `small` | ~2 GB | ~4x | | medium | 769 M | `medium.en` | `medium` | ~5 GB | ~2x | | large | 1550 M | N/A | `large` | ~10 GB | 1x | +| turbo | 809 M | N/A | `turbo` | ~6 GB | ~8x | The `.en` models for English-only applications tend to perform better, especially for the `tiny.en` and `base.en` models. We observed that the difference becomes less significant for the `small.en` and `medium.en` models. +Additionally, the `turbo` model is an optimized version of `large-v3` that offers faster transcription speed with a minimal degradation in accuracy. Whisper's performance varies widely depending on the language. 
The figure below shows a performance breakdown of `large-v3` and `large-v2` models by language, using WERs (word error rates) or CER (character error rates, shown in *Italic*) evaluated on the Common Voice 15 and Fleurs datasets. Additional WER/CER metrics corresponding to the other models and datasets can be found in Appendix D.1, D.2, and D.4 of [the paper](https://arxiv.org/abs/2212.04356), as well as the BLEU (Bilingual Evaluation Understudy) scores for translation in Appendix D.3. @@ -77,9 +81,9 @@ Whisper's performance varies widely depending on the language. The figure below ## Command-line usage -The following command will transcribe speech in audio files, using the `medium` model: +The following command will transcribe speech in audio files, using the `turbo` model: - whisper audio.flac audio.mp3 audio.wav --model medium + whisper audio.flac audio.mp3 audio.wav --model turbo The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option: @@ -103,7 +107,7 @@ Transcription can also be performed within Python: ```python import whisper -model = whisper.load_model("base") +model = whisper.load_model("turbo") result = model.transcribe("audio.mp3") print(result["text"]) ``` @@ -115,7 +119,7 @@ Below is an example usage of `whisper.detect_language()` and `whisper.decode()` ```python import whisper -model = whisper.load_model("base") +model = whisper.load_model("turbo") # load audio and pad/trim it to fit 30 seconds audio = whisper.load_audio("audio.mp3") diff --git a/model-card.md b/model-card.md index 3c041a1..291bc4b 100644 --- a/model-card.md +++ b/model-card.md @@ -16,13 +16,15 @@ The Whisper models are trained for speech recognition and translation tasks, cap | small | 244 M | ✓ | ✓ | | medium | 769 M | ✓ | ✓ | | large | 1550 M | | ✓ | +| turbo | 798 M | | ✓ | In December 2022, we [released an improved large model named `large-v2`](https://github.com/openai/whisper/discussions/661), and `large-v3` in November 2023. +Additionally, we've added a `turbo` model in September 2024 which is optimized for inference speed. 
### Release date -September 2022 (original series), December 2022 (`large-v2`), and November 2023 (`large-v3`) +September 2022 (original series), December 2022 (`large-v2`), November 2023 (`large-v3`), September 2024 (`large-v3-turbo`) ### Model type diff --git a/whisper/__init__.py b/whisper/__init__.py index d7fbba3..e210718 100644 --- a/whisper/__init__.py +++ b/whisper/__init__.py @@ -27,6 +27,8 @@ _MODELS = { "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt", "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt", "large": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt", + "large-v3-turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt", + "turbo": "https://openaipublic.azureedge.net/main/whisper/models/aff26ae408abcba5fbf8813c21e62b0941638c5f6eebfb145be0c9839262a19a/large-v3-turbo.pt", } # base85-encoded (n_layers, n_heads) boolean arrays indicating the cross-attention heads that are @@ -44,6 +46,8 @@ _ALIGNMENT_HEADS = { "large-v2": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj", "large-v3": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00", "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00", + "large-v3-turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`", + "turbo": b"ABzY8j^C+e0{>%RARaKHP%t(lGR*)0g!tONPyhe`", } diff --git a/whisper/transcribe.py b/whisper/transcribe.py index 1c075a2..8e1240b 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -511,7 +511,7 @@ def cli(): # fmt: off parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe") - parser.add_argument("--model", default="small", type=valid_model_name, help="name of the Whisper model to use") + parser.add_argument("--model", default="turbo", type=valid_model_name, help="name of the Whisper model to use") parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default") parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference") parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs") From 260bbcfcb3cd17a6952f1a51d516e4b2f0e2559a Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 30 Sep 2024 11:18:17 -0700 Subject: [PATCH 44/45] allowing numpy 2 in tests (#2362) * allowing numpy 2 in tests * allowing numpy 2 in tests --- .github/workflows/test.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a1cc48d..88131f5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -44,20 +44,26 @@ jobs: include: - python-version: '3.8' pytorch-version: 1.13.1 + numpy-requirement: "'numpy<2'" - python-version: '3.8' pytorch-version: 2.0.1 + numpy-requirement: "'numpy<2'" - python-version: '3.9' pytorch-version: 2.1.2 + numpy-requirement: "'numpy<2'" - python-version: '3.10' pytorch-version: 2.2.2 + numpy-requirement: "'numpy<2'" - python-version: '3.11' pytorch-version: 2.3.1 + numpy-requirement: 
"'numpy'" - python-version: '3.12' pytorch-version: 2.4.1 + numpy-requirement: "'numpy'" steps: - uses: conda-incubator/setup-miniconda@v2 - run: conda install -n test ffmpeg python=${{ matrix.python-version }} - uses: actions/checkout@v3 - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH - - run: pip3 install .["dev"] 'numpy<2' torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple + - run: pip3 install .["dev"] ${{ matrix.numpy-requirement }} torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda' From 25639fc17ddc013d56c594bfbf7644f2185fad84 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Mon, 30 Sep 2024 11:20:53 -0700 Subject: [PATCH 45/45] Release 20240930 --- CHANGELOG.md | 7 +++++++ whisper/version.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3f09538..7152899 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # CHANGELOG +## [v20240930](https://github.com/openai/whisper/releases/tag/v20240930) + +* allowing numpy 2 in tests ([#2362](https://github.com/openai/whisper/pull/2362)) +* large-v3-turbo model ([#2361](https://github.com/openai/whisper/pull/2361)) +* test on python/pytorch versions up to 3.12 and 2.4.1 ([#2360](https://github.com/openai/whisper/pull/2360)) +* using sdpa if available ([#2359](https://github.com/openai/whisper/pull/2359)) + ## [v20240927](https://github.com/openai/whisper/releases/tag/v20240927) * pinning numpy<2 in tests ([#2332](https://github.com/openai/whisper/pull/2332)) diff --git a/whisper/version.py b/whisper/version.py index 2242d25..b4b3350 100644 --- a/whisper/version.py +++ b/whisper/version.py @@ -1 +1 @@ -__version__ = "20240927" +__version__ = "20240930"