From cdb81479623391f0651f4f9175ad986e85777f31 Mon Sep 17 00:00:00 2001 From: Jong Wook Kim Date: Fri, 25 Oct 2024 17:30:02 -0700 Subject: [PATCH 1/4] more pytorch versions in tests (#2408) --- .github/workflows/test.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 88131f5..84b81cc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -42,6 +42,9 @@ jobs: strategy: matrix: include: + - python-version: '3.8' + pytorch-version: 1.10.1 + numpy-requirement: "'numpy<2'" - python-version: '3.8' pytorch-version: 1.13.1 numpy-requirement: "'numpy<2'" @@ -60,6 +63,9 @@ jobs: - python-version: '3.12' pytorch-version: 2.4.1 numpy-requirement: "'numpy'" + - python-version: '3.12' + pytorch-version: 2.5.0 + numpy-requirement: "'numpy'" steps: - uses: conda-incubator/setup-miniconda@v2 - run: conda install -n test ffmpeg python=${{ matrix.python-version }} From 5979f03701209bb035a0a466f14131aeb1116cbb Mon Sep 17 00:00:00 2001 From: kittsil Date: Sat, 26 Oct 2024 09:17:31 -0500 Subject: [PATCH 2/4] Add option to carry initial_prompt with the sliding window (#2343) * Add option to carry initial_prompt with the sliding window Add an option `carry_initial_prompt = False` to `whisper.transcribe()`. When set to `True`, `initial_prompt` is prepended to each internal `decode()` call's `prompt`. If there is not enough context space at the start of the prompt, the rest of the prompt (the previously decoded text) is left-sliced, dropping its oldest tokens, to make space. 
* Prevent redundant initial_prompt_tokens * Revert unnecessary .gitignore change --------- Co-authored-by: Kittsil Co-authored-by: Jong Wook Kim --- whisper/transcribe.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/whisper/transcribe.py b/whisper/transcribe.py index 8e1240b..8eb6a71 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -46,6 +46,7 @@ def transcribe( no_speech_threshold: Optional[float] = 0.6, condition_on_previous_text: bool = True, initial_prompt: Optional[str] = None, + carry_initial_prompt: bool = False, word_timestamps: bool = False, prepend_punctuations: str = "\"'“¿([{-", append_punctuations: str = "\"'.。,,!!??::”)]}、", @@ -102,6 +103,11 @@ def transcribe( "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns to make it more likely to predict those word correctly. + carry_initial_prompt: bool + If carry_initial_prompt is True, `initial_prompt` is prepended to the prompt of each internal + `decode()` call. If there is not enough context space at the start of the prompt, it is + left-sliced to make space. 
+ decode_options: dict Keyword arguments to construct `DecodingOptions` instances @@ -227,9 +233,11 @@ def transcribe( all_segments = [] prompt_reset_since = 0 + remaining_prompt_length = model.dims.n_text_ctx // 2 - 1 if initial_prompt is not None: initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip()) all_tokens.extend(initial_prompt_tokens) + remaining_prompt_length -= len(initial_prompt_tokens) else: initial_prompt_tokens = [] @@ -275,7 +283,13 @@ def transcribe( segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype) - decode_options["prompt"] = all_tokens[prompt_reset_since:] + if carry_initial_prompt: + nignored = max(len(initial_prompt_tokens), prompt_reset_since) + remaining_prompt = all_tokens[nignored:][-remaining_prompt_length:] + decode_options["prompt"] = initial_prompt_tokens + remaining_prompt + else: + decode_options["prompt"] = all_tokens[prompt_reset_since:] + result: DecodingResult = decode_with_fallback(mel_segment) tokens = torch.tensor(result.tokens) @@ -529,6 +543,8 @@ def cli(): parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations") parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.") + parser.add_argument("--carry_initial_prompt", type=str2bool, default=False, help="if True, prepend initial_prompt to every internal decode() call. 
May reduce the effectiveness of condition_on_previous_text") + parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop") parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default") From 271445b2f24f00f8175c4fb7ae91876f7451dfc1 Mon Sep 17 00:00:00 2001 From: BotMaster3000 Date: Mon, 4 Nov 2024 08:00:30 +0100 Subject: [PATCH 3/4] Update README.md (#2379) Default now uses Turbo instead of Small --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 910b7db..1a661d7 100644 --- a/README.md +++ b/README.md @@ -85,7 +85,7 @@ The following command will transcribe speech in audio files, using the `turbo` m whisper audio.flac audio.mp3 audio.wav --model turbo -The default setting (which selects the `small` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option: +The default setting (which selects the `turbo` model) works well for transcribing English. 
To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option: whisper japanese.wav --language Japanese From 173ff7dd1d9fb1c4fddea0d41d704cfefeb8908c Mon Sep 17 00:00:00 2001 From: f1sh <71207078+YuZekai@users.noreply.github.com> Date: Wed, 13 Nov 2024 08:35:54 +0800 Subject: [PATCH 4/4] fix typo data/README.md (#2433) --- data/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/README.md b/data/README.md index 3b4aea1..fcb3200 100644 --- a/data/README.md +++ b/data/README.md @@ -45,7 +45,7 @@ We downloaded the [CHiME-5 dataset](https://spandh.dcs.shef.ac.uk//chime_challen ### AMI-IHM, AMI-SDM1 -We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 ad 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b). +We preprocessed the [AMI Corpus](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) by following the stage 0 and 2 of the [s5b recipe](https://github.com/kaldi-asr/kaldi/tree/master/egs/ami/s5b). ## Long-form English-only datasets