diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..be006de --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,13 @@ +# Keep GitHub Actions up to date with GitHub's Dependabot... +# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot +# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem +version: 2 +updates: + - package-ecosystem: github-actions + directory: / + groups: + github-actions: + patterns: + - "*" # Group all Actions updates into a single larger pull request + schedule: + interval: weekly diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 4b91a2a..715c8eb 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -8,14 +8,14 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions-ecosystem/action-regex-match@v2 id: regex-match with: text: ${{ github.event.head_commit.message }} regex: '^Release ([^ ]+)' - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.8' - name: Install dependencies @@ -24,7 +24,7 @@ jobs: pip install setuptools wheel twine - name: Release if: ${{ steps.regex-match.outputs.match != '' }} - uses: softprops/action-gh-release@v1 + uses: softprops/action-gh-release@v2 with: tag_name: v${{ steps.regex-match.outputs.group1 }} - name: Build and publish @@ -33,5 +33,5 @@ jobs: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} run: | - python setup.py sdist + python -m build --sdist twine upload dist/* diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 84b81cc..3b53de8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,19 +11,19 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Fetch base branch run: git fetch origin ${{ github.base_ref }} - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: - python-version: "3.8" + python-version: "3.9" architecture: x64 - name: Get pip cache dir id: pip-cache run: | echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT - name: pip/pre-commit cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: | ${{ steps.pip-cache.outputs.dir }} @@ -33,13 +33,14 @@ jobs: ${{ runner.os }}-pip-pre-commit - name: pre-commit run: | - pip install -U pre-commit + pip install --upgrade pre-commit pre-commit install --install-hooks pre-commit run --all-files whisper-test: needs: pre-commit runs-on: ubuntu-latest strategy: + fail-fast: false matrix: include: - python-version: '3.8' @@ -64,12 +65,15 @@ jobs: pytorch-version: 2.4.1 numpy-requirement: "'numpy'" - python-version: '3.12' - pytorch-version: 2.5.0 + pytorch-version: 2.5.1 + numpy-requirement: "'numpy'" + - python-version: '3.13' + pytorch-version: 2.5.1 numpy-requirement: "'numpy'" steps: - - uses: conda-incubator/setup-miniconda@v2 + - uses: conda-incubator/setup-miniconda@v3 - run: conda install -n test ffmpeg python=${{ matrix.python-version }} - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH - run: pip3 install .["dev"] ${{ matrix.numpy-requirement }} torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple - run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3f5a74b..514f940 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v5.0.0 hooks: - id: check-json - id: end-of-file-fixer @@ -11,17 +11,17 @@ repos: - id: check-added-large-files args: [--maxkb=4096] - repo: https://github.com/psf/black - rev: 23.7.0 + rev: 25.1.0 hooks: - id: black - repo: https://github.com/pycqa/isort - rev: 5.12.0 + rev: 6.0.0 hooks: - id: isort name: isort (python) args: ["--profile", "black", "-l", "88", "--trailing-comma", "--multi-line", "3"] - repo: https://github.com/pycqa/flake8.git - rev: 6.0.0 + rev: 7.1.1 hooks: - id: flake8 types: [python] diff --git a/README.md b/README.md index 1a661d7..196b48f 100644 --- a/README.md +++ b/README.md @@ -77,25 +77,35 @@ Whisper's performance varies widely depending on the language. The figure below ![WER breakdown by language](https://github.com/openai/whisper/assets/266841/f4619d66-1058-4005-8f67-a9d811b77c62) - - ## Command-line usage The following command will transcribe speech in audio files, using the `turbo` model: - whisper audio.flac audio.mp3 audio.wav --model turbo +```bash +whisper audio.flac audio.mp3 audio.wav --model turbo +``` -The default setting (which selects the `turbo` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option: +The default setting (which selects the `turbo` model) works well for transcribing English. However, **the `turbo` model is not trained for translation tasks**. If you need to **translate non-English speech into English**, use one of the **multilingual models** (`tiny`, `base`, `small`, `medium`, `large`) instead of `turbo`. - whisper japanese.wav --language Japanese +For example, to transcribe an audio file containing non-English speech, you can specify the language: -Adding `--task translate` will translate the speech into English: +```bash +whisper japanese.wav --language Japanese +``` - whisper japanese.wav --language Japanese --task translate +To **translate** speech into English, use: + +```bash +whisper japanese.wav --model medium --language Japanese --task translate +``` + +> **Note:** The `turbo` model will return the original language even if `--task translate` is specified. Use `medium` or `large` for the best translation results. Run the following to view all available options: - whisper --help +```bash +whisper --help +``` See [tokenizer.py](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py) for the list of all available languages. @@ -126,7 +136,7 @@ audio = whisper.load_audio("audio.mp3") audio = whisper.pad_or_trim(audio) # make log-Mel spectrogram and move to the same device as the model -mel = whisper.log_mel_spectrogram(audio).to(model.device) +mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device) # detect the spoken language _, probs = model.detect_language(mel) diff --git a/notebooks/LibriSpeech.ipynb b/notebooks/LibriSpeech.ipynb index 3d90e65..602bbe4 100644 --- a/notebooks/LibriSpeech.ipynb +++ b/notebooks/LibriSpeech.ipynb @@ -949,7 +949,8 @@ "style": "IPY_MODEL_039b53f2702c4179af7e0548018d0588", "value": " 164/164 [05:08<00:00, 1.86s/it]" } - } + }, + "state": {} } } }, diff --git a/notebooks/Multilingual_ASR.ipynb b/notebooks/Multilingual_ASR.ipynb index 2d32e0e..f19e3e0 100644 --- a/notebooks/Multilingual_ASR.ipynb +++ b/notebooks/Multilingual_ASR.ipynb @@ -4219,7 +4219,8 @@ "_view_name": "StyleView", "description_width": "" } - } + }, + "state": {} } } }, diff --git a/pyproject.toml b/pyproject.toml index 84637eb..21b90e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,50 @@ +[build-system] +build-backend = "setuptools.build_meta" + +requires = [ "setuptools>=61.2" ] + +[project] +name = "openai-whisper" +description = "Robust Speech Recognition via Large-Scale Weak Supervision" +readme.content-type = "text/markdown" +readme.file = "README.md" +license = { text = "MIT" } +authors = [ { name = "OpenAI" } ] +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dynamic = [ "version" ] +dependencies = [ + "more-itertools", + "numba", + "numpy", + "tiktoken", + "torch", + "tqdm", + "triton>=2; (platform_machine=='x86_64' and sys_platform=='linux') or sys_platform=='linux2'", +] +optional-dependencies.dev = [ "black", "flake8", "isort", "pytest", "scipy" ] +urls = { Homepage = "https://github.com/openai/whisper" } +scripts.whisper = "whisper.transcribe:cli" + +[tool.setuptools] +py-modules = [ "whisper" ] +include-package-data = true + +[tool.setuptools.dynamic] +version = { attr = "whisper.version.__version__" } + +[tool.setuptools.packages.find] +exclude = [ "tests*" ] +namespaces = false + [tool.black] [tool.isort] @@ -5,4 +52,3 @@ profile = "black" include_trailing_comma = true line_length = 88 multi_line_output = 3 - diff --git a/setup.py b/setup.py deleted file mode 100644 index 73c4eb8..0000000 --- a/setup.py +++ /dev/null @@ -1,42 +0,0 @@ -import platform -import sys -from pathlib import Path - -import pkg_resources -from setuptools import find_packages, setup - - -def read_version(fname="whisper/version.py"): - exec(compile(open(fname, encoding="utf-8").read(), fname, "exec")) - return locals()["__version__"] - - -requirements = [] -if sys.platform.startswith("linux") and platform.machine() == "x86_64": - requirements.append("triton>=2.0.0") - -setup( - name="openai-whisper", - py_modules=["whisper"], - version=read_version(), - description="Robust Speech Recognition via Large-Scale Weak Supervision", - long_description=open("README.md", encoding="utf-8").read(), - long_description_content_type="text/markdown", - readme="README.md", - python_requires=">=3.8", - author="OpenAI", - url="https://github.com/openai/whisper", - license="MIT", - packages=find_packages(exclude=["tests*"]), - install_requires=[ - str(r) - for r in pkg_resources.parse_requirements( - Path(__file__).with_name("requirements.txt").open() - ) - ], - entry_points={ - "console_scripts": ["whisper=whisper.transcribe:cli"], - }, - include_package_data=True, - extras_require={"dev": ["pytest", "scipy", "black", "flake8", "isort"]}, -) diff --git a/whisper/audio.py b/whisper/audio.py index cf6c66a..826250f 100644 --- a/whisper/audio.py +++ b/whisper/audio.py @@ -122,7 +122,7 @@ def log_mel_spectrogram( The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz n_mels: int - The number of Mel-frequency filters, only 80 is supported + The number of Mel-frequency filters, only 80 and 128 are supported padding: int Number of zero samples to pad to the right @@ -132,7 +132,7 @@ def log_mel_spectrogram( Returns ------- - torch.Tensor, shape = (80, n_frames) + torch.Tensor, shape = (n_mels, n_frames) A Tensor that contains the Mel spectrogram """ if not torch.is_tensor(audio): diff --git a/whisper/normalizers/basic.py b/whisper/normalizers/basic.py index a824032..8690ae7 100644 --- a/whisper/normalizers/basic.py +++ b/whisper/normalizers/basic.py @@ -30,15 +30,19 @@ def remove_symbols_and_diacritics(s: str, keep=""): and drop any diacritics (category 'Mn' and some manual mappings) """ return "".join( - c - if c in keep - else ADDITIONAL_DIACRITICS[c] - if c in ADDITIONAL_DIACRITICS - else "" - if unicodedata.category(c) == "Mn" - else " " - if unicodedata.category(c)[0] in "MSP" - else c + ( + c + if c in keep + else ( + ADDITIONAL_DIACRITICS[c] + if c in ADDITIONAL_DIACRITICS + else ( + "" + if unicodedata.category(c) == "Mn" + else " " if unicodedata.category(c)[0] in "MSP" else c + ) + ) + ) for c in unicodedata.normalize("NFKD", s) ) diff --git a/whisper/transcribe.py b/whisper/transcribe.py index 8eb6a71..0a4cc36 100644 --- a/whisper/transcribe.py +++ b/whisper/transcribe.py @@ -214,6 +214,8 @@ def transcribe( if ( no_speech_threshold is not None and decode_result.no_speech_prob > no_speech_threshold + and logprob_threshold is not None + and decode_result.avg_logprob < logprob_threshold ): needs_fallback = False # silence if not needs_fallback: diff --git a/whisper/triton_ops.py b/whisper/triton_ops.py index edd4564..13d417b 100644 --- a/whisper/triton_ops.py +++ b/whisper/triton_ops.py @@ -60,7 +60,7 @@ def median_kernel(filter_width: int): tl.store(y_ptr + offsets, MIDDLE_ROW_HERE, mask=mask) # noqa: F821 kernel = triton.JITFunction(kernel.fn) - kernel.src = kernel.src.replace( + new_kernel = kernel.src.replace( " LOAD_ALL_ROWS_HERE", "\n".join( [ @@ -69,7 +69,8 @@ def median_kernel(filter_width: int): ] ), ) - kernel.src = kernel.src.replace( + + new_kernel = new_kernel.replace( " BUBBLESORT_HERE", "\n\n".join( [ @@ -90,7 +91,14 @@ def median_kernel(filter_width: int): ] ), ) - kernel.src = kernel.src.replace("MIDDLE_ROW_HERE", f"row{filter_width // 2}") + + new_kernel = new_kernel.replace("MIDDLE_ROW_HERE", f"row{filter_width // 2}") + + if hasattr(kernel, "_unsafe_update_src") is True: + kernel._unsafe_update_src(new_kernel) + kernel.hash = None + else: + kernel.src = new_kernel return kernel diff --git a/whisper/utils.py b/whisper/utils.py index 9b9b138..13792f7 100644 --- a/whisper/utils.py +++ b/whisper/utils.py @@ -209,9 +209,11 @@ class SubtitlesWriter(ResultWriter): yield start, end, "".join( [ - re.sub(r"^(\s*)(.*)$", r"\1\2", word) - if j == i - else word + ( + re.sub(r"^(\s*)(.*)$", r"\1\2", word) + if j == i + else word + ) for j, word in enumerate(all_words) ] )