Merge branch 'main' into main

This commit is contained in:
Jong Wook Kim 2025-06-25 17:05:03 -07:00 committed by GitHub
commit 8c3f996e1e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 136 additions and 87 deletions

13
.github/dependabot.yml vendored Normal file
View File

@ -0,0 +1,13 @@
# Keep GitHub Actions up to date with GitHub's Dependabot...
# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file#package-ecosystem
version: 2
updates:
- package-ecosystem: github-actions
directory: /
groups:
github-actions:
patterns:
- "*" # Group all Actions updates into a single larger pull request
schedule:
interval: weekly

View File

@ -8,14 +8,14 @@ jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- uses: actions-ecosystem/action-regex-match@v2
id: regex-match
with:
text: ${{ github.event.head_commit.message }}
regex: '^Release ([^ ]+)'
- name: Set up Python
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: '3.8'
- name: Install dependencies
@ -24,7 +24,7 @@ jobs:
pip install setuptools wheel twine
- name: Release
if: ${{ steps.regex-match.outputs.match != '' }}
uses: softprops/action-gh-release@v1
uses: softprops/action-gh-release@v2
with:
tag_name: v${{ steps.regex-match.outputs.group1 }}
- name: Build and publish
@ -33,5 +33,5 @@ jobs:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
run: |
python setup.py sdist
python -m build --sdist
twine upload dist/*

View File

@ -11,19 +11,19 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Fetch base branch
run: git fetch origin ${{ github.base_ref }}
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.9"
architecture: x64
- name: Get pip cache dir
id: pip-cache
run: |
echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
- name: pip/pre-commit cache
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: |
${{ steps.pip-cache.outputs.dir }}
@ -33,13 +33,14 @@ jobs:
${{ runner.os }}-pip-pre-commit
- name: pre-commit
run: |
pip install -U pre-commit
pip install --upgrade pre-commit
pre-commit install --install-hooks
pre-commit run --all-files
whisper-test:
needs: pre-commit
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- python-version: '3.8'
@ -64,12 +65,15 @@ jobs:
pytorch-version: 2.4.1
numpy-requirement: "'numpy'"
- python-version: '3.12'
pytorch-version: 2.5.0
pytorch-version: 2.5.1
numpy-requirement: "'numpy'"
- python-version: '3.13'
pytorch-version: 2.5.1
numpy-requirement: "'numpy'"
steps:
- uses: conda-incubator/setup-miniconda@v2
- uses: conda-incubator/setup-miniconda@v3
- run: conda install -n test ffmpeg python=${{ matrix.python-version }}
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- run: echo "$CONDA/envs/test/bin" >> $GITHUB_PATH
- run: pip3 install .["dev"] ${{ matrix.numpy-requirement }} torch==${{ matrix.pytorch-version }}+cpu --index-url https://download.pytorch.org/whl/cpu --extra-index-url https://pypi.org/simple
- run: pytest --durations=0 -vv -k 'not test_transcribe or test_transcribe[tiny] or test_transcribe[tiny.en]' -m 'not requires_cuda'

View File

@ -1,6 +1,6 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
rev: v5.0.0
hooks:
- id: check-json
- id: end-of-file-fixer
@ -11,17 +11,17 @@ repos:
- id: check-added-large-files
args: [--maxkb=4096]
- repo: https://github.com/psf/black
rev: 23.7.0
rev: 25.1.0
hooks:
- id: black
- repo: https://github.com/pycqa/isort
rev: 5.12.0
rev: 6.0.0
hooks:
- id: isort
name: isort (python)
args: ["--profile", "black", "-l", "88", "--trailing-comma", "--multi-line", "3"]
- repo: https://github.com/pycqa/flake8.git
rev: 6.0.0
rev: 7.1.1
hooks:
- id: flake8
types: [python]

View File

@ -77,25 +77,35 @@ Whisper's performance varies widely depending on the language. The figure below
![WER breakdown by language](https://github.com/openai/whisper/assets/266841/f4619d66-1058-4005-8f67-a9d811b77c62)
## Command-line usage
The following command will transcribe speech in audio files, using the `turbo` model:
whisper audio.flac audio.mp3 audio.wav --model turbo
```bash
whisper audio.flac audio.mp3 audio.wav --model turbo
```
The default setting (which selects the `turbo` model) works well for transcribing English. To transcribe an audio file containing non-English speech, you can specify the language using the `--language` option:
The default setting (which selects the `turbo` model) works well for transcribing English. However, **the `turbo` model is not trained for translation tasks**. If you need to **translate non-English speech into English**, use one of the **multilingual models** (`tiny`, `base`, `small`, `medium`, `large`) instead of `turbo`.
whisper japanese.wav --language Japanese
For example, to transcribe an audio file containing non-English speech, you can specify the language:
Adding `--task translate` will translate the speech into English:
```bash
whisper japanese.wav --language Japanese
```
whisper japanese.wav --language Japanese --task translate
To **translate** speech into English, use:
```bash
whisper japanese.wav --model medium --language Japanese --task translate
```
> **Note:** The `turbo` model will return the original language even if `--task translate` is specified. Use `medium` or `large` for the best translation results.
Run the following to view all available options:
whisper --help
```bash
whisper --help
```
See [tokenizer.py](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py) for the list of all available languages.
@ -126,7 +136,7 @@ audio = whisper.load_audio("audio.mp3")
audio = whisper.pad_or_trim(audio)
# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
# detect the spoken language
_, probs = model.detect_language(mel)

View File

@ -949,7 +949,8 @@
"style": "IPY_MODEL_039b53f2702c4179af7e0548018d0588",
"value": " 164/164 [05:08<00:00, 1.86s/it]"
}
}
},
"state": {}
}
}
},

View File

@ -4219,7 +4219,8 @@
"_view_name": "StyleView",
"description_width": ""
}
}
},
"state": {}
}
}
},

View File

@ -1,3 +1,50 @@
[build-system]
build-backend = "setuptools.build_meta"
requires = [ "setuptools>=61.2" ]
[project]
name = "openai-whisper"
description = "Robust Speech Recognition via Large-Scale Weak Supervision"
readme.content-type = "text/markdown"
readme.file = "README.md"
license = { text = "MIT" }
authors = [ { name = "OpenAI" } ]
requires-python = ">=3.8"
classifiers = [
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
dynamic = [ "version" ]
dependencies = [
"more-itertools",
"numba",
"numpy",
"tiktoken",
"torch",
"tqdm",
"triton>=2; (platform_machine=='x86_64' and sys_platform=='linux') or sys_platform=='linux2'",
]
optional-dependencies.dev = [ "black", "flake8", "isort", "pytest", "scipy" ]
urls = { Homepage = "https://github.com/openai/whisper" }
scripts.whisper = "whisper.transcribe:cli"
[tool.setuptools]
py-modules = [ "whisper" ]
include-package-data = true
[tool.setuptools.dynamic]
version = { attr = "whisper.version.__version__" }
[tool.setuptools.packages.find]
exclude = [ "tests*" ]
namespaces = false
[tool.black]
[tool.isort]
@ -5,4 +52,3 @@ profile = "black"
include_trailing_comma = true
line_length = 88
multi_line_output = 3

View File

@ -1,42 +0,0 @@
import platform
import sys
from pathlib import Path
import pkg_resources
from setuptools import find_packages, setup
def read_version(fname="whisper/version.py"):
exec(compile(open(fname, encoding="utf-8").read(), fname, "exec"))
return locals()["__version__"]
requirements = []
if sys.platform.startswith("linux") and platform.machine() == "x86_64":
requirements.append("triton>=2.0.0")
setup(
name="openai-whisper",
py_modules=["whisper"],
version=read_version(),
description="Robust Speech Recognition via Large-Scale Weak Supervision",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
readme="README.md",
python_requires=">=3.8",
author="OpenAI",
url="https://github.com/openai/whisper",
license="MIT",
packages=find_packages(exclude=["tests*"]),
install_requires=[
str(r)
for r in pkg_resources.parse_requirements(
Path(__file__).with_name("requirements.txt").open()
)
],
entry_points={
"console_scripts": ["whisper=whisper.transcribe:cli"],
},
include_package_data=True,
extras_require={"dev": ["pytest", "scipy", "black", "flake8", "isort"]},
)

View File

@ -122,7 +122,7 @@ def log_mel_spectrogram(
The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
n_mels: int
The number of Mel-frequency filters, only 80 is supported
The number of Mel-frequency filters, only 80 and 128 are supported
padding: int
Number of zero samples to pad to the right
@ -132,7 +132,7 @@ def log_mel_spectrogram(
Returns
-------
torch.Tensor, shape = (80, n_frames)
torch.Tensor, shape = (n_mels, n_frames)
A Tensor that contains the Mel spectrogram
"""
if not torch.is_tensor(audio):

View File

@ -30,15 +30,19 @@ def remove_symbols_and_diacritics(s: str, keep=""):
and drop any diacritics (category 'Mn' and some manual mappings)
"""
return "".join(
c
if c in keep
else ADDITIONAL_DIACRITICS[c]
if c in ADDITIONAL_DIACRITICS
else ""
if unicodedata.category(c) == "Mn"
else " "
if unicodedata.category(c)[0] in "MSP"
else c
(
c
if c in keep
else (
ADDITIONAL_DIACRITICS[c]
if c in ADDITIONAL_DIACRITICS
else (
""
if unicodedata.category(c) == "Mn"
else " " if unicodedata.category(c)[0] in "MSP" else c
)
)
)
for c in unicodedata.normalize("NFKD", s)
)

View File

@ -214,6 +214,8 @@ def transcribe(
if (
no_speech_threshold is not None
and decode_result.no_speech_prob > no_speech_threshold
and logprob_threshold is not None
and decode_result.avg_logprob < logprob_threshold
):
needs_fallback = False # silence
if not needs_fallback:

View File

@ -60,7 +60,7 @@ def median_kernel(filter_width: int):
tl.store(y_ptr + offsets, MIDDLE_ROW_HERE, mask=mask) # noqa: F821
kernel = triton.JITFunction(kernel.fn)
kernel.src = kernel.src.replace(
new_kernel = kernel.src.replace(
" LOAD_ALL_ROWS_HERE",
"\n".join(
[
@ -69,7 +69,8 @@ def median_kernel(filter_width: int):
]
),
)
kernel.src = kernel.src.replace(
new_kernel = new_kernel.replace(
" BUBBLESORT_HERE",
"\n\n".join(
[
@ -90,7 +91,14 @@ def median_kernel(filter_width: int):
]
),
)
kernel.src = kernel.src.replace("MIDDLE_ROW_HERE", f"row{filter_width // 2}")
new_kernel = new_kernel.replace("MIDDLE_ROW_HERE", f"row{filter_width // 2}")
if hasattr(kernel, "_unsafe_update_src") is True:
kernel._unsafe_update_src(new_kernel)
kernel.hash = None
else:
kernel.src = new_kernel
return kernel

View File

@ -209,9 +209,11 @@ class SubtitlesWriter(ResultWriter):
yield start, end, "".join(
[
re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
if j == i
else word
(
re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
if j == i
else word
)
for j, word in enumerate(all_words)
]
)