Mirror of https://github.com/openai/whisper.git (synced 2025-07-01 10:02:28 +00:00)
remove tiktoken pin (#1759)
This commit is contained in:
parent b9f17e1f2d · commit 746aaaeafa
requirements.txt
@@ -3,4 +3,4 @@ numpy
 torch
 tqdm
 more-itertools
-tiktoken==0.3.3
+tiktoken
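With the `==0.3.3` pin gone, pip is free to resolve whichever tiktoken release fits the rest of the environment. A minimal sketch (not part of the commit) for confirming which version was actually installed before running the test suite:

from importlib.metadata import version

# Report the resolved tiktoken version; with the pin removed this may be
# 0.3.3 or any newer release, depending on what pip selected.
print("tiktoken", version("tiktoken"))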
tests/test_tokenizer.py
@@ -1,7 +1,17 @@
+import pytest
+
 from whisper.tokenizer import get_tokenizer
 
 
-def test_tokenizer():
+@pytest.mark.parametrize("multilingual", [True, False])
+def test_tokenizer(multilingual):
+    tokenizer = get_tokenizer(multilingual=False)
+    assert tokenizer.sot in tokenizer.sot_sequence
+    assert len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens)
+    assert all(c < tokenizer.timestamp_begin for c in tokenizer.all_language_tokens)
+
+
+def test_multilingual_tokenizer():
     gpt2_tokenizer = get_tokenizer(multilingual=False)
     multilingual_tokenizer = get_tokenizer(multilingual=True)
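For context, the properties asserted by the new test can also be inspected interactively. The sketch below is illustrative only (it assumes whisper is importable) and simply mirrors the attribute names used in the diff above:

from whisper.tokenizer import get_tokenizer

tokenizer = get_tokenizer(multilingual=True)
# The start-of-transcript token appears in the SOT sequence, every language
# code has exactly one special token, and all language tokens sit below the
# first timestamp token.
print(tokenizer.sot in tokenizer.sot_sequence)                                  # True
print(len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens))  # True
print(max(tokenizer.all_language_tokens) < tokenizer.timestamp_begin)           # True

The parametrized test itself can be run with, for example, pytest tests/test_tokenizer.py.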
@@ -20,5 +30,5 @@ def test_split_on_unicode():
     tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
     words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)
 
-    assert words == [" elle", " est", " l", "'", "�", "é", "rit", "oire"]
+    assert words == [" elle", " est", " l", "'", "\ufffd", "é", "rit", "oire"]
     assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]
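The change in this hunk is cosmetic: the literal U+FFFD replacement character in the expected output was swapped for its escape sequence, which compares equal, so the assertion's meaning is unchanged. A quick check (illustrative, not from the commit):

# "\ufffd" is the U+FFFD replacement character; the escaped spelling is
# equal to the literal one, so the test still accepts the same output.
assert "\ufffd" == "�"
print(repr("\ufffd"))  # '�'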