mirror of
https://github.com/openai/whisper.git
synced 2025-03-30 14:28:27 +00:00
15 lines
524 B
Python
15 lines
524 B
Python
from whisper.tokenizer import get_tokenizer
|
|
|
|
|
|
def test_tokenizer():
    """Round-trip a Korean sentence through both tokenizer variants.

    Builds one tokenizer with ``multilingual=False`` and one with
    ``multilingual=True``, then checks that each decodes its own encoding
    back to the original string and that the ``multilingual=False``
    tokenizer needs strictly more tokens for this sentence.
    """
    english_only = get_tokenizer(multilingual=False)
    multilingual = get_tokenizer(multilingual=True)

    sample = "다람쥐 헌 쳇바퀴에 타고파"
    english_only_ids = english_only.encode(sample)
    multilingual_ids = multilingual.encode(sample)

    # Encoding followed by decoding must reproduce the input exactly.
    assert english_only.decode(english_only_ids) == sample
    assert multilingual.decode(multilingual_ids) == sample

    # The multilingual=True tokenizer is expected to be more compact here.
    assert len(english_only_ids) > len(multilingual_ids)
|