Update tokenizer.py (#1163)

Author: Jong Wook Kim (committed by GitHub)
Date:   2023-03-29 16:12:36 -04:00
parent 6dea21fd7f
commit b5851c6c40

@@ -6,7 +6,6 @@ from functools import cached_property, lru_cache
 from typing import Dict, List, Optional, Tuple
 
 import tiktoken
-from tiktoken_ext.openai_public import gpt2
 
 LANGUAGES = {
     "en": "english",
@@ -352,7 +351,7 @@ def get_encoding(name: str = "gpt2"):
     return tiktoken.Encoding(
         name=os.path.basename(vocab_path),
         explicit_n_vocab=n_vocab,
-        pat_str=gpt2()["pat_str"],
+        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
         mergeable_ranks=ranks,
         special_tokens=special_tokens,
     )
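
The change removes the import of `tiktoken_ext.openai_public` by inlining GPT-2's pre-tokenization regex as a string literal. As a minimal sketch (not part of this commit) of what the inlined `pat_str` does, assuming the third-party `regex` package, which supports the `\p{L}`/`\p{N}` Unicode property classes that the stdlib `re` module lacks:

# Illustration only: how the inlined pat_str pre-splits text before BPE merging.
# Assumes the third-party `regex` package (stdlib `re` cannot handle the
# \p{L} / \p{N} Unicode property classes used in the pattern).
import regex

PAT_STR = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

# Alternatives, in order: common English contractions; optionally
# space-prefixed runs of letters, digits, or other non-space symbols;
# whitespace not followed by a non-space character; any whitespace run.
pieces = regex.findall(PAT_STR, "Hello, world! It's 2023.")
print(pieces)
# ['Hello', ',', ' world', '!', ' It', "'s", ' 2023', '.']

Previously, `gpt2()["pat_str"]` built the full GPT-2 encoding via `tiktoken_ext.openai_public`, which fetches GPT-2's vocabulary files, just to read this one string; inlining the literal presumably avoids that import and download.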