mirror of https://github.com/openai/whisper.git (synced 2025-11-23 22:15:58 +00:00)
commit 0efe52add2
parent 4aa8014589

    formatting fix
@@ -42,8 +42,8 @@ _ALIGNMENT_HEADS = {
     "medium": b"ABzY8B0Jh+0{>%R7}kK1fFL7w6%<-Pf*t^=N)Qr&0RR9",
     "large-v1": b"ABzY8r9j$a0{>%R7#4sLmoOs{s)o3~84-RPdcFk!JR<kSfC2yj",
     "large-v2": b"ABzY8zd+h!0{>%R7=D0pU<_bnWW*tkYAhobTNnu$jnkEkXqp)j;w1Tzk)UH3X%SZd&fFZ2fC2yj",
-    "large-v3": b'ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00',
-    "large": b'ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00',
+    "large-v3": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
+    "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
 }
 
 
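This hunk only normalizes quote style on the "large-v3" and "large" entries; the byte strings themselves are unchanged. For context (not part of this diff): in the upstream sources these dumps are base85-encoded, gzip-compressed boolean arrays marking which text-decoder attention heads are used for word-level timestamp alignment, decoded by Whisper.set_alignment_heads() in whisper/model.py. A minimal decoding sketch; the helper name and the explicit layer/head arguments are illustrative, not upstream API:

import base64
import gzip

import numpy as np


def decode_alignment_heads(dump: bytes, n_text_layer: int, n_text_head: int) -> np.ndarray:
    # base85 -> gzip -> flat boolean array, one flag per (layer, head) pair
    flat = np.frombuffer(gzip.decompress(base64.b85decode(dump)), dtype=bool)
    return flat.copy().reshape(n_text_layer, n_text_head)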
@@ -32,7 +32,9 @@ def detect_language(
         list of dictionaries containing the probability distribution over all languages.
     """
     if tokenizer is None:
-        tokenizer = get_tokenizer(model.is_multilingual, num_languages=model.num_languages)
+        tokenizer = get_tokenizer(
+            model.is_multilingual, num_languages=model.num_languages
+        )
     if (
         tokenizer.language is None
         or tokenizer.language_token not in tokenizer.sot_sequence
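The change here is purely cosmetic: the single-line get_tokenizer(...) call is rewrapped, and behavior is identical. For orientation, detect_language lives in whisper/decoding.py; a typical call looks like this sketch adapted from the project README (it assumes an audio.mp3 on disk):

import whisper

model = whisper.load_model("base")

# load audio, pad/trim it to 30 seconds, and compute the log-Mel spectrogram
audio = whisper.load_audio("audio.mp3")
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

# detect_language returns (language_tokens, language_probs)
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")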
@@ -251,7 +251,9 @@ class Tokenizer:
         keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
         """
         symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
-        symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
+        symbols += (
+            "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
+        )
 
         # symbols that may be a single token or multiple tokens depending on the tokenizer.
         # In case they're multiple tokens, suppress the first token, which is safe because:
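Another pure rewrap, consistent with black's default 88-character line length. These symbols feed the Tokenizer.non_speech_tokens property in whisper/tokenizer.py, whose token ids are suppressed during decoding when DecodingOptions(suppress_tokens="-1") is used. A quick way to inspect them (the exact ids depend on the tokenizer):

from whisper.tokenizer import get_tokenizer

tokenizer = get_tokenizer(multilingual=True, language="en", task="transcribe")

# token ids suppressed during decoding when suppress_tokens="-1"
print(sorted(tokenizer.non_speech_tokens)[:10])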
@@ -388,4 +390,6 @@ def get_tokenizer(
 
     encoding = get_encoding(name=encoding_name, num_languages=num_languages)
 
-    return Tokenizer(encoding=encoding, num_languages=num_languages, language=language, task=task)
+    return Tokenizer(
+        encoding=encoding, num_languages=num_languages, language=language, task=task
+    )
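As in the other hunks, only the wrapping changes; get_tokenizer still returns the same Tokenizer. Reusing the constructor call from the previous sketch, a small encode/decode round trip:

from whisper.tokenizer import get_tokenizer

tokenizer = get_tokenizer(multilingual=True, language="en", task="transcribe")

tokens = tokenizer.encode(" hello world")
print(tokens)                    # a short list of token ids
print(tokenizer.decode(tokens))  # " hello world"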