From 395db62ccd0a78aff19a12f3a680c9fc74eceba9 Mon Sep 17 00:00:00 2001 From: Sinan Date: Thu, 16 Mar 2023 12:46:17 +0100 Subject: [PATCH 1/7] Update README.md --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index eba82ce..85f1aed 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,17 @@ +# Sinan: + +Install: + + pip install git+https://github.com/SinanAkkoyun/openai-whisper.git + +To update the package to the latest version of this repository, please run: + + pip install --upgrade --no-deps --force-reinstall git+https://github.com/SinanAkkoyun/openai-whisper.git + + + + +# OpenAI: # Whisper [[Blog]](https://openai.com/blog/whisper) From db30d12efbb1616a8e6a22e60431cd7c7a8716c2 Mon Sep 17 00:00:00 2001 From: SinanAkkoyun Date: Sun, 19 Mar 2023 13:28:07 +0100 Subject: [PATCH 2/7] committed --- .gitignore | 3 ++ examples/confidence_per_token.py | 57 ++++++++++++++++++++++++++++++++ whisper/decoding.py | 40 ++++++++++++++-------- 3 files changed, 87 insertions(+), 13 deletions(-) create mode 100644 examples/confidence_per_token.py diff --git a/.gitignore b/.gitignore index 7ae8fab..9f5ed86 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,6 @@ thumbs.db .DS_Store .idea +.venv/ + +samples/ diff --git a/examples/confidence_per_token.py b/examples/confidence_per_token.py new file mode 100644 index 0000000..11e1c3c --- /dev/null +++ b/examples/confidence_per_token.py @@ -0,0 +1,57 @@ +# IMPORTANT: This is just for using the local whisper dir as the package directly. Delete until next comment when just installing whisper normally. 
+import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) +# end of dev import +import whisper + +import colorsys +from typing import List +from whisper.tokenizer import get_tokenizer +from colorama import init, Style + + +print('Loading model') +model = whisper.load_model("large") + + +print('Loading audio') # load audio and pad/trim it to fit 30 seconds +audio = whisper.load_audio("samples/your_audio.wav") +audio = whisper.pad_or_trim(audio) + + +mel = whisper.log_mel_spectrogram(audio).to(model.device) # make log-Mel spectrogram and move to the same device as the model + + +detect_lang = False +language = "en" +if detect_lang: # detect the spoken language + print('Detecting language') + _, probs = model.detect_language(mel) + print(f"Detected language: {max(probs, key=probs.get)}") + language=max(probs, key=probs.get) + + +print('Decoding audio') # decode the audio +options = whisper.DecodingOptions() +result = whisper.decode(model, mel, options) + + +def print_colored_text(tokens: List[int], token_probs: List[float], tokenizer): + init(autoreset=True) # Initialize colorama + text_tokens = [tokenizer.decode([t]) for t in tokens] + + for token, prob in zip(text_tokens, token_probs): + # Interpolate between red and green in the HSV color space + r, g, b = colorsys.hsv_to_rgb(prob * (1/3), 1, 1) + r, g, b = int(r * 255), int(g * 255), int(b * 255) + color_code = f"\033[38;2;{r};{g};{b}m" + + colored_token = f"{color_code}{Style.BRIGHT}{token}{Style.RESET_ALL}" + print(colored_token, end="") + + print() + + +tokenizer = get_tokenizer(multilingual=model.is_multilingual, language=language, task=options.task) +print_colored_text(result.tokens, result.token_probs, tokenizer) # print text with fancy confidence colors diff --git a/whisper/decoding.py b/whisper/decoding.py index 81cd845..f8e8200 100644 --- a/whisper/decoding.py +++ b/whisper/decoding.py @@ -118,6 +118,7 @@ class DecodingResult: language: str language_probs: 
Optional[Dict[str, float]] = None tokens: List[int] = field(default_factory=list) + token_probs: List[float] = field(default_factory=list) text: str = "" avg_logprob: float = np.nan no_speech_prob: float = np.nan @@ -211,7 +212,7 @@ class TokenDecoder: """Initialize any stateful variables for decoding a new sequence""" def update( - self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor + self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor, token_probs: Tensor ) -> Tuple[Tensor, bool]: """Specify how to select the next token, based on the current trace and logits @@ -238,7 +239,7 @@ class TokenDecoder: raise NotImplementedError def finalize( - self, tokens: Tensor, sum_logprobs: Tensor + self, tokens: Tensor, sum_logprobs: Tensor, token_probs: Tensor ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]: """Finalize search and return the final candidate sequences @@ -268,7 +269,7 @@ class GreedyDecoder(TokenDecoder): self.eot = eot def update( - self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor + self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor, token_probs: Tensor ) -> Tuple[Tensor, bool]: if self.temperature == 0: next_tokens = logits.argmax(dim=-1) @@ -276,19 +277,28 @@ class GreedyDecoder(TokenDecoder): next_tokens = Categorical(logits=logits / self.temperature).sample() logprobs = F.log_softmax(logits.float(), dim=-1) + probs = torch.exp(logprobs) current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens] sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot) next_tokens[tokens[:, -1] == self.eot] = self.eot tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1) - completed = (tokens[:, -1] == self.eot).all() - return tokens, completed + current_token_probs = probs[torch.arange(probs.shape[0]), next_tokens] + token_probs = torch.cat([token_probs, current_token_probs[:, None]], dim=-1) - def finalize(self, tokens: Tensor, sum_logprobs: Tensor): + # token_logits = torch.stack([logits[k, next_tokens[k]] 
for k in range(next_tokens .shape[0])], dim=0) + # or use logprobs, the log softmax of the logits + # return it along with tokens and completed + + completed = (tokens[:, -1] == self.eot).all() + return tokens, completed, token_probs + + def finalize(self, tokens: Tensor, sum_logprobs: Tensor, token_probs: Tensor): # make sure each sequence has at least one EOT token at the end tokens = F.pad(tokens, (0, 1), value=self.eot) - return tokens, sum_logprobs.tolist() + token_probs = F.pad(token_probs, (0, 1), value=0) # 0 ok? + return tokens, sum_logprobs.tolist(), token_probs.tolist() class BeamSearchDecoder(TokenDecoder): @@ -374,7 +384,7 @@ class BeamSearchDecoder(TokenDecoder): ) return tokens, completed - def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor): + def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor, token_probs: Tensor): # collect all finished sequences, including patience, and add unfinished ones if not enough sum_logprobs = sum_logprobs.cpu() for i, sequences in enumerate(self.finished_sequences): @@ -668,6 +678,8 @@ class DecodingTask: sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device) no_speech_probs = [np.nan] * n_batch + token_probs = torch.zeros_like(tokens).float() + try: for i in range(self.sample_len): logits = self.inference.logits(tokens, audio_features) @@ -686,14 +698,14 @@ class DecodingTask: logit_filter.apply(logits, tokens) # expand the tokens tensor with the selected next tokens - tokens, completed = self.decoder.update(tokens, logits, sum_logprobs) + tokens, completed, token_probs = self.decoder.update(tokens, logits, sum_logprobs, token_probs) if completed or tokens.shape[-1] > self.n_ctx: break finally: self.inference.cleanup_caching() - return tokens, sum_logprobs, no_speech_probs + return tokens, sum_logprobs, no_speech_probs, token_probs @torch.no_grad() def run(self, mel: Tensor) -> List[DecodingResult]: @@ -721,7 +733,7 @@ class DecodingTask: tokens = 
tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device) # call the main sampling loop - tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens) + tokens, sum_logprobs, no_speech_probs, token_probs = self._main_loop(audio_features, tokens) # reshape the tensors to have (n_audio, n_group) as the first two dimensions audio_features = audio_features[:: self.n_group] @@ -732,7 +744,7 @@ class DecodingTask: sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group) # get the final candidates for each group, and slice between the first sampled token and EOT - tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs) + tokens, sum_logprobs, token_probs = self.decoder.finalize(tokens, sum_logprobs, token_probs) tokens: List[List[Tensor]] = [ [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s] for s in tokens @@ -755,6 +767,7 @@ class DecodingTask: audio_features, avg_logprobs, no_speech_probs, + token_probs ) if len(set(map(len, fields))) != 1: raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}") @@ -769,8 +782,9 @@ class DecodingTask: no_speech_prob=no_speech_prob, temperature=self.options.temperature, compression_ratio=compression_ratio(text), + token_probs=token_probs ) - for text, language, tokens, features, avg_logprob, no_speech_prob in zip( + for text, language, tokens, features, avg_logprob, no_speech_prob, token_probs in zip( *fields ) ] From b24d29355d91a3b30f63948dbe31182fc029c5bd Mon Sep 17 00:00:00 2001 From: SinanAkkoyun Date: Sun, 19 Mar 2023 13:39:10 +0100 Subject: [PATCH 3/7] committed --- README.md | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/README.md b/README.md index 85f1aed..9ea3a38 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,3 @@ -# Sinan: - -Install: - - pip install git+https://github.com/SinanAkkoyun/openai-whisper.git - -To update the package to the latest version of this repository, please run: - - pip install 
--upgrade --no-deps --force-reinstall git+https://github.com/SinanAkkoyun/openai-whisper.git - - - - -# OpenAI: # Whisper [[Blog]](https://openai.com/blog/whisper) @@ -158,4 +144,4 @@ Please use the [🙌 Show and tell](https://github.com/openai/whisper/discussion ## License -Whisper's code and model weights are released under the MIT License. See [LICENSE](https://github.com/openai/whisper/blob/main/LICENSE) for further details. +Whisper's code and model weights are released under the MIT License. See [LICENSE](https://github.com/openai/whisper/blob/main/LICENSE) for further details. \ No newline at end of file From 5e6714ef111e2eab0adf572fc3442a30a16fe4af Mon Sep 17 00:00:00 2001 From: SinanAkkoyun Date: Thu, 23 Mar 2023 02:20:01 +0100 Subject: [PATCH 4/7] committed --- examples/confidence_per_token.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/confidence_per_token.py b/examples/confidence_per_token.py index 11e1c3c..a6c90d0 100644 --- a/examples/confidence_per_token.py +++ b/examples/confidence_per_token.py @@ -37,21 +37,24 @@ options = whisper.DecodingOptions() result = whisper.decode(model, mel, options) -def print_colored_text(tokens: List[int], token_probs: List[float], tokenizer): +def get_colored_text(tokens: List[int], token_probs: List[float], tokenizer, prompt: str=""): init(autoreset=True) # Initialize colorama text_tokens = [tokenizer.decode([t]) for t in tokens] + token_probs = token_probs[-len(text_tokens):] - for token, prob in zip(text_tokens, token_probs): + output_text = "" + for i, (token, prob) in enumerate(zip(text_tokens, token_probs)): # Interpolate between red and green in the HSV color space r, g, b = colorsys.hsv_to_rgb(prob * (1/3), 1, 1) r, g, b = int(r * 255), int(g * 255), int(b * 255) color_code = f"\033[38;2;{r};{g};{b}m" colored_token = f"{color_code}{Style.BRIGHT}{token}{Style.RESET_ALL}" - print(colored_token, end="") + output_text += colored_token - print() + return output_text 
tokenizer = get_tokenizer(multilingual=model.is_multilingual, language=language, task=options.task) -print_colored_text(result.tokens, result.token_probs, tokenizer) # print text with fancy confidence colors +print(get_colored_text(result.tokens, result.token_probs, tokenizer)) # print text with fancy confidence colors +# HINT: when using a prompt, you must provide it in the get_colored_text as well From 2ff7dbb41a3b21cd8f6c6fa26a9d300f1d95f4d8 Mon Sep 17 00:00:00 2001 From: SinanAkkoyun Date: Thu, 23 Mar 2023 02:25:21 +0100 Subject: [PATCH 5/7] committed --- examples/confidence_per_token.py | 2 +- whisper/decoding.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/confidence_per_token.py b/examples/confidence_per_token.py index a6c90d0..8d8ed50 100644 --- a/examples/confidence_per_token.py +++ b/examples/confidence_per_token.py @@ -40,7 +40,7 @@ result = whisper.decode(model, mel, options) def get_colored_text(tokens: List[int], token_probs: List[float], tokenizer, prompt: str=""): init(autoreset=True) # Initialize colorama text_tokens = [tokenizer.decode([t]) for t in tokens] - token_probs = token_probs[-len(text_tokens):] + # token_probs = token_probs[-len(text_tokens):] output_text = "" for i, (token, prob) in enumerate(zip(text_tokens, token_probs)): diff --git a/whisper/decoding.py b/whisper/decoding.py index f8e8200..29c7801 100644 --- a/whisper/decoding.py +++ b/whisper/decoding.py @@ -750,6 +750,9 @@ class DecodingTask: for s in tokens ] + # fix token_probs length + token_probs = token_probs[-len(tokens):] + # select the top-ranked sample in each group selected = self.sequence_ranker.rank(tokens, sum_logprobs) tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)] From 6750a98bdd2072626ff352474950f5f36f1919e5 Mon Sep 17 00:00:00 2001 From: SinanAkkoyun Date: Thu, 23 Mar 2023 02:42:25 +0100 Subject: [PATCH 6/7] Fixed token_prob length! 
:) --- examples/confidence_per_token.py | 1 - whisper/decoding.py | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/confidence_per_token.py b/examples/confidence_per_token.py index 8d8ed50..1150c30 100644 --- a/examples/confidence_per_token.py +++ b/examples/confidence_per_token.py @@ -40,7 +40,6 @@ result = whisper.decode(model, mel, options) def get_colored_text(tokens: List[int], token_probs: List[float], tokenizer, prompt: str=""): init(autoreset=True) # Initialize colorama text_tokens = [tokenizer.decode([t]) for t in tokens] - # token_probs = token_probs[-len(text_tokens):] output_text = "" for i, (token, prob) in enumerate(zip(text_tokens, token_probs)): diff --git a/whisper/decoding.py b/whisper/decoding.py index 29c7801..4ded71b 100644 --- a/whisper/decoding.py +++ b/whisper/decoding.py @@ -750,9 +750,6 @@ class DecodingTask: for s in tokens ] - # fix token_probs length - token_probs = token_probs[-len(tokens):] - # select the top-ranked sample in each group selected = self.sequence_ranker.rank(tokens, sum_logprobs) tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)] @@ -785,7 +782,7 @@ class DecodingTask: no_speech_prob=no_speech_prob, temperature=self.options.temperature, compression_ratio=compression_ratio(text), - token_probs=token_probs + token_probs=token_probs[self.sample_begin : self.sample_begin + len(tokens)] ) for text, language, tokens, features, avg_logprob, no_speech_prob, token_probs in zip( *fields From b589d3b46714080dc269488ae252cec0ca11112e Mon Sep 17 00:00:00 2001 From: Sinan Date: Thu, 23 Mar 2023 14:59:00 +0100 Subject: [PATCH 7/7] Fixed perf bug with color --- examples/confidence_per_token.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/confidence_per_token.py b/examples/confidence_per_token.py index 1150c30..0bf2a30 100644 --- a/examples/confidence_per_token.py +++ b/examples/confidence_per_token.py @@ -38,7 +38,7 @@ result = whisper.decode(model, mel, options) def 
get_colored_text(tokens: List[int], token_probs: List[float], tokenizer, prompt: str=""): - init(autoreset=True) # Initialize colorama + init(autoreset=False) # Initialize colorama text_tokens = [tokenizer.decode([t]) for t in tokens] output_text = ""