From 395db62ccd0a78aff19a12f3a680c9fc74eceba9 Mon Sep 17 00:00:00 2001
From: Sinan <shuriken209master@googlemail.com>
Date: Thu, 16 Mar 2023 12:46:17 +0100
Subject: [PATCH 1/7] Update README.md

---
 README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/README.md b/README.md
index eba82ce..85f1aed 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,17 @@
+# Sinan:
+
+Install:
+
+    pip install git+https://github.com/SinanAkkoyun/openai-whisper.git
+
+To update the package to the latest version of this repository, please run:
+
+    pip install --upgrade --no-deps --force-reinstall git+https://github.com/SinanAkkoyun/openai-whisper.git
+
+
+
+
+# OpenAI:
 # Whisper
 
 [[Blog]](https://openai.com/blog/whisper)

From db30d12efbb1616a8e6a22e60431cd7c7a8716c2 Mon Sep 17 00:00:00 2001
From: SinanAkkoyun <akkoyunsinan2@gmx.de>
Date: Sun, 19 Mar 2023 13:28:07 +0100
Subject: [PATCH 2/7] Add per-token probabilities to decoding results and a confidence example

---
 .gitignore                       |  3 ++
 examples/confidence_per_token.py | 57 ++++++++++++++++++++++++++++++++
 whisper/decoding.py              | 40 ++++++++++++++--------
 3 files changed, 87 insertions(+), 13 deletions(-)
 create mode 100644 examples/confidence_per_token.py

diff --git a/.gitignore b/.gitignore
index 7ae8fab..9f5ed86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,6 @@ thumbs.db
 .DS_Store
 .idea
 
+.venv/
+
+samples/
diff --git a/examples/confidence_per_token.py b/examples/confidence_per_token.py
new file mode 100644
index 0000000..11e1c3c
--- /dev/null
+++ b/examples/confidence_per_token.py
@@ -0,0 +1,57 @@
+# IMPORTANT: The lines below make the local whisper directory importable as the package. Delete everything up to the "end of dev import" comment if whisper is installed normally.
+import sys
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
+# end of dev import
+import whisper
+
+import colorsys
+from typing import List
+from whisper.tokenizer import get_tokenizer
+from colorama import init, Style
+
+
+print('Loading model')
+model = whisper.load_model("large")
+
+
+print('Loading audio') # load audio and pad/trim it to fit 30 seconds
+audio = whisper.load_audio("samples/your_audio.wav")
+audio = whisper.pad_or_trim(audio)
+
+
+mel = whisper.log_mel_spectrogram(audio).to(model.device) # make log-Mel spectrogram and move to the same device as the model
+
+
+detect_lang = False
+language = "en"
+if detect_lang: # detect the spoken language
+    print('Detecting language')
+    _, probs = model.detect_language(mel)
+    print(f"Detected language: {max(probs, key=probs.get)}")
+    language=max(probs, key=probs.get)
+
+
+print('Decoding audio') # decode the audio
+options = whisper.DecodingOptions()
+result = whisper.decode(model, mel, options)
+
+
+def print_colored_text(tokens: List[int], token_probs: List[float], tokenizer):
+    init(autoreset=True)  # Initialize colorama
+    text_tokens = [tokenizer.decode([t]) for t in tokens]
+
+    for token, prob in zip(text_tokens, token_probs):
+        # Interpolate between red and green in the HSV color space
+        r, g, b = colorsys.hsv_to_rgb(prob * (1/3), 1, 1)
+        r, g, b = int(r * 255), int(g * 255), int(b * 255)
+        color_code = f"\033[38;2;{r};{g};{b}m"
+
+        colored_token = f"{color_code}{Style.BRIGHT}{token}{Style.RESET_ALL}"
+        print(colored_token, end="")
+
+    print()
+
+
+tokenizer = get_tokenizer(multilingual=model.is_multilingual, language=language, task=options.task)
+print_colored_text(result.tokens, result.token_probs, tokenizer)  # print text with fancy confidence colors
diff --git a/whisper/decoding.py b/whisper/decoding.py
index 81cd845..f8e8200 100644
--- a/whisper/decoding.py
+++ b/whisper/decoding.py
@@ -118,6 +118,7 @@ class DecodingResult:
     language: str
     language_probs: Optional[Dict[str, float]] = None
     tokens: List[int] = field(default_factory=list)
+    token_probs: List[float] = field(default_factory=list)
     text: str = ""
     avg_logprob: float = np.nan
     no_speech_prob: float = np.nan
@@ -211,7 +212,7 @@ class TokenDecoder:
         """Initialize any stateful variables for decoding a new sequence"""
 
     def update(
-        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
+        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor, token_probs: Tensor
     ) -> Tuple[Tensor, bool]:
         """Specify how to select the next token, based on the current trace and logits
 
@@ -238,7 +239,7 @@ class TokenDecoder:
         raise NotImplementedError
 
     def finalize(
-        self, tokens: Tensor, sum_logprobs: Tensor
+        self, tokens: Tensor, sum_logprobs: Tensor, token_probs: Tensor
     ) -> Tuple[Sequence[Sequence[Tensor]], List[List[float]]]:
         """Finalize search and return the final candidate sequences
 
@@ -268,7 +269,7 @@ class GreedyDecoder(TokenDecoder):
         self.eot = eot
 
     def update(
-        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor
+        self, tokens: Tensor, logits: Tensor, sum_logprobs: Tensor, token_probs: Tensor
     ) -> Tuple[Tensor, bool]:
         if self.temperature == 0:
             next_tokens = logits.argmax(dim=-1)
@@ -276,19 +277,28 @@ class GreedyDecoder(TokenDecoder):
             next_tokens = Categorical(logits=logits / self.temperature).sample()
 
         logprobs = F.log_softmax(logits.float(), dim=-1)
+        probs = torch.exp(logprobs)
         current_logprobs = logprobs[torch.arange(logprobs.shape[0]), next_tokens]
         sum_logprobs += current_logprobs * (tokens[:, -1] != self.eot)
 
         next_tokens[tokens[:, -1] == self.eot] = self.eot
         tokens = torch.cat([tokens, next_tokens[:, None]], dim=-1)
 
-        completed = (tokens[:, -1] == self.eot).all()
-        return tokens, completed
+        current_token_probs = probs[torch.arange(probs.shape[0]), next_tokens]
+        token_probs = torch.cat([token_probs, current_token_probs[:, None]], dim=-1)
 
-    def finalize(self, tokens: Tensor, sum_logprobs: Tensor):
+        # token_logits = torch.stack([logits[k, next_tokens[k]] for k in range(next_tokens .shape[0])], dim=0)
+        # or use logprobs, the log softmax of the logits
+        # return it along with tokens and completed
+
+        completed = (tokens[:, -1] == self.eot).all()
+        return tokens, completed, token_probs
+
+    def finalize(self, tokens: Tensor, sum_logprobs: Tensor, token_probs: Tensor):
         # make sure each sequence has at least one EOT token at the end
         tokens = F.pad(tokens, (0, 1), value=self.eot)
-        return tokens, sum_logprobs.tolist()
+        token_probs = F.pad(token_probs, (0, 1), value=0) # TODO: confirm 0 is an acceptable probability for the padded EOT slot
+        return tokens, sum_logprobs.tolist(), token_probs.tolist()
 
 
 class BeamSearchDecoder(TokenDecoder):
@@ -374,7 +384,7 @@ class BeamSearchDecoder(TokenDecoder):
         )
         return tokens, completed
 
-    def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor):
+    def finalize(self, preceding_tokens: Tensor, sum_logprobs: Tensor, token_probs: Tensor):
         # collect all finished sequences, including patience, and add unfinished ones if not enough
         sum_logprobs = sum_logprobs.cpu()
         for i, sequences in enumerate(self.finished_sequences):
@@ -668,6 +678,8 @@ class DecodingTask:
         sum_logprobs: Tensor = torch.zeros(n_batch, device=audio_features.device)
         no_speech_probs = [np.nan] * n_batch
 
+        token_probs = torch.zeros_like(tokens).float()
+
         try:
             for i in range(self.sample_len):
                 logits = self.inference.logits(tokens, audio_features)
@@ -686,14 +698,14 @@ class DecodingTask:
                     logit_filter.apply(logits, tokens)
 
                 # expand the tokens tensor with the selected next tokens
-                tokens, completed = self.decoder.update(tokens, logits, sum_logprobs)
+                tokens, completed, token_probs = self.decoder.update(tokens, logits, sum_logprobs, token_probs)
 
                 if completed or tokens.shape[-1] > self.n_ctx:
                     break
         finally:
             self.inference.cleanup_caching()
 
-        return tokens, sum_logprobs, no_speech_probs
+        return tokens, sum_logprobs, no_speech_probs, token_probs
 
     @torch.no_grad()
     def run(self, mel: Tensor) -> List[DecodingResult]:
@@ -721,7 +733,7 @@ class DecodingTask:
         tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)
 
         # call the main sampling loop
-        tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens)
+        tokens, sum_logprobs, no_speech_probs, token_probs = self._main_loop(audio_features, tokens)
 
         # reshape the tensors to have (n_audio, n_group) as the first two dimensions
         audio_features = audio_features[:: self.n_group]
@@ -732,7 +744,7 @@ class DecodingTask:
         sum_logprobs = sum_logprobs.reshape(n_audio, self.n_group)
 
         # get the final candidates for each group, and slice between the first sampled token and EOT
-        tokens, sum_logprobs = self.decoder.finalize(tokens, sum_logprobs)
+        tokens, sum_logprobs, token_probs = self.decoder.finalize(tokens, sum_logprobs, token_probs)
         tokens: List[List[Tensor]] = [
             [t[self.sample_begin : (t == tokenizer.eot).nonzero()[0, 0]] for t in s]
             for s in tokens
@@ -755,6 +767,7 @@ class DecodingTask:
             audio_features,
             avg_logprobs,
             no_speech_probs,
+            token_probs
         )
         if len(set(map(len, fields))) != 1:
             raise RuntimeError(f"inconsistent result lengths: {list(map(len, fields))}")
@@ -769,8 +782,9 @@ class DecodingTask:
                 no_speech_prob=no_speech_prob,
                 temperature=self.options.temperature,
                 compression_ratio=compression_ratio(text),
+                token_probs=token_probs
             )
-            for text, language, tokens, features, avg_logprob, no_speech_prob in zip(
+            for text, language, tokens, features, avg_logprob, no_speech_prob, token_probs in zip(
                 *fields
             )
         ]

From b24d29355d91a3b30f63948dbe31182fc029c5bd Mon Sep 17 00:00:00 2001
From: SinanAkkoyun <akkoyunsinan2@gmx.de>
Date: Sun, 19 Mar 2023 13:39:10 +0100
Subject: [PATCH 3/7] Remove fork install instructions from README.md

---
 README.md | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 85f1aed..9ea3a38 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,3 @@
-# Sinan:
-
-Install:
-
-    pip install git+https://github.com/SinanAkkoyun/openai-whisper.git
-
-To update the package to the latest version of this repository, please run:
-
-    pip install --upgrade --no-deps --force-reinstall git+https://github.com/SinanAkkoyun/openai-whisper.git
-
-
-
-
-# OpenAI:
 # Whisper
 
 [[Blog]](https://openai.com/blog/whisper)
@@ -158,4 +144,4 @@ Please use the [🙌 Show and tell](https://github.com/openai/whisper/discussion
 
 ## License
 
-Whisper's code and model weights are released under the MIT License. See [LICENSE](https://github.com/openai/whisper/blob/main/LICENSE) for further details.
+Whisper's code and model weights are released under the MIT License. See [LICENSE](https://github.com/openai/whisper/blob/main/LICENSE) for further details.
\ No newline at end of file

From 5e6714ef111e2eab0adf572fc3442a30a16fe4af Mon Sep 17 00:00:00 2001
From: SinanAkkoyun <akkoyunsinan2@gmx.de>
Date: Thu, 23 Mar 2023 02:20:01 +0100
Subject: [PATCH 4/7] Return colored text from get_colored_text instead of printing it

---
 examples/confidence_per_token.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/confidence_per_token.py b/examples/confidence_per_token.py
index 11e1c3c..a6c90d0 100644
--- a/examples/confidence_per_token.py
+++ b/examples/confidence_per_token.py
@@ -37,21 +37,24 @@ options = whisper.DecodingOptions()
 result = whisper.decode(model, mel, options)
 
 
-def print_colored_text(tokens: List[int], token_probs: List[float], tokenizer):
+def get_colored_text(tokens: List[int], token_probs: List[float], tokenizer, prompt: str=""):
     init(autoreset=True)  # Initialize colorama
     text_tokens = [tokenizer.decode([t]) for t in tokens]
+    token_probs = token_probs[-len(text_tokens):]
 
-    for token, prob in zip(text_tokens, token_probs):
+    output_text = ""
+    for i, (token, prob) in enumerate(zip(text_tokens, token_probs)):
         # Interpolate between red and green in the HSV color space
         r, g, b = colorsys.hsv_to_rgb(prob * (1/3), 1, 1)
         r, g, b = int(r * 255), int(g * 255), int(b * 255)
         color_code = f"\033[38;2;{r};{g};{b}m"
 
         colored_token = f"{color_code}{Style.BRIGHT}{token}{Style.RESET_ALL}"
-        print(colored_token, end="")
+        output_text += colored_token
 
-    print()
+    return output_text
 
 
 tokenizer = get_tokenizer(multilingual=model.is_multilingual, language=language, task=options.task)
-print_colored_text(result.tokens, result.token_probs, tokenizer)  # print text with fancy confidence colors
+print(get_colored_text(result.tokens, result.token_probs, tokenizer))  # print text with fancy confidence colors
+# HINT: when using a prompt, you must also pass it to get_colored_text

From 2ff7dbb41a3b21cd8f6c6fa26a9d300f1d95f4d8 Mon Sep 17 00:00:00 2001
From: SinanAkkoyun <akkoyunsinan2@gmx.de>
Date: Thu, 23 Mar 2023 02:25:21 +0100
Subject: [PATCH 5/7] Move token_probs length trimming from the example into whisper/decoding.py

---
 examples/confidence_per_token.py | 2 +-
 whisper/decoding.py              | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/confidence_per_token.py b/examples/confidence_per_token.py
index a6c90d0..8d8ed50 100644
--- a/examples/confidence_per_token.py
+++ b/examples/confidence_per_token.py
@@ -40,7 +40,7 @@ result = whisper.decode(model, mel, options)
 def get_colored_text(tokens: List[int], token_probs: List[float], tokenizer, prompt: str=""):
     init(autoreset=True)  # Initialize colorama
     text_tokens = [tokenizer.decode([t]) for t in tokens]
-    token_probs = token_probs[-len(text_tokens):]
+    # token_probs = token_probs[-len(text_tokens):]
 
     output_text = ""
     for i, (token, prob) in enumerate(zip(text_tokens, token_probs)):
diff --git a/whisper/decoding.py b/whisper/decoding.py
index f8e8200..29c7801 100644
--- a/whisper/decoding.py
+++ b/whisper/decoding.py
@@ -750,6 +750,9 @@ class DecodingTask:
             for s in tokens
         ]
 
+        # fix token_probs length
+        token_probs = token_probs[-len(tokens):]
+
         # select the top-ranked sample in each group
         selected = self.sequence_ranker.rank(tokens, sum_logprobs)
         tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)]

From 6750a98bdd2072626ff352474950f5f36f1919e5 Mon Sep 17 00:00:00 2001
From: SinanAkkoyun <akkoyunsinan2@gmx.de>
Date: Thu, 23 Mar 2023 02:42:25 +0100
Subject: [PATCH 6/7] Fixed token_prob length! :)

---
 examples/confidence_per_token.py | 1 -
 whisper/decoding.py              | 5 +----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/confidence_per_token.py b/examples/confidence_per_token.py
index 8d8ed50..1150c30 100644
--- a/examples/confidence_per_token.py
+++ b/examples/confidence_per_token.py
@@ -40,7 +40,6 @@ result = whisper.decode(model, mel, options)
 def get_colored_text(tokens: List[int], token_probs: List[float], tokenizer, prompt: str=""):
     init(autoreset=True)  # Initialize colorama
     text_tokens = [tokenizer.decode([t]) for t in tokens]
-    # token_probs = token_probs[-len(text_tokens):]
 
     output_text = ""
     for i, (token, prob) in enumerate(zip(text_tokens, token_probs)):
diff --git a/whisper/decoding.py b/whisper/decoding.py
index 29c7801..4ded71b 100644
--- a/whisper/decoding.py
+++ b/whisper/decoding.py
@@ -750,9 +750,6 @@ class DecodingTask:
             for s in tokens
         ]
 
-        # fix token_probs length
-        token_probs = token_probs[-len(tokens):]
-
         # select the top-ranked sample in each group
         selected = self.sequence_ranker.rank(tokens, sum_logprobs)
         tokens: List[List[int]] = [t[i].tolist() for i, t in zip(selected, tokens)]
@@ -785,7 +782,7 @@ class DecodingTask:
                 no_speech_prob=no_speech_prob,
                 temperature=self.options.temperature,
                 compression_ratio=compression_ratio(text),
-                token_probs=token_probs
+                token_probs=token_probs[-len(tokens):]
             )
             for text, language, tokens, features, avg_logprob, no_speech_prob, token_probs in zip(
                 *fields

From b589d3b46714080dc269488ae252cec0ca11112e Mon Sep 17 00:00:00 2001
From: Sinan <shuriken209master@googlemail.com>
Date: Thu, 23 Mar 2023 14:59:00 +0100
Subject: [PATCH 7/7] Fixed perf bug with color

---
 examples/confidence_per_token.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/confidence_per_token.py b/examples/confidence_per_token.py
index 1150c30..0bf2a30 100644
--- a/examples/confidence_per_token.py
+++ b/examples/confidence_per_token.py
@@ -38,7 +38,7 @@ result = whisper.decode(model, mel, options)
 
 
 def get_colored_text(tokens: List[int], token_probs: List[float], tokenizer, prompt: str=""):
-    init(autoreset=True)  # Initialize colorama
+    init(autoreset=False)  # Initialize colorama
     text_tokens = [tokenizer.decode([t]) for t in tokens]
 
     output_text = ""