whisper/examples/confidence_per_token.py
SinanAkkoyun 35530894b4 committed
2024-05-21 18:18:18 +02:00

58 lines
1.9 KiB
Python

# IMPORTANT: The lines below only make the local whisper directory importable as the package. Delete everything down to and including the "end of dev import" comment when whisper is installed normally.
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
# end of dev import
import whisper
import colorsys
from typing import List
from whisper.tokenizer import get_tokenizer
from colorama import init, Style
print('Loading model')
model = whisper.load_model("large")

# Load the audio and pad/trim it to fit 30 seconds.
print('Loading audio')
audio = whisper.load_audio("samples/your_audio.wav")
audio = whisper.pad_or_trim(audio)

# Make the log-Mel spectrogram and move it to the same device as the model.
# The mel-bin count is taken from the loaded checkpoint rather than left at
# the library default, so the spectrogram matches what the encoder expects
# (some "large" checkpoints use a different number of bins than the default).
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

detect_lang = False  # flip to True to let the model pick the language
language = "en"  # language used when detect_lang is False
if detect_lang:  # detect the spoken language
    print('Detecting language')
    _, probs = model.detect_language(mel)
    # Pick the most probable language once and reuse it for the message.
    language = max(probs, key=probs.get)
    print(f"Detected language: {language}")

# Decode the audio.
# NOTE(review): `language` is only consumed later when the tokenizer is built;
# DecodingOptions() is created with its defaults here. Confirm whether the
# decoder should also receive language=language.
print('Decoding audio')
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)
def print_colored_text(tokens: List[int], token_probs: List[float], tokenizer):
    """Print decoded tokens to stdout, each colored by its confidence.

    Every token id is decoded on its own and written in a 24-bit ANSI
    foreground color interpolated through HSV space from red (probability 0)
    to green (probability 1). A trailing newline is printed after the last
    token.

    Args:
        tokens: Token ids to display.
        token_probs: One probability per token, expected in [0, 1]. Values
            below 0 or NaN are shown as 0 (red); values above 1 are shown
            as 1 (green), so the hue never leaves the red-to-green range.
        tokenizer: Object exposing ``decode(list_of_ids)`` that returns the
            text for those ids.

    Tokens and probabilities are paired with ``zip``, so surplus entries in
    the longer of the two sequences are ignored.
    """
    init(autoreset=True)  # Initialize colorama

    for token_id, prob in zip(tokens, token_probs):
        # Decode one id at a time so each piece of text can get its own color.
        text = tokenizer.decode([token_id])

        # Clamp into [0, 1]; `not (x >= 0)` is also true for NaN.
        confidence = float(prob)
        if not confidence >= 0.0:
            confidence = 0.0
        elif confidence > 1.0:
            confidence = 1.0

        # Interpolate between red (hue 0) and green (hue 1/3) in HSV space.
        red, green, blue = (
            int(channel * 255)
            for channel in colorsys.hsv_to_rgb(confidence * (1/3), 1, 1)
        )
        color_code = f"\033[38;2;{red};{green};{blue}m"
        print(f"{color_code}{Style.BRIGHT}{text}{Style.RESET_ALL}", end="")

    print()  # finish the line
# Build a tokenizer from the model's multilingual flag, the selected language
# and the task stored in the decoding options.
tokenizer = get_tokenizer(
    multilingual=model.is_multilingual,
    language=language,
    task=options.task,
)

# Print the text with fancy confidence colors.
print_colored_text(result.tokens, result.token_probs, tokenizer)