# IMPORTANT: This is just for using the local whisper dir as the package directly.
# Delete until next comment when just installing whisper normally.
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
# end of dev import

import colorsys
from typing import List

import whisper
from colorama import Style, init
from whisper.tokenizer import get_tokenizer

# Defaults for the demo run; edit these or pass other values to main().
MODEL_NAME = "large"
AUDIO_PATH = "samples/your_audio.wav"
DETECT_LANGUAGE = False  # when True, ask the model which language is spoken
DEFAULT_LANGUAGE = "en"  # language used when DETECT_LANGUAGE is False


def print_colored_text(tokens: List[int], token_probs: List[float], tokenizer):
    """Print decoded tokens, each coloured by its confidence.

    Every token id is decoded on its own with ``tokenizer.decode`` and written
    in a 24-bit ANSI foreground colour whose hue runs from red (probability 0)
    to green (probability 1). A newline is emitted after the last token.

    Args:
        tokens: Token ids to display.
        token_probs: One probability per token, paired with ``tokens`` by
            position. If the lengths differ, the extra items are ignored.
        tokenizer: Object exposing ``decode(list_of_ids) -> str``.
    """
    init(autoreset=True)  # Initialize colorama

    pieces = []
    for token_id, prob in zip(tokens, token_probs):
        text = tokenizer.decode([token_id])
        # Clamp so an out-of-range value cannot push the hue past green or
        # yield RGB components outside 0-255.
        hue = min(1.0, max(0.0, prob)) * (1 / 3)
        # Interpolate between red and green in the HSV color space
        r, g, b = (int(channel * 255) for channel in colorsys.hsv_to_rgb(hue, 1, 1))
        color_code = f"\033[38;2;{r};{g};{b}m"
        pieces.append(f"{color_code}{Style.BRIGHT}{text}{Style.RESET_ALL}")

    # One write for the whole line instead of one print() call per token.
    print("".join(pieces))


def main(
    model_name: str = MODEL_NAME,
    audio_path: str = AUDIO_PATH,
    detect_lang: bool = DETECT_LANGUAGE,
    language: str = DEFAULT_LANGUAGE,
):
    """Decode the first 30 seconds of an audio file and print it with confidence colours.

    Args:
        model_name: Name passed to ``whisper.load_model``.
        audio_path: Path of the audio file to load.
        detect_lang: When True, replace ``language`` with the most probable
            language reported by ``model.detect_language``.
        language: Language code handed to the tokenizer used for display.
    """
    print("Loading model")
    model = whisper.load_model(model_name)

    print("Loading audio")
    # load audio and pad/trim it to fit 30 seconds
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)

    # make log-Mel spectrogram and move to the same device as the model;
    # the number of Mel bins is taken from the model rather than the default
    mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

    if detect_lang:
        # detect the spoken language
        print("Detecting language")
        _, probs = model.detect_language(mel)
        language = max(probs, key=probs.get)
        print(f"Detected language: {language}")

    print("Decoding audio")
    # decode the audio
    # NOTE(review): the options are left at their defaults, so `language` only
    # reaches the display tokenizer below, not the decoder - confirm that is
    # intended.
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)

    tokenizer = get_tokenizer(
        multilingual=model.is_multilingual,
        language=language,
        task=options.task,
    )
    # print text with fancy confidence colors
    # (`token_probs` is presumably provided by the local whisper checkout - TODO confirm)
    print_colored_text(result.tokens, result.token_probs, tokenizer)


if __name__ == "__main__":
    main()