# # Installing Whisper
#
# Install the Python packages needed to use Whisper models and to evaluate the
# transcription results. pip is launched through the running interpreter
# (``sys.executable -m pip``) so the packages are installed into the same
# environment this kernel imports from; a failed install raises immediately
# instead of surfacing later as an ImportError.
import subprocess
import sys

for requirement in ("git+https://github.com/openai/whisper.git", "jiwer"):
    subprocess.check_call([sys.executable, "-m", "pip", "install", requirement])


# # Loading the LibriSpeech dataset
#
# Load the test-clean split of the LibriSpeech corpus using torchaudio.
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import whisper
import torchaudio

from tqdm.notebook import tqdm


# Single source of truth for where tensors and the model live.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16


class LibriSpeech(torch.utils.data.Dataset):
    """
    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.
    It will drop the last few seconds of a very small portion of the utterances.

    Each item is a ``(mel, text)`` pair: the log-Mel spectrogram computed by
    ``whisper.log_mel_spectrogram`` on ``device`` and the reference transcript
    returned by torchaudio.

    Args:
        split: LibriSpeech subset name, forwarded to torchaudio as ``url``.
        device: device the padded/trimmed audio is moved to before the
            spectrogram is computed.
        n_mels: number of mel bins requested from ``whisper.log_mel_spectrogram``.
            The default of 80 keeps the behaviour of the original call, which
            relied on Whisper's own default; pass another value for checkpoints
            that expect a different feature size.
    """

    # Sample rate every utterance is required to have.
    SAMPLE_RATE = 16000

    def __init__(self, split="test-clean", device=DEVICE, n_mels=80):
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            root=os.path.expanduser("~/.cache"),
            url=split,
            download=True,
        )
        self.device = device
        self.n_mels = n_mels

    def __len__(self):
        """Return the number of utterances in the wrapped split."""
        return len(self.dataset)

    def __getitem__(self, item):
        """Return ``(mel, text)`` for utterance ``item``.

        Raises:
            ValueError: if the utterance is not sampled at ``SAMPLE_RATE`` Hz.
        """
        # torchaudio yields a 6-tuple; only the first three fields are used.
        audio, sample_rate, text, _, _, _ = self.dataset[item]
        # Explicit check rather than ``assert`` so it still runs under ``python -O``.
        if sample_rate != self.SAMPLE_RATE:
            raise ValueError(
                f"expected {self.SAMPLE_RATE} Hz audio, got {sample_rate} Hz "
                f"for item {item}"
            )
        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)
        mel = whisper.log_mel_spectrogram(audio, n_mels=self.n_mels)

        return (mel, text)


dataset = LibriSpeech("test-clean")
loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE)


# # Running inference on the dataset using a base Whisper model
#
# The following will take a few minutes to transcribe all utterances in the dataset.

# Load the model onto the same device the dataset places its spectrograms on.
model = whisper.load_model("base.en", device=DEVICE)
n_parameters = sum(p.numel() for p in model.parameters())
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {n_parameters:,} parameters."
)

# predict without timestamps for short-form transcription
# Half precision is only requested on CUDA; on the CPU fallback decoding runs
# in float32 instead of relying on half-precision CPU kernels.
options = whisper.DecodingOptions(
    language="en",
    without_timestamps=True,
    fp16=(DEVICE == "cuda"),
)

hypotheses = []
references = []

for mels, texts in tqdm(loader):
    results = model.decode(mels, options)
    hypotheses.extend(result.text for result in results)
    references.extend(texts)

data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))
data


# # Calculating the word error rate
#
# Now, we use our English normalizer implementation to standardize the
# transcription and calculate the WER.
import jiwer
from whisper.normalizers import EnglishTextNormalizer

normalizer = EnglishTextNormalizer()

# Normalise both sides the same way so the WER compares like with like.
data["hypothesis_clean"] = [normalizer(text) for text in data["hypothesis"]]
data["reference_clean"] = [normalizer(text) for text in data["reference"]]
data

wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))

print(f"WER: {wer * 100:.2f} %")
"mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.9" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "039b53f2702c4179af7e0548018d0588": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "06b9aa5f49fa44ba8c93b647dc7db224": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "FloatProgressModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a0d10a42c753453283e5219c22239337", "max": 164, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_09f4cb79ff86465aaf48b0de24869af9", "value": 164 } }, "09a29a91f58d4462942505a3cc415801": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_83391f98a240490987c397048fc1a0d4", "IPY_MODEL_06b9aa5f49fa44ba8c93b647dc7db224", "IPY_MODEL_da9c231ee67047fb89073c95326b72a5" ], "layout": "IPY_MODEL_48da931ebe7f4fd299f8c98c7d2460ff" } }, "09f4cb79ff86465aaf48b0de24869af9": { 
"model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "ProgressStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "1b9cecf5b3584fba8258a81d4279a25b": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "39f5a6ae8ba74c8598f9c6d5b8ad2d65": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, 
"48da931ebe7f4fd299f8c98c7d2460ff": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7a901f447c1d477bb49f954e0feacedd": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, 
"max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "83391f98a240490987c397048fc1a0d4": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7a901f447c1d477bb49f954e0feacedd", "placeholder": "​", "style": "IPY_MODEL_39f5a6ae8ba74c8598f9c6d5b8ad2d65", "value": "100%" } }, "a0d10a42c753453283e5219c22239337": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, 
"visibility": null, "width": null } }, "da9c231ee67047fb89073c95326b72a5": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "HTMLModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1b9cecf5b3584fba8258a81d4279a25b", "placeholder": "​", "style": "IPY_MODEL_039b53f2702c4179af7e0548018d0588", "value": " 164/164 [05:08<00:00, 1.86s/it]" } } } } }, "nbformat": 4, "nbformat_minor": 1 }