whisper/notebooks/LibriSpeech.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "v5hvo8QWN-a9"
   },
   "source": [
    "# Installing Whisper\n",
    "\n",
    "The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "ZsJUxc0aRsAf"
   },
   "source": [
    "! pip install git+https://github.com/openai/whisper.git\n",
    "! pip install jiwer"
   ],
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "1IMEkgyagYto"
   },
   "source": [
    "# Loading the LibriSpeech dataset\n",
    "\n",
    "The following will load the test-clean split of the LibriSpeech corpus using torchaudio."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "id": "3CqtR2Fi5-vP"
   },
   "source": [
    "import os\n",
    "import numpy as np\n",
    "\n",
    "try:\n",
    "    import tensorflow  # required in Colab to avoid protobuf compatibility issues\n",
    "except ImportError:\n",
    "    pass\n",
    "\n",
    "import torch\n",
    "import pandas as pd\n",
    "import whisper\n",
    "import torchaudio\n",
    "\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "\n",
    "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\""
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "id": "GuCCB2KYOJCE"
   },
   "source": [
    "class LibriSpeech(torch.utils.data.Dataset):\n",
    "    \"\"\"\n",
    "    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.\n",
    "    It will drop the last few seconds of a very small portion of the utterances.\n",
    "    \"\"\"\n",
    "    def __init__(self, split=\"test-clean\", device=DEVICE):\n",
    "        self.dataset = torchaudio.datasets.LIBRISPEECH(\n",
    "            root=os.path.expanduser(\"~/.cache\"),\n",
    "            url=split,\n",
    "            download=True,\n",
    "        )\n",
    "        self.device = device\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.dataset)\n",
    "\n",
    "    def __getitem__(self, item):\n",
    "        audio, sample_rate, text, _, _, _ = self.dataset[item]\n",
    "        assert sample_rate == 16000\n",
    "        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)\n",
    "        mel = whisper.log_mel_spectrogram(audio)\n",
    "        \n",
    "        return (mel, text)"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "id": "-YcRU5jqNqo2"
   },
   "source": [
    "dataset = LibriSpeech(\"test-clean\")\n",
    "loader = torch.utils.data.DataLoader(dataset, batch_size=16)"
   ],
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0ljocCNuUAde"
   },
   "source": [
    "# Running inference on the dataset using a base Whisper model\n",
    "\n",
    "The following will take a few minutes to transcribe all utterances in the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "_PokfNJtOYNu",
    "outputId": "2c53ec44-bc93-4107-b4fa-214e3f71fe8e"
   },
   "source": [
    "model = whisper.load_model(\"base.en\")\n",
    "print(\n",
    "    f\"Model is {'multilingual' if model.is_multilingual else 'English-only'} \"\n",
    "    f\"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters.\"\n",
    ")"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "source": [
    "# predict without timestamps for short-form transcription\n",
    "options = whisper.DecodingOptions(language=\"en\", without_timestamps=True)"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 49,
     "referenced_widgets": [
      "09a29a91f58d4462942505a3cc415801",
      "83391f98a240490987c397048fc1a0d4",
      "06b9aa5f49fa44ba8c93b647dc7db224",
      "da9c231ee67047fb89073c95326b72a5",
      "48da931ebe7f4fd299f8c98c7d2460ff",
      "7a901f447c1d477bb49f954e0feacedd",
      "39f5a6ae8ba74c8598f9c6d5b8ad2d65",
      "a0d10a42c753453283e5219c22239337",
      "09f4cb79ff86465aaf48b0de24869af9",
      "1b9cecf5b3584fba8258a81d4279a25b",
      "039b53f2702c4179af7e0548018d0588"
     ]
    },
    "id": "7OWTn_KvNk59",
    "outputId": "a813a792-3c91-4144-f11f-054fd6778023"
   },
   "source": [
    "hypotheses = []\n",
    "references = []\n",
    "\n",
    "for mels, texts in tqdm(loader):\n",
    "    results = model.decode(mels, options)\n",
    "    hypotheses.extend([result.text for result in results])\n",
    "    references.extend(texts)"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 424
    },
    "id": "4nTyynELQ42j",
    "outputId": "1c72d25a-3e87-4c60-a8d1-1da9d2f73bd7"
   },
   "source": [
    "data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))\n",
    "data"
   ],
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HPppEJRXX4ox"
   },
   "source": [
    "# Calculating the word error rate\n",
    "\n",
    "Now, we use our English normalizer implementation to standardize the transcription and calculate the WER."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "id": "dl-KBDflMhrg"
   },
   "source": [
    "import jiwer\n",
    "from whisper.normalizers import EnglishTextNormalizer\n",
    "\n",
    "normalizer = EnglishTextNormalizer()"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 641
    },
    "id": "6-O048q4WI4o",
    "outputId": "f2089bc9-f535-441e-f192-26e52ae82b5e"
   },
   "source": [
    "data[\"hypothesis_clean\"] = [normalizer(text) for text in data[\"hypothesis\"]]\n",
    "data[\"reference_clean\"] = [normalizer(text) for text in data[\"reference\"]]\n",
    "data"
   ],
   "outputs": []
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "EBGSITeBYPTT",
    "outputId": "7b3dbe7c-a37e-4a07-a50a-b27d5f88b68f"
   },
   "source": [
    "wer = jiwer.wer(list(data[\"reference_clean\"]), list(data[\"hypothesis_clean\"]))\n",
    "\n",
    "print(f\"WER: {wer * 100:.2f} %\")"
   ],
   "outputs": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "provenance": []
  },
  "gpuClass": "standard",
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "039b53f2702c4179af7e0548018d0588": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "06b9aa5f49fa44ba8c93b647dc7db224": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_a0d10a42c753453283e5219c22239337",
      "max": 164,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_09f4cb79ff86465aaf48b0de24869af9",
      "value": 164
     }
    },
    "09a29a91f58d4462942505a3cc415801": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_83391f98a240490987c397048fc1a0d4",
       "IPY_MODEL_06b9aa5f49fa44ba8c93b647dc7db224",
       "IPY_MODEL_da9c231ee67047fb89073c95326b72a5"
      ],
      "layout": "IPY_MODEL_48da931ebe7f4fd299f8c98c7d2460ff"
     }
    },
    "09f4cb79ff86465aaf48b0de24869af9": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "1b9cecf5b3584fba8258a81d4279a25b": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "39f5a6ae8ba74c8598f9c6d5b8ad2d65": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "48da931ebe7f4fd299f8c98c7d2460ff": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "7a901f447c1d477bb49f954e0feacedd": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "83391f98a240490987c397048fc1a0d4": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_7a901f447c1d477bb49f954e0feacedd",
      "placeholder": "",
      "style": "IPY_MODEL_39f5a6ae8ba74c8598f9c6d5b8ad2d65",
      "value": "100%"
     }
    },
    "a0d10a42c753453283e5219c22239337": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "da9c231ee67047fb89073c95326b72a5": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_1b9cecf5b3584fba8258a81d4279a25b",
      "placeholder": "",
      "style": "IPY_MODEL_039b53f2702c4179af7e0548018d0588",
      "value": " 164/164 [05:08&lt;00:00,  1.86s/it]"
     }
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}