whisper/notebooks/LibriSpeech.ipynb
2024-05-24 17:32:32 +08:00

636 lines
17 KiB
Plaintext
Generated
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "v5hvo8QWN-a9"
},
"source": [
"# Installing Whisper\n",
"\n",
"The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "ZsJUxc0aRsAf"
},
"source": [
"# Use the %pip magic rather than `! pip` so packages are installed into the\n",
"# environment of the running kernel, not whichever pip the shell finds first.\n",
"%pip install git+https://github.com/openai/whisper.git\n",
"%pip install jiwer"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "1IMEkgyagYto"
},
"source": [
"# Loading the LibriSpeech dataset\n",
"\n",
"The following will load the test-clean split of the LibriSpeech corpus using torchaudio."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "3CqtR2Fi5-vP"
},
"source": [
"import os\n",
"import numpy as np\n",
"\n",
"try:\n",
" import tensorflow # required in Colab to avoid protobuf compatibility issues\n",
"except ImportError:\n",
" pass\n",
"\n",
"import torch\n",
"import pandas as pd\n",
"import whisper\n",
"import torchaudio\n",
"\n",
"from tqdm.notebook import tqdm\n",
"\n",
"\n",
"# Use the GPU when torch reports CUDA as available; otherwise run on the CPU.\n",
"DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\""
],
"outputs": []
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "GuCCB2KYOJCE"
},
"source": [
"class LibriSpeech(torch.utils.data.Dataset):\n",
"    \"\"\"\n",
"    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.\n",
"    It will drop the last few seconds of a very small portion of the utterances.\n",
"\n",
"    Each item is a ``(mel, text)`` pair: the log-Mel spectrogram of the\n",
"    padded/trimmed waveform and the transcript string of that utterance.\n",
"\n",
"    Args:\n",
"        split: name of the LibriSpeech split, forwarded to torchaudio as ``url``.\n",
"        device: device the padded/trimmed waveform is moved to before the\n",
"            spectrogram is computed.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, split=\"test-clean\", device=DEVICE):\n",
"        # download=True lets torchaudio fetch the split into ~/.cache.\n",
"        self.dataset = torchaudio.datasets.LIBRISPEECH(\n",
"            root=os.path.expanduser(\"~/.cache\"),\n",
"            url=split,\n",
"            download=True,\n",
"        )\n",
"        self.device = device\n",
"\n",
"    def __len__(self):\n",
"        \"\"\"Return the number of utterances in the wrapped split.\"\"\"\n",
"        return len(self.dataset)\n",
"\n",
"    def __getitem__(self, item):\n",
"        \"\"\"Return ``(mel, text)`` for the utterance at index ``item``.\"\"\"\n",
"        # Only the waveform, sample rate and transcript are used; any\n",
"        # remaining fields of the torchaudio record are ignored.\n",
"        audio, sample_rate, text, *_ = self.dataset[item]\n",
"        assert sample_rate == 16000, f\"expected 16000 Hz audio, got {sample_rate}\"\n",
"        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)\n",
"        mel = whisper.log_mel_spectrogram(audio)\n",
"\n",
"        return (mel, text)"
],
"outputs": []
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "-YcRU5jqNqo2"
},
"source": [
"# Wrap the test-clean split and serve it in batches of 16 utterances.\n",
"dataset = LibriSpeech(split=\"test-clean\")\n",
"loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=16)"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0ljocCNuUAde"
},
"source": [
"# Running inference on the dataset using a base Whisper model\n",
"\n",
"The following will take a few minutes to transcribe all utterances in the dataset."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "_PokfNJtOYNu",
"outputId": "2c53ec44-bc93-4107-b4fa-214e3f71fe8e"
},
"source": [
"model = whisper.load_model(\"base.en\")\n",
"# Count parameters with Tensor.numel(), which returns a plain Python int per\n",
"# tensor, so the total stays an int and formats cleanly with the `,` specifier.\n",
"print(\n",
"    f\"Model is {'multilingual' if model.is_multilingual else 'English-only'} \"\n",
"    f\"and has {sum(p.numel() for p in model.parameters()):,} parameters.\"\n",
")"
],
"outputs": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"source": [
"# Short-form transcription: decode English with timestamp prediction disabled.\n",
"options = whisper.DecodingOptions(\n",
"    without_timestamps=True,\n",
"    language=\"en\",\n",
")"
],
"outputs": []
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 49,
"referenced_widgets": [
"09a29a91f58d4462942505a3cc415801",
"83391f98a240490987c397048fc1a0d4",
"06b9aa5f49fa44ba8c93b647dc7db224",
"da9c231ee67047fb89073c95326b72a5",
"48da931ebe7f4fd299f8c98c7d2460ff",
"7a901f447c1d477bb49f954e0feacedd",
"39f5a6ae8ba74c8598f9c6d5b8ad2d65",
"a0d10a42c753453283e5219c22239337",
"09f4cb79ff86465aaf48b0de24869af9",
"1b9cecf5b3584fba8258a81d4279a25b",
"039b53f2702c4179af7e0548018d0588"
]
},
"id": "7OWTn_KvNk59",
"outputId": "a813a792-3c91-4144-f11f-054fd6778023"
},
"source": [
"# Decode the dataset batch by batch, collecting the model output and the\n",
"# matching reference transcript for every utterance in the same order.\n",
"hypotheses, references = [], []\n",
"\n",
"for mel_batch, text_batch in tqdm(loader):\n",
"    decoded = model.decode(mel_batch, options)\n",
"    hypotheses += [item.text for item in decoded]\n",
"    references += list(text_batch)"
],
"outputs": []
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"id": "4nTyynELQ42j",
"outputId": "1c72d25a-3e87-4c60-a8d1-1da9d2f73bd7"
},
"source": [
"# One row per utterance: the decoded text next to its reference transcript.\n",
"data = pd.DataFrame({\"hypothesis\": hypotheses, \"reference\": references})\n",
"data"
],
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "HPppEJRXX4ox"
},
"source": [
"# Calculating the word error rate\n",
"\n",
"Now we use Whisper's English text normalizer to standardize both the hypotheses and the references, and then calculate the word error rate (WER)."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "dl-KBDflMhrg"
},
"source": [
"import jiwer\n",
"from whisper.normalizers import EnglishTextNormalizer\n",
"\n",
"# Normalizer applied to both the hypotheses and the references before scoring.\n",
"normalizer = EnglishTextNormalizer()"
],
"outputs": []
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 641
},
"id": "6-O048q4WI4o",
"outputId": "f2089bc9-f535-441e-f192-26e52ae82b5e"
},
"source": [
"# Add normalized copies of both text columns so the scoring compares like with like.\n",
"data[\"hypothesis_clean\"] = list(map(normalizer, data[\"hypothesis\"]))\n",
"data[\"reference_clean\"] = list(map(normalizer, data[\"reference\"]))\n",
"data"
],
"outputs": []
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "EBGSITeBYPTT",
"outputId": "7b3dbe7c-a37e-4a07-a50a-b27d5f88b68f"
},
"source": [
"# jiwer expects the references first and the hypotheses second.\n",
"reference_texts = data[\"reference_clean\"].tolist()\n",
"hypothesis_texts = data[\"hypothesis_clean\"].tolist()\n",
"wer = jiwer.wer(reference_texts, hypothesis_texts)\n",
"\n",
"print(f\"WER: {wer * 100:.2f} %\")"
],
"outputs": []
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"provenance": []
},
"gpuClass": "standard",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"039b53f2702c4179af7e0548018d0588": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"06b9aa5f49fa44ba8c93b647dc7db224": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "FloatProgressModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "success",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_a0d10a42c753453283e5219c22239337",
"max": 164,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_09f4cb79ff86465aaf48b0de24869af9",
"value": 164
}
},
"09a29a91f58d4462942505a3cc415801": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HBoxModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HBoxView",
"box_style": "",
"children": [
"IPY_MODEL_83391f98a240490987c397048fc1a0d4",
"IPY_MODEL_06b9aa5f49fa44ba8c93b647dc7db224",
"IPY_MODEL_da9c231ee67047fb89073c95326b72a5"
],
"layout": "IPY_MODEL_48da931ebe7f4fd299f8c98c7d2460ff"
}
},
"09f4cb79ff86465aaf48b0de24869af9": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "ProgressStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"1b9cecf5b3584fba8258a81d4279a25b": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"39f5a6ae8ba74c8598f9c6d5b8ad2d65": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "DescriptionStyleModel",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"48da931ebe7f4fd299f8c98c7d2460ff": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"7a901f447c1d477bb49f954e0feacedd": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"83391f98a240490987c397048fc1a0d4": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_7a901f447c1d477bb49f954e0feacedd",
"placeholder": "",
"style": "IPY_MODEL_39f5a6ae8ba74c8598f9c6d5b8ad2d65",
"value": "100%"
}
},
"a0d10a42c753453283e5219c22239337": {
"model_module": "@jupyter-widgets/base",
"model_module_version": "1.2.0",
"model_name": "LayoutModel",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"da9c231ee67047fb89073c95326b72a5": {
"model_module": "@jupyter-widgets/controls",
"model_module_version": "1.5.0",
"model_name": "HTMLModel",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_1b9cecf5b3584fba8258a81d4279a25b",
"placeholder": "",
"style": "IPY_MODEL_039b53f2702c4179af7e0548018d0588",
"value": " 164/164 [05:08<00:00, 1.86s/it]"
}
}
}
}
},
"nbformat": 4,
"nbformat_minor": 1
}