mirror of
https://github.com/openai/whisper.git
synced 2025-03-30 14:28:27 +00:00
959 lines
31 KiB
Plaintext
959 lines
31 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "v5hvo8QWN-a9"
|
||
},
|
||
"source": [
|
||
"# Installing Whisper\n",
|
||
"\n",
|
||
"The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {
|
||
"id": "ZsJUxc0aRsAf"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# %pip (unlike a \"!\" shell escape) installs into the environment of the running kernel\n",
"%pip install git+https://github.com/openai/whisper.git\n",
"%pip install jiwer"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "1IMEkgyagYto"
|
||
},
|
||
"source": [
|
||
"# Loading the LibriSpeech dataset\n",
|
||
"\n",
|
||
"The following will load the test-clean split of the LibriSpeech corpus using torchaudio."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {
|
||
"id": "3CqtR2Fi5-vP"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import os\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"try:\n",
|
||
" import tensorflow # required in Colab to avoid protobuf compatibility issues\n",
|
||
"except ImportError:\n",
|
||
" pass\n",
|
||
"\n",
|
||
"import torch\n",
|
||
"import pandas as pd\n",
|
||
"import whisper\n",
|
||
"import torchaudio\n",
|
||
"\n",
|
||
"from tqdm.notebook import tqdm\n",
|
||
"\n",
|
||
"\n",
|
||
"DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {
|
||
"id": "GuCCB2KYOJCE"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"class LibriSpeech(torch.utils.data.Dataset):\n",
"    \"\"\"\n",
"    A simple class to wrap LibriSpeech and trim/pad the audio to 30 seconds.\n",
"    It will drop the last few seconds of a very small portion of the utterances.\n",
"\n",
"    Args:\n",
"        split: name of the LibriSpeech subset, forwarded to torchaudio as ``url``.\n",
"        device: device the audio is moved to before the log-Mel spectrogram is computed.\n",
"\n",
"    Each item is a ``(mel, text)`` pair: the log-Mel spectrogram of one utterance\n",
"    and its reference transcript.\n",
"    \"\"\"\n",
"\n",
"    # The only sample rate this wrapper accepts; audio at any other rate is rejected.\n",
"    SAMPLE_RATE = 16000\n",
"\n",
"    def __init__(self, split=\"test-clean\", device=DEVICE):\n",
"        self.dataset = torchaudio.datasets.LIBRISPEECH(\n",
"            root=os.path.expanduser(\"~/.cache\"),\n",
"            url=split,\n",
"            download=True,\n",
"        )\n",
"        self.device = device\n",
"\n",
"    def __len__(self):\n",
"        return len(self.dataset)\n",
"\n",
"    def __getitem__(self, item):\n",
"        # Only the first three fields are used; any trailing metadata is ignored.\n",
"        audio, sample_rate, text, *_ = self.dataset[item]\n",
"        if sample_rate != self.SAMPLE_RATE:\n",
"            raise ValueError(\n",
"                f\"expected {self.SAMPLE_RATE} Hz audio, got {sample_rate} Hz for item {item}\"\n",
"            )\n",
"        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)\n",
"        mel = whisper.log_mel_spectrogram(audio)\n",
"\n",
"        return (mel, text)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {
|
||
"id": "-YcRU5jqNqo2"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# wrap the test-clean split and serve it in batches of 16 utterances\n",
"dataset = LibriSpeech(split=\"test-clean\")\n",
"loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=16)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "0ljocCNuUAde"
|
||
},
|
||
"source": [
|
||
"# Running inference on the dataset using a base Whisper model\n",
|
||
"\n",
|
||
"The following will take a few minutes to transcribe all utterances in the dataset."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "_PokfNJtOYNu",
|
||
"outputId": "2c53ec44-bc93-4107-b4fa-214e3f71fe8e"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Model is English-only and has 71,825,408 parameters.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"model = whisper.load_model(\"base.en\")\n",
"# numel() returns an exact Python int per tensor, so the total is always an int\n",
"n_params = sum(p.numel() for p in model.parameters())\n",
"print(\n",
"    f\"Model is {'multilingual' if model.is_multilingual else 'English-only'} \"\n",
"    f\"and has {n_params:,} parameters.\"\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# short-form transcription: decode English text with timestamp prediction turned off\n",
"options = whisper.DecodingOptions(\n",
"    language=\"en\",\n",
"    without_timestamps=True,\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 49,
|
||
"referenced_widgets": [
|
||
"09a29a91f58d4462942505a3cc415801",
|
||
"83391f98a240490987c397048fc1a0d4",
|
||
"06b9aa5f49fa44ba8c93b647dc7db224",
|
||
"da9c231ee67047fb89073c95326b72a5",
|
||
"48da931ebe7f4fd299f8c98c7d2460ff",
|
||
"7a901f447c1d477bb49f954e0feacedd",
|
||
"39f5a6ae8ba74c8598f9c6d5b8ad2d65",
|
||
"a0d10a42c753453283e5219c22239337",
|
||
"09f4cb79ff86465aaf48b0de24869af9",
|
||
"1b9cecf5b3584fba8258a81d4279a25b",
|
||
"039b53f2702c4179af7e0548018d0588"
|
||
]
|
||
},
|
||
"id": "7OWTn_KvNk59",
|
||
"outputId": "a813a792-3c91-4144-f11f-054fd6778023"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "9df048b46f764cf68cbe0045b8ff73a8",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/164 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"hypotheses, references = [], []\n",
"\n",
"# one decode call per batch; predictions and transcripts stay aligned by position\n",
"for mels, texts in tqdm(loader):\n",
"    results = model.decode(mels, options)\n",
"    hypotheses += [decoded.text for decoded in results]\n",
"    references += list(texts)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 424
|
||
},
|
||
"id": "4nTyynELQ42j",
|
||
"outputId": "1c72d25a-3e87-4c60-a8d1-1da9d2f73bd7"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>hypothesis</th>\n",
|
||
" <th>reference</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>He hoped there would be stew for dinner, turni...</td>\n",
|
||
" <td>HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Stuffered into you, his belly counseled him.</td>\n",
|
||
" <td>STUFF IT INTO YOU HIS BELLY COUNSELLED HIM</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>After early nightfall the yellow lamps would l...</td>\n",
|
||
" <td>AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Hello Bertie, any good in your mind?</td>\n",
|
||
" <td>HELLO BERTIE ANY GOOD IN YOUR MIND</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Number 10. Fresh Nelly is waiting on you. Good...</td>\n",
|
||
" <td>NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2615</th>\n",
|
||
" <td>Oh, to shoot my soul's full meaning into futur...</td>\n",
|
||
" <td>OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2616</th>\n",
|
||
" <td>Then I, long tried by natural ills, received t...</td>\n",
|
||
" <td>THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2617</th>\n",
|
||
" <td>I love thee freely as men strive for right. I ...</td>\n",
|
||
" <td>I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2618</th>\n",
|
||
" <td>I love thee with the passion put to use, in my...</td>\n",
|
||
" <td>I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2619</th>\n",
|
||
" <td>I love thee with the love I seemed to lose wit...</td>\n",
|
||
" <td>I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>2620 rows × 2 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" hypothesis \\\n",
|
||
"0 He hoped there would be stew for dinner, turni... \n",
|
||
"1 Stuffered into you, his belly counseled him. \n",
|
||
"2 After early nightfall the yellow lamps would l... \n",
|
||
"3 Hello Bertie, any good in your mind? \n",
|
||
"4 Number 10. Fresh Nelly is waiting on you. Good... \n",
|
||
"... ... \n",
|
||
"2615 Oh, to shoot my soul's full meaning into futur... \n",
|
||
"2616 Then I, long tried by natural ills, received t... \n",
|
||
"2617 I love thee freely as men strive for right. I ... \n",
|
||
"2618 I love thee with the passion put to use, in my... \n",
|
||
"2619 I love thee with the love I seemed to lose wit... \n",
|
||
"\n",
|
||
" reference \n",
|
||
"0 HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP... \n",
|
||
"1 STUFF IT INTO YOU HIS BELLY COUNSELLED HIM \n",
|
||
"2 AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L... \n",
|
||
"3 HELLO BERTIE ANY GOOD IN YOUR MIND \n",
|
||
"4 NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ... \n",
|
||
"... ... \n",
|
||
"2615 OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE... \n",
|
||
"2616 THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE... \n",
|
||
"2617 I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L... \n",
|
||
"2618 I LOVE THEE WITH THE PASSION PUT TO USE IN MY ... \n",
|
||
"2619 I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ... \n",
|
||
"\n",
|
||
"[2620 rows x 2 columns]"
|
||
]
|
||
},
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# one row per utterance: model output next to the ground-truth transcript\n",
"data = pd.DataFrame({\"hypothesis\": hypotheses, \"reference\": references})\n",
"data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "HPppEJRXX4ox"
|
||
},
|
||
"source": [
|
||
"# Calculating the word error rate\n",
|
||
"\n",
|
||
"Now we use Whisper's English text normalizer to standardize both the transcriptions and the references, and then calculate the word error rate (WER)."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {
|
||
"id": "dl-KBDflMhrg"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import jiwer\n",
|
||
"from whisper.normalizers import EnglishTextNormalizer\n",
|
||
"\n",
|
||
"normalizer = EnglishTextNormalizer()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 641
|
||
},
|
||
"id": "6-O048q4WI4o",
|
||
"outputId": "f2089bc9-f535-441e-f192-26e52ae82b5e"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>hypothesis</th>\n",
|
||
" <th>reference</th>\n",
|
||
" <th>hypothesis_clean</th>\n",
|
||
" <th>reference_clean</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>He hoped there would be stew for dinner, turni...</td>\n",
|
||
" <td>HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP...</td>\n",
|
||
" <td>he hoped there would be stew for dinner turnip...</td>\n",
|
||
" <td>he hoped there would be stew for dinner turnip...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Stuffered into you, his belly counseled him.</td>\n",
|
||
" <td>STUFF IT INTO YOU HIS BELLY COUNSELLED HIM</td>\n",
|
||
" <td>stuffered into you his belly counseled him</td>\n",
|
||
" <td>stuff it into you his belly counseled him</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>After early nightfall the yellow lamps would l...</td>\n",
|
||
" <td>AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L...</td>\n",
|
||
" <td>after early nightfall the yellow lamps would l...</td>\n",
|
||
" <td>after early nightfall the yellow lamps would l...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Hello Bertie, any good in your mind?</td>\n",
|
||
" <td>HELLO BERTIE ANY GOOD IN YOUR MIND</td>\n",
|
||
" <td>hello bertie any good in your mind</td>\n",
|
||
" <td>hello bertie any good in your mind</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Number 10. Fresh Nelly is waiting on you. Good...</td>\n",
|
||
" <td>NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ...</td>\n",
|
||
" <td>number 10 fresh nelly is waiting on you good n...</td>\n",
|
||
" <td>number 10 fresh nelly is waiting on you good n...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2615</th>\n",
|
||
" <td>Oh, to shoot my soul's full meaning into futur...</td>\n",
|
||
" <td>OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE...</td>\n",
|
||
" <td>0 to shoot my soul is full meaning into future...</td>\n",
|
||
" <td>0 to shoot my soul is full meaning into future...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2616</th>\n",
|
||
" <td>Then I, long tried by natural ills, received t...</td>\n",
|
||
" <td>THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE...</td>\n",
|
||
" <td>then i long tried by natural ills received the...</td>\n",
|
||
" <td>then i long tried by natural ills received the...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2617</th>\n",
|
||
" <td>I love thee freely as men strive for right. I ...</td>\n",
|
||
" <td>I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L...</td>\n",
|
||
" <td>i love thee freely as men strive for right i l...</td>\n",
|
||
" <td>i love thee freely as men strive for right i l...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2618</th>\n",
|
||
" <td>I love thee with the passion put to use, in my...</td>\n",
|
||
" <td>I LOVE THEE WITH THE PASSION PUT TO USE IN MY ...</td>\n",
|
||
" <td>i love thee with the passion put to use in my ...</td>\n",
|
||
" <td>i love thee with the passion put to use in my ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2619</th>\n",
|
||
" <td>I love thee with the love I seemed to lose wit...</td>\n",
|
||
" <td>I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ...</td>\n",
|
||
" <td>i love thee with the love i seemed to lose wit...</td>\n",
|
||
" <td>i love thee with a love i seemed to lose with ...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>2620 rows × 4 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" hypothesis \\\n",
|
||
"0 He hoped there would be stew for dinner, turni... \n",
|
||
"1 Stuffered into you, his belly counseled him. \n",
|
||
"2 After early nightfall the yellow lamps would l... \n",
|
||
"3 Hello Bertie, any good in your mind? \n",
|
||
"4 Number 10. Fresh Nelly is waiting on you. Good... \n",
|
||
"... ... \n",
|
||
"2615 Oh, to shoot my soul's full meaning into futur... \n",
|
||
"2616 Then I, long tried by natural ills, received t... \n",
|
||
"2617 I love thee freely as men strive for right. I ... \n",
|
||
"2618 I love thee with the passion put to use, in my... \n",
|
||
"2619 I love thee with the love I seemed to lose wit... \n",
|
||
"\n",
|
||
" reference \\\n",
|
||
"0 HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP... \n",
|
||
"1 STUFF IT INTO YOU HIS BELLY COUNSELLED HIM \n",
|
||
"2 AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L... \n",
|
||
"3 HELLO BERTIE ANY GOOD IN YOUR MIND \n",
|
||
"4 NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ... \n",
|
||
"... ... \n",
|
||
"2615 OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE... \n",
|
||
"2616 THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE... \n",
|
||
"2617 I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L... \n",
|
||
"2618 I LOVE THEE WITH THE PASSION PUT TO USE IN MY ... \n",
|
||
"2619 I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ... \n",
|
||
"\n",
|
||
" hypothesis_clean \\\n",
|
||
"0 he hoped there would be stew for dinner turnip... \n",
|
||
"1 stuffered into you his belly counseled him \n",
|
||
"2 after early nightfall the yellow lamps would l... \n",
|
||
"3 hello bertie any good in your mind \n",
|
||
"4 number 10 fresh nelly is waiting on you good n... \n",
|
||
"... ... \n",
|
||
"2615 0 to shoot my soul is full meaning into future... \n",
|
||
"2616 then i long tried by natural ills received the... \n",
|
||
"2617 i love thee freely as men strive for right i l... \n",
|
||
"2618 i love thee with the passion put to use in my ... \n",
|
||
"2619 i love thee with the love i seemed to lose wit... \n",
|
||
"\n",
|
||
" reference_clean \n",
|
||
"0 he hoped there would be stew for dinner turnip... \n",
|
||
"1 stuff it into you his belly counseled him \n",
|
||
"2 after early nightfall the yellow lamps would l... \n",
|
||
"3 hello bertie any good in your mind \n",
|
||
"4 number 10 fresh nelly is waiting on you good n... \n",
|
||
"... ... \n",
|
||
"2615 0 to shoot my soul is full meaning into future... \n",
|
||
"2616 then i long tried by natural ills received the... \n",
|
||
"2617 i love thee freely as men strive for right i l... \n",
|
||
"2618 i love thee with the passion put to use in my ... \n",
|
||
"2619 i love thee with a love i seemed to lose with ... \n",
|
||
"\n",
|
||
"[2620 rows x 4 columns]"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# apply the same normalizer to both columns so they are compared on equal footing\n",
"data[\"hypothesis_clean\"] = list(map(normalizer, data[\"hypothesis\"]))\n",
"data[\"reference_clean\"] = list(map(normalizer, data[\"reference\"]))\n",
"data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "EBGSITeBYPTT",
|
||
"outputId": "7b3dbe7c-a37e-4a07-a50a-b27d5f88b68f"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"WER: 4.26 %\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# jiwer takes the references first, then the hypotheses\n",
"reference_texts = data[\"reference_clean\"].tolist()\n",
"hypothesis_texts = data[\"hypothesis_clean\"].tolist()\n",
"wer = jiwer.wer(reference_texts, hypothesis_texts)\n",
"\n",
"print(f\"WER: {wer * 100:.2f} %\")"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"accelerator": "GPU",
|
||
"colab": {
|
||
"collapsed_sections": [],
|
||
"provenance": []
|
||
},
|
||
"gpuClass": "standard",
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.9.9"
|
||
},
|
||
"widgets": {
|
||
"application/vnd.jupyter.widget-state+json": {
|
||
"039b53f2702c4179af7e0548018d0588": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_module_version": "1.5.0",
|
||
"model_name": "DescriptionStyleModel",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "DescriptionStyleModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "StyleView",
|
||
"description_width": ""
|
||
}
|
||
},
|
||
"06b9aa5f49fa44ba8c93b647dc7db224": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_module_version": "1.5.0",
|
||
"model_name": "FloatProgressModel",
|
||
"state": {
|
||
"_dom_classes": [],
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "FloatProgressModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/controls",
|
||
"_view_module_version": "1.5.0",
|
||
"_view_name": "ProgressView",
|
||
"bar_style": "success",
|
||
"description": "",
|
||
"description_tooltip": null,
|
||
"layout": "IPY_MODEL_a0d10a42c753453283e5219c22239337",
|
||
"max": 164,
|
||
"min": 0,
|
||
"orientation": "horizontal",
|
||
"style": "IPY_MODEL_09f4cb79ff86465aaf48b0de24869af9",
|
||
"value": 164
|
||
}
|
||
},
|
||
"09a29a91f58d4462942505a3cc415801": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_module_version": "1.5.0",
|
||
"model_name": "HBoxModel",
|
||
"state": {
|
||
"_dom_classes": [],
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "HBoxModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/controls",
|
||
"_view_module_version": "1.5.0",
|
||
"_view_name": "HBoxView",
|
||
"box_style": "",
|
||
"children": [
|
||
"IPY_MODEL_83391f98a240490987c397048fc1a0d4",
|
||
"IPY_MODEL_06b9aa5f49fa44ba8c93b647dc7db224",
|
||
"IPY_MODEL_da9c231ee67047fb89073c95326b72a5"
|
||
],
|
||
"layout": "IPY_MODEL_48da931ebe7f4fd299f8c98c7d2460ff"
|
||
}
|
||
},
|
||
"09f4cb79ff86465aaf48b0de24869af9": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_module_version": "1.5.0",
|
||
"model_name": "ProgressStyleModel",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "ProgressStyleModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "StyleView",
|
||
"bar_color": null,
|
||
"description_width": ""
|
||
}
|
||
},
|
||
"1b9cecf5b3584fba8258a81d4279a25b": {
|
||
"model_module": "@jupyter-widgets/base",
|
||
"model_module_version": "1.2.0",
|
||
"model_name": "LayoutModel",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/base",
|
||
"_model_module_version": "1.2.0",
|
||
"_model_name": "LayoutModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "LayoutView",
|
||
"align_content": null,
|
||
"align_items": null,
|
||
"align_self": null,
|
||
"border": null,
|
||
"bottom": null,
|
||
"display": null,
|
||
"flex": null,
|
||
"flex_flow": null,
|
||
"grid_area": null,
|
||
"grid_auto_columns": null,
|
||
"grid_auto_flow": null,
|
||
"grid_auto_rows": null,
|
||
"grid_column": null,
|
||
"grid_gap": null,
|
||
"grid_row": null,
|
||
"grid_template_areas": null,
|
||
"grid_template_columns": null,
|
||
"grid_template_rows": null,
|
||
"height": null,
|
||
"justify_content": null,
|
||
"justify_items": null,
|
||
"left": null,
|
||
"margin": null,
|
||
"max_height": null,
|
||
"max_width": null,
|
||
"min_height": null,
|
||
"min_width": null,
|
||
"object_fit": null,
|
||
"object_position": null,
|
||
"order": null,
|
||
"overflow": null,
|
||
"overflow_x": null,
|
||
"overflow_y": null,
|
||
"padding": null,
|
||
"right": null,
|
||
"top": null,
|
||
"visibility": null,
|
||
"width": null
|
||
}
|
||
},
|
||
"39f5a6ae8ba74c8598f9c6d5b8ad2d65": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_module_version": "1.5.0",
|
||
"model_name": "DescriptionStyleModel",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "DescriptionStyleModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "StyleView",
|
||
"description_width": ""
|
||
}
|
||
},
|
||
"48da931ebe7f4fd299f8c98c7d2460ff": {
|
||
"model_module": "@jupyter-widgets/base",
|
||
"model_module_version": "1.2.0",
|
||
"model_name": "LayoutModel",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/base",
|
||
"_model_module_version": "1.2.0",
|
||
"_model_name": "LayoutModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "LayoutView",
|
||
"align_content": null,
|
||
"align_items": null,
|
||
"align_self": null,
|
||
"border": null,
|
||
"bottom": null,
|
||
"display": null,
|
||
"flex": null,
|
||
"flex_flow": null,
|
||
"grid_area": null,
|
||
"grid_auto_columns": null,
|
||
"grid_auto_flow": null,
|
||
"grid_auto_rows": null,
|
||
"grid_column": null,
|
||
"grid_gap": null,
|
||
"grid_row": null,
|
||
"grid_template_areas": null,
|
||
"grid_template_columns": null,
|
||
"grid_template_rows": null,
|
||
"height": null,
|
||
"justify_content": null,
|
||
"justify_items": null,
|
||
"left": null,
|
||
"margin": null,
|
||
"max_height": null,
|
||
"max_width": null,
|
||
"min_height": null,
|
||
"min_width": null,
|
||
"object_fit": null,
|
||
"object_position": null,
|
||
"order": null,
|
||
"overflow": null,
|
||
"overflow_x": null,
|
||
"overflow_y": null,
|
||
"padding": null,
|
||
"right": null,
|
||
"top": null,
|
||
"visibility": null,
|
||
"width": null
|
||
}
|
||
},
|
||
"7a901f447c1d477bb49f954e0feacedd": {
|
||
"model_module": "@jupyter-widgets/base",
|
||
"model_module_version": "1.2.0",
|
||
"model_name": "LayoutModel",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/base",
|
||
"_model_module_version": "1.2.0",
|
||
"_model_name": "LayoutModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "LayoutView",
|
||
"align_content": null,
|
||
"align_items": null,
|
||
"align_self": null,
|
||
"border": null,
|
||
"bottom": null,
|
||
"display": null,
|
||
"flex": null,
|
||
"flex_flow": null,
|
||
"grid_area": null,
|
||
"grid_auto_columns": null,
|
||
"grid_auto_flow": null,
|
||
"grid_auto_rows": null,
|
||
"grid_column": null,
|
||
"grid_gap": null,
|
||
"grid_row": null,
|
||
"grid_template_areas": null,
|
||
"grid_template_columns": null,
|
||
"grid_template_rows": null,
|
||
"height": null,
|
||
"justify_content": null,
|
||
"justify_items": null,
|
||
"left": null,
|
||
"margin": null,
|
||
"max_height": null,
|
||
"max_width": null,
|
||
"min_height": null,
|
||
"min_width": null,
|
||
"object_fit": null,
|
||
"object_position": null,
|
||
"order": null,
|
||
"overflow": null,
|
||
"overflow_x": null,
|
||
"overflow_y": null,
|
||
"padding": null,
|
||
"right": null,
|
||
"top": null,
|
||
"visibility": null,
|
||
"width": null
|
||
}
|
||
},
|
||
"83391f98a240490987c397048fc1a0d4": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_module_version": "1.5.0",
|
||
"model_name": "HTMLModel",
|
||
"state": {
|
||
"_dom_classes": [],
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "HTMLModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/controls",
|
||
"_view_module_version": "1.5.0",
|
||
"_view_name": "HTMLView",
|
||
"description": "",
|
||
"description_tooltip": null,
|
||
"layout": "IPY_MODEL_7a901f447c1d477bb49f954e0feacedd",
|
||
"placeholder": "",
|
||
"style": "IPY_MODEL_39f5a6ae8ba74c8598f9c6d5b8ad2d65",
|
||
"value": "100%"
|
||
}
|
||
},
|
||
"a0d10a42c753453283e5219c22239337": {
|
||
"model_module": "@jupyter-widgets/base",
|
||
"model_module_version": "1.2.0",
|
||
"model_name": "LayoutModel",
|
||
"state": {
|
||
"_model_module": "@jupyter-widgets/base",
|
||
"_model_module_version": "1.2.0",
|
||
"_model_name": "LayoutModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/base",
|
||
"_view_module_version": "1.2.0",
|
||
"_view_name": "LayoutView",
|
||
"align_content": null,
|
||
"align_items": null,
|
||
"align_self": null,
|
||
"border": null,
|
||
"bottom": null,
|
||
"display": null,
|
||
"flex": null,
|
||
"flex_flow": null,
|
||
"grid_area": null,
|
||
"grid_auto_columns": null,
|
||
"grid_auto_flow": null,
|
||
"grid_auto_rows": null,
|
||
"grid_column": null,
|
||
"grid_gap": null,
|
||
"grid_row": null,
|
||
"grid_template_areas": null,
|
||
"grid_template_columns": null,
|
||
"grid_template_rows": null,
|
||
"height": null,
|
||
"justify_content": null,
|
||
"justify_items": null,
|
||
"left": null,
|
||
"margin": null,
|
||
"max_height": null,
|
||
"max_width": null,
|
||
"min_height": null,
|
||
"min_width": null,
|
||
"object_fit": null,
|
||
"object_position": null,
|
||
"order": null,
|
||
"overflow": null,
|
||
"overflow_x": null,
|
||
"overflow_y": null,
|
||
"padding": null,
|
||
"right": null,
|
||
"top": null,
|
||
"visibility": null,
|
||
"width": null
|
||
}
|
||
},
|
||
"da9c231ee67047fb89073c95326b72a5": {
|
||
"model_module": "@jupyter-widgets/controls",
|
||
"model_module_version": "1.5.0",
|
||
"model_name": "HTMLModel",
|
||
"state": {
|
||
"_dom_classes": [],
|
||
"_model_module": "@jupyter-widgets/controls",
|
||
"_model_module_version": "1.5.0",
|
||
"_model_name": "HTMLModel",
|
||
"_view_count": null,
|
||
"_view_module": "@jupyter-widgets/controls",
|
||
"_view_module_version": "1.5.0",
|
||
"_view_name": "HTMLView",
|
||
"description": "",
|
||
"description_tooltip": null,
|
||
"layout": "IPY_MODEL_1b9cecf5b3584fba8258a81d4279a25b",
|
||
"placeholder": "",
|
||
"style": "IPY_MODEL_039b53f2702c4179af7e0548018d0588",
|
||
"value": " 164/164 [05:08<00:00, 1.86s/it]"
|
||
}
|
||
}
|
||
}
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 1
|
||
}
|