mirror of https://github.com/openai/whisper.git
synced 2025-11-24 06:26:03 +00:00

add comments

This commit is contained in:
parent ba3f3cd54b
commit ed4b0d14a2

Changed files: notebooks/LibriSpeech.ipynb (generated), 367 lines
notebooks/LibriSpeech.ipynb

@@ -17,11 +17,11 @@
    "metadata": {
     "id": "ZsJUxc0aRsAf"
    },
-   "outputs": [],
    "source": [
     "! pip install git+https://github.com/openai/whisper.git\n",
     "! pip install jiwer"
-   ]
+   ],
+   "outputs": []
   },
   {
    "cell_type": "markdown",
@@ -40,7 +40,6 @@
    "metadata": {
     "id": "3CqtR2Fi5-vP"
    },
-   "outputs": [],
    "source": [
     "import os\n",
     "import numpy as np\n",
@@ -59,7 +58,8 @@
     "\n",
     "\n",
     "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\""
-   ]
+   ],
+   "outputs": []
   },
   {
    "cell_type": "code",
@@ -67,7 +67,6 @@
    "metadata": {
     "id": "GuCCB2KYOJCE"
    },
-   "outputs": [],
    "source": [
     "class LibriSpeech(torch.utils.data.Dataset):\n",
     "    \"\"\"\n",
@@ -92,7 +91,8 @@
     "        mel = whisper.log_mel_spectrogram(audio)\n",
     "        \n",
     "        return (mel, text)"
-   ]
+   ],
+   "outputs": []
   },
   {
    "cell_type": "code",
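The hunks above only show the edges of the notebook's LibriSpeech dataset class. For orientation, here is a minimal sketch of a dataset of that shape, assuming torchaudio's LIBRISPEECH loader and Whisper's pad_or_trim helper; the constructor details and cache path are assumptions, while the class name, the log_mel_spectrogram call, and the (mel, text) return value come from the diff itself.

import os

import torch
import torchaudio
import whisper

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


class LibriSpeech(torch.utils.data.Dataset):
    """Wrap torchaudio's LibriSpeech split and pad/trim each utterance to 30 seconds."""

    def __init__(self, split: str = "test-clean", device: str = DEVICE):
        # downloads the split into ~/.cache on first use (the cache path is an assumption)
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            root=os.path.expanduser("~/.cache"), url=split, download=True
        )
        self.device = device

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        audio, sample_rate, text, _, _, _ = self.dataset[item]
        assert sample_rate == 16000
        # pad or trim to exactly 30 s (480,000 samples), then compute the 80-bin log-mel spectrogram
        audio = whisper.pad_or_trim(audio.flatten()).to(self.device)
        mel = whisper.log_mel_spectrogram(audio)
        return mel, text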
@@ -100,11 +100,11 @@
    "metadata": {
     "id": "-YcRU5jqNqo2"
    },
-   "outputs": [],
    "source": [
     "dataset = LibriSpeech(\"test-clean\")\n",
     "loader = torch.utils.data.DataLoader(dataset, batch_size=16)"
-   ]
+   ],
+   "outputs": []
   },
   {
    "cell_type": "markdown",
@@ -127,32 +127,24 @@
     "id": "_PokfNJtOYNu",
     "outputId": "2c53ec44-bc93-4107-b4fa-214e3f71fe8e"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Model is English-only and has 71,825,408 parameters.\n"
-     ]
-    }
-   ],
    "source": [
     "model = whisper.load_model(\"base.en\")\n",
     "print(\n",
     "    f\"Model is {'multilingual' if model.is_multilingual else 'English-only'} \"\n",
     "    f\"and has {sum(np.prod(p.shape) for p in model.parameters()):,} parameters.\"\n",
     ")"
-   ]
+   ],
+   "outputs": []
   },
   {
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
-   "outputs": [],
    "source": [
     "# predict without timestamps for short-form transcription\n",
     "options = whisper.DecodingOptions(language=\"en\", without_timestamps=True)"
-   ]
+   ],
+   "outputs": []
   },
   {
    "cell_type": "code",
@@ -178,22 +170,6 @@
     "id": "7OWTn_KvNk59",
     "outputId": "a813a792-3c91-4144-f11f-054fd6778023"
    },
-   "outputs": [
-    {
-     "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9df048b46f764cf68cbe0045b8ff73a8",
-       "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": [
-       " 0%| | 0/164 [00:00<?, ?it/s]"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
    "source": [
     "hypotheses = []\n",
     "references = []\n",
@@ -202,7 +178,8 @@
     "    results = model.decode(mels, options)\n",
     "    hypotheses.extend([result.text for result in results])\n",
     "    references.extend(texts)"
-   ]
+   ],
+   "outputs": []
   },
   {
    "cell_type": "code",
@@ -215,132 +192,11 @@
     "id": "4nTyynELQ42j",
     "outputId": "1c72d25a-3e87-4c60-a8d1-1da9d2f73bd7"
    },
-   "outputs": [
-    ... (deleted: the cell's stored output, an HTML table plus plain-text rendering of the 2620-row × 2-column hypothesis/reference DataFrame, recorded at "execution_count": 8) ...
-   ],
    "source": [
     "data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))\n",
     "data"
-   ]
+   ],
+   "outputs": []
   },
   {
    "cell_type": "markdown",
@@ -359,13 +215,13 @@
    "metadata": {
     "id": "dl-KBDflMhrg"
    },
-   "outputs": [],
    "source": [
     "import jiwer\n",
     "from whisper.normalizers import EnglishTextNormalizer\n",
     "\n",
     "normalizer = EnglishTextNormalizer()"
-   ]
+   ],
+   "outputs": []
   },
   {
    "cell_type": "code",
@@ -378,183 +234,12 @@
     "id": "6-O048q4WI4o",
     "outputId": "f2089bc9-f535-441e-f192-26e52ae82b5e"
    },
-   "outputs": [
-    ... (deleted: the cell's stored output, an HTML table plus plain-text rendering of the 2620-row × 4-column DataFrame with hypothesis, reference, hypothesis_clean, and reference_clean columns, recorded at "execution_count": 10) ...
-   ],
    "source": [
     "data[\"hypothesis_clean\"] = [normalizer(text) for text in data[\"hypothesis\"]]\n",
     "data[\"reference_clean\"] = [normalizer(text) for text in data[\"reference\"]]\n",
     "data"
-   ]
+   ],
+   "outputs": []
   },
   {
    "cell_type": "code",
@@ -566,20 +251,12 @@
     "id": "EBGSITeBYPTT",
     "outputId": "7b3dbe7c-a37e-4a07-a50a-b27d5f88b68f"
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "WER: 4.26 %\n"
-     ]
-    }
-   ],
    "source": [
     "wer = jiwer.wer(list(data[\"reference_clean\"]), list(data[\"hypothesis_clean\"]))\n",
     "\n",
     "print(f\"WER: {wer * 100:.2f} %\")"
-   ]
+   ],
+   "outputs": []
   }
  ],
  "metadata": {
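Read in order, the notebook cells touched by this diff amount to the following evaluation loop. This is a condensed, script-style sketch assembled from the source lines shown above; it assumes the LibriSpeech dataset class sketched earlier is already defined.

import jiwer
import pandas as pd
import torch
import whisper
from whisper.normalizers import EnglishTextNormalizer

model = whisper.load_model("base.en")
options = whisper.DecodingOptions(language="en", without_timestamps=True)
loader = torch.utils.data.DataLoader(LibriSpeech("test-clean"), batch_size=16)

hypotheses, references = [], []
for mels, texts in loader:
    results = model.decode(mels, options)  # batched decoding of 30-second mel chunks
    hypotheses.extend([result.text for result in results])
    references.extend(texts)

data = pd.DataFrame(dict(hypothesis=hypotheses, reference=references))

# normalize both sides before scoring, so casing and punctuation differences do not count as errors
normalizer = EnglishTextNormalizer()
data["hypothesis_clean"] = [normalizer(text) for text in data["hypothesis"]]
data["reference_clean"] = [normalizer(text) for text in data["reference"]]

wer = jiwer.wer(list(data["reference_clean"]), list(data["hypothesis_clean"]))
print(f"WER: {wer * 100:.2f} %")  # the stored notebook run reported 4.26 %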
whisper/__init__.py

@@ -14,6 +14,7 @@ from .model import ModelDimensions, Whisper
 from .transcribe import transcribe
 from .version import __version__
 
+# what are these models? a: they are the pre-trained models that are available for use
 _MODELS = {
     "tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
     "tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
@@ -46,10 +47,12 @@ _ALIGNMENT_HEADS = {
     "large": b"ABzY8gWO1E0{>%R7(9S+Kn!D~%ngiGaR?*L!iJG9p-nab0JQ=-{D1-g00",
 }
 
 
+# q: what does this function do? a: it downloads the model from the given url and saves it to the given root directory
 def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
     os.makedirs(root, exist_ok=True)
 
+    # what is sha256?
+    # a: it is a cryptographic hash function that produces a fixed-size hash value
     expected_sha256 = url.split("/")[-2]
     download_target = os.path.join(root, os.path.basename(url))
 
@@ -59,6 +62,7 @@ def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
     if os.path.isfile(download_target):
         with open(download_target, "rb") as f:
             model_bytes = f.read()
+        # what is the purpose of this if statement? a: to check whether the SHA256 checksum matches
         if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
             return model_bytes if in_memory else download_target
         else:
@@ -66,6 +70,9 @@ def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
                 f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file"
             )
 
+    # is the following line re-downloading the model? a: yes
+    # so this function checks whether the model is already downloaded and, if not, downloads it?
+    # a: yes
     with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
         with tqdm(
             total=int(source.info().get("Content-Length")),
@@ -88,14 +95,17 @@ def _download(url: str, root: str, in_memory: bool) -> Union[bytes, str]:
             "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
         )
 
+    # is this the checkpoint file? a: yes
     return model_bytes if in_memory else download_target
 
 
+# q: what is the purpose of this function? a: to return the names of the available models
 def available_models() -> List[str]:
     """Returns the names of available models"""
     return list(_MODELS.keys())
 
 
+# q: what is the purpose of this function? a: to load the model with the given name
+# what does -> Whisper in Python mean? a: it means that the function returns an object of type Whisper
+# load a model and return a Whisper object
 def load_model(
     name: str,
     device: Optional[Union[str, torch.device]] = None,
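The Q&A comments in _download boil down to one pattern: the expected SHA256 digest is embedded in the model URL, and a cached file is reused only when its digest matches. Below is a stripped-down sketch of that pattern using only the standard library; the helper name and signature are illustrative, not part of whisper.

import hashlib
import os
import urllib.request


def download_with_checksum(url: str, target: str, expected_sha256: str) -> str:
    """Download url to target, reusing the cached file only if its SHA256 digest matches."""
    if os.path.isfile(target):
        with open(target, "rb") as f:
            cached = f.read()
        if hashlib.sha256(cached).hexdigest() == expected_sha256:
            return target  # cache hit: checksum matches, no re-download needed
        # checksum mismatch: fall through and re-download

    with urllib.request.urlopen(url) as source, open(target, "wb") as output:
        output.write(source.read())

    with open(target, "rb") as f:
        if hashlib.sha256(f.read()).hexdigest() != expected_sha256:
            raise RuntimeError("downloaded file failed SHA256 verification; please retry")
    return target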
@@ -140,14 +150,24 @@ def load_model(
             f"Model {name} not found; available models = {available_models()}"
         )
 
+    # what is "with" in Python?
+    # a: it is used to open a file and automatically close it after the block of code is executed
     with (
+        # what if checkpoint_file is in memory? a: it uses io.BytesIO to read the file
+        # what if checkpoint_file is not in memory? a: it uses open to read the file
         io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")
     ) as fp:
         checkpoint = torch.load(fp, map_location=device)
     del checkpoint_file
 
+    # what is the **checkpoint["dims"]? a: it unpacks the dictionary into keyword arguments
+    # are arguments in ModelDimensions nullable? a: no
+    # so what if checkpoint["dims"] is missing? a: it will raise an error
+    # how to confirm that checkpoint contains the "dims" key? a: by checking the keys of the dictionary
     dims = ModelDimensions(**checkpoint["dims"])
     model = Whisper(dims)
 
+    # what is load_state_dict? a: it loads the model weights
     model.load_state_dict(checkpoint["model_state_dict"])
 
     if alignment_heads is not None:
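To make the **checkpoint["dims"] question concrete, here is a tiny self-contained sketch; the dataclass is a trimmed stand-in for ModelDimensions with illustrative field values, not the real class.

from dataclasses import dataclass


@dataclass
class ModelDimensions:
    # trimmed stand-in, just to show the unpacking
    n_mels: int
    n_vocab: int
    n_text_ctx: int


checkpoint = {
    "dims": {"n_mels": 80, "n_vocab": 51864, "n_text_ctx": 448},  # illustrative values
    "model_state_dict": {},  # in the real checkpoint this holds the model weights
}

# **checkpoint["dims"] unpacks the dict into keyword arguments of the dataclass;
# a missing or misspelled field raises a TypeError, and a missing "dims" key raises a KeyError
assert "dims" in checkpoint
dims = ModelDimensions(**checkpoint["dims"])
print(dims)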
whisper/audio.py

@@ -1,3 +1,7 @@
+"""
+q: what is the usage of this file? a: this file contains the audio processing functions
+"""
+
 import os
 from functools import lru_cache
 from subprocess import CalledProcessError, run
@@ -11,10 +15,13 @@ from .utils import exact_div
 
 # hard-coded audio hyperparameters
 SAMPLE_RATE = 16000
-N_FFT = 400
+N_FFT = 400  # 25ms window
 HOP_LENGTH = 160
 CHUNK_LENGTH = 30
 N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480000 samples in a 30-second chunk
 
+# what is a frame? a: a frame is a short segment of audio, usually 10ms
+# what is a frame used for? a: it is used to compute the spectrogram
 N_FRAMES = exact_div(N_SAMPLES, HOP_LENGTH)  # 3000 frames in a mel spectrogram input
 
 N_SAMPLES_PER_TOKEN = HOP_LENGTH * 2  # the initial convolutions has stride 2
@@ -135,23 +142,23 @@ def log_mel_spectrogram(
     torch.Tensor, shape = (80, n_frames)
         A Tensor that contains the Mel spectrogram
     """
-    if not torch.is_tensor(audio):
+    if not torch.is_tensor(audio):  # load audio if not already a tensor
         if isinstance(audio, str):
-            audio = load_audio(audio)
-        audio = torch.from_numpy(audio)
+            audio = load_audio(audio)  # load audio from file
+        audio = torch.from_numpy(audio)  # convert to tensor
 
     if device is not None:
         audio = audio.to(device)
     if padding > 0:
-        audio = F.pad(audio, (0, padding))
-    window = torch.hann_window(N_FFT).to(audio.device)
-    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
-    magnitudes = stft[..., :-1].abs() ** 2
+        audio = F.pad(audio, (0, padding))  # pad audio to the right
+    window = torch.hann_window(N_FFT).to(audio.device)  # create a Hann window
+    stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)  # compute STFT
+    magnitudes = stft[..., :-1].abs() ** 2  # compute magnitudes
 
-    filters = mel_filters(audio.device, n_mels)
-    mel_spec = filters @ magnitudes
+    filters = mel_filters(audio.device, n_mels)  # load mel filters
+    mel_spec = filters @ magnitudes  # apply mel filters
 
-    log_spec = torch.clamp(mel_spec, min=1e-10).log10()
+    log_spec = torch.clamp(mel_spec, min=1e-10).log10()  # compute log spectrogram
     log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
     log_spec = (log_spec + 4.0) / 4.0
     return log_spec
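The annotated lines in log_mel_spectrogram describe a standard pipeline: Hann window, STFT, power spectrum, mel filterbank, log compression, clipping, rescaling. Here is a self-contained sketch of the same steps on fake audio; torchaudio's melscale_fbanks stands in for whisper's bundled mel_filters, so the numbers will be close but not bit-identical.

import torch
import torchaudio

SAMPLE_RATE = 16000
N_FFT = 400        # 25 ms window at 16 kHz
HOP_LENGTH = 160   # 10 ms hop, i.e. 100 frames per second
N_MELS = 80

audio = torch.randn(SAMPLE_RATE * 3)  # 3 seconds of fake audio, stand-in for a loaded file

window = torch.hann_window(N_FFT)
stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
magnitudes = stft[..., :-1].abs() ** 2  # drop the last frame, keep the power spectrum

# stand-in mel filterbank of shape (n_mels, n_freqs)
filters = torchaudio.functional.melscale_fbanks(
    n_freqs=N_FFT // 2 + 1, f_min=0.0, f_max=SAMPLE_RATE / 2,
    n_mels=N_MELS, sample_rate=SAMPLE_RATE, norm="slaney", mel_scale="slaney",
).T
mel_spec = filters @ magnitudes  # (n_mels, n_frames)

log_spec = torch.clamp(mel_spec, min=1e-10).log10()       # log compression
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)  # clip to an 8-decade dynamic range
log_spec = (log_spec + 4.0) / 4.0                         # rescale roughly into [-1, 1]
print(log_spec.shape)                                     # torch.Size([80, 300])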
whisper/decoding.py

@@ -1,3 +1,7 @@
+"""
+q: what is the usage of this file? a: this file implements the decoding logic (language detection and token decoding)
+"""
+
 from dataclasses import dataclass, field, replace
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Sequence, Tuple, Union
 
@@ -152,6 +156,7 @@ class PyTorchInference(Inference):
         value_modules = [block.attn.value for block in self.model.decoder.blocks]
         self.kv_modules = key_modules + value_modules
 
+    # forward pass through the decoder, with key-value caching
     def logits(self, tokens: Tensor, audio_features: Tensor) -> Tensor:
         if not self.kv_cache:
             self.kv_cache, self.hooks = self.model.install_kv_cache_hooks()
whisper/model.py

@@ -8,11 +8,14 @@ import torch
 import torch.nn.functional as F
 from torch import Tensor, nn
 
+# q: why does `.decoding` have a dot before it? a: it is a relative import
+# q: what is a relative import? a: it is a way to import modules from the same package
 from .decoding import decode as decode_function
 from .decoding import detect_language as detect_language_function
 from .transcribe import transcribe as transcribe_function
 
 
+# Q: what is ModelDimensions? a: it is a data class that stores the dimensions of the model
 @dataclass
 class ModelDimensions:
     n_mels: int
@@ -27,13 +30,18 @@ class ModelDimensions:
     n_text_layer: int
 
 
+# q: what is layer norm? a: https://arxiv.org/abs/1607.06450
+# q: explain it in short words? a: it normalizes the input tensor across the last dimension
+# you are so cool! thanks! I know! 😎
 class LayerNorm(nn.LayerNorm):
     def forward(self, x: Tensor) -> Tensor:
         return super().forward(x.float()).type(x.dtype)
 
 
+# q: what is the usage of this class? a: it is a linear layer that converts the input tensor to the output tensor
 class Linear(nn.Linear):
     def forward(self, x: Tensor) -> Tensor:
+        # q: what is F.linear? a: it is a function that applies a linear transformation to the input tensor
+        # q: what is F here? a: it is the torch.nn.functional module
         return F.linear(
             x,
             self.weight.to(x.dtype),
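A quick check to back up the layer-norm answer above: nn.LayerNorm normalizes over the last dimension, and whisper's subclass only adds an upcast to float32 (and a cast back) so it stays numerically stable in fp16. A minimal sketch:

import torch
from torch import nn

x = torch.randn(2, 5, 8)  # (batch, tokens, features)
ln = nn.LayerNorm(8)

# manual layer norm over the last dimension
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, keepdim=True, unbiased=False)
manual = (x - mean) / torch.sqrt(var + ln.eps) * ln.weight + ln.bias
print(torch.allclose(ln(x), manual, atol=1e-5))  # True

# what whisper's LayerNorm.forward does: upcast, normalize, cast back to the input dtype
half = x.half()
print(ln(half.float()).type(half.dtype).dtype)   # torch.float16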
@@ -41,15 +49,19 @@ class Linear(nn.Linear):
         )
 
 
+# q: what is the usage of this class? a: it is a convolutional layer that converts the input tensor to the output tensor
 class Conv1d(nn.Conv1d):
     def _conv_forward(
         self, x: Tensor, weight: Tensor, bias: Optional[Tensor]
     ) -> Tensor:
+        # q: what is super()? a: it is a reference to the parent class
+        # q: what is the parent class here? a: it is the nn.Conv1d class
         return super()._conv_forward(
             x, weight.to(x.dtype), None if bias is None else bias.to(x.dtype)
         )
 
 
+# q: what is the usage of this function? a: it returns sinusoids for positional embedding
 def sinusoids(length, channels, max_timescale=10000):
     """Returns sinusoids for positional embedding"""
     assert channels % 2 == 0
@@ -58,8 +70,9 @@ def sinusoids(length, channels, max_timescale=10000):
     scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
     return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)
 
 
+# q: what is the usage of this class? a: it is a multi-head attention layer
 class MultiHeadAttention(nn.Module):
+    # what is n_state? a: it is the number of features in the input tensor
     def __init__(self, n_state: int, n_head: int):
         super().__init__()
         self.n_head = n_head
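The sinusoids function is split across the two hunks above; here it is restated as one runnable piece, with the two middle lines reconstructed from the surrounding definitions, plus a shape check for a 1500-position, 512-channel embedding:

import numpy as np
import torch


def sinusoids(length, channels, max_timescale=10000):
    """Sinusoidal positional embeddings, as in the annotated function above."""
    assert channels % 2 == 0
    log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
    inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2))
    scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
    return torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1)


pe = sinusoids(length=1500, channels=512)
print(pe.shape)             # torch.Size([1500, 512]): one embedding per position
print(pe.min(), pe.max())   # sin/cos values stay within [-1, 1]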
@@ -107,11 +120,15 @@ class MultiHeadAttention(nn.Module):
         w = F.softmax(qk, dim=-1).to(q.dtype)
         return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2), qk.detach()
 
 
+# q: what is the usage of this class? a: it is a residual attention block
 class ResidualAttentionBlock(nn.Module):
+    # q: what is cross attention? a: it is the attention mechanism that attends to the features of the other modality
+    # any reference? a: https://arxiv.org/abs/1706.03762
+    # why do we need cross attention? a: it lets the text decoder attend to the audio encoder's features
    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
         super().__init__()
 
+        # what is n_state? a: it is the number of features in the input tensor
         self.attn = MultiHeadAttention(n_state, n_head)
         self.attn_ln = LayerNorm(n_state)
 
@@ -121,6 +138,8 @@ class ResidualAttentionBlock(nn.Module):
         self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
 
         n_mlp = n_state * 4
+
+        # q: what is mlp? a: it is a multi-layer perceptron
         self.mlp = nn.Sequential(
             Linear(n_state, n_mlp), nn.GELU(), Linear(n_mlp, n_state)
         )
@@ -139,7 +158,7 @@ class ResidualAttentionBlock(nn.Module):
         x = x + self.mlp(self.mlp_ln(x))
         return x
 
+# q: what is the usage of this class? a: it encodes the log-mel spectrogram into audio features
 class AudioEncoder(nn.Module):
     def __init__(
         self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
@@ -154,6 +173,10 @@ class AudioEncoder(nn.Module):
         )
         self.ln_post = LayerNorm(n_state)
 
+    # what is ctx? a: it is the context size
+    # what is the context size? a: it is the number of positions the encoder attends over
+    # so it is derived from the number of mel spectrogram frames in this case? a: yes
     def forward(self, x: Tensor):
         """
         x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
@@ -173,6 +196,7 @@ class AudioEncoder(nn.Module):
         return x
 
 
+# q: what is the usage of this class? a: it decodes text tokens, conditioned on the encoded audio features
 class TextDecoder(nn.Module):
     def __init__(
         self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int
@@ -217,33 +241,46 @@ class TextDecoder(nn.Module):
 
         return logits
 
+# so whisper is made of an audio encoder and a text decoder? a: yes
+# what is the usage of this class? a: it is the model that transcribes the audio to text
 class Whisper(nn.Module):
     def __init__(self, dims: ModelDimensions):
         super().__init__()
         self.dims = dims
         self.encoder = AudioEncoder(
-            self.dims.n_mels,
-            self.dims.n_audio_ctx,
-            self.dims.n_audio_state,
-            self.dims.n_audio_head,
-            self.dims.n_audio_layer,
+            self.dims.n_mels,  # the number of mel frequency bins
+            self.dims.n_audio_ctx,  # the number of positions in the audio context
+            self.dims.n_audio_state,  # the number of features in the audio states
+            self.dims.n_audio_head,  # the number of attention heads in the encoder
+            self.dims.n_audio_layer,  # the number of layers in the encoder
         )
         self.decoder = TextDecoder(
-            self.dims.n_vocab,
-            self.dims.n_text_ctx,
-            self.dims.n_text_state,
-            self.dims.n_text_head,
-            self.dims.n_text_layer,
+            self.dims.n_vocab,  # the size of the vocabulary
+            self.dims.n_text_ctx,  # the maximum number of text tokens (context length)
+            self.dims.n_text_state,  # the number of features in the text states
+            self.dims.n_text_head,  # the number of attention heads in the decoder
+            self.dims.n_text_layer,  # the number of layers in the decoder
+            # you are so clever! thanks! 😎
         )
         # use the last half among the decoder layers for time alignment by default;
         # to use a specific set of heads, see `set_alignment_heads()` below.
+
+        # what is all_heads? a: it is a boolean tensor marking the heads to be used for alignment
+        # what is alignment? a: it is the process of aligning the text tokens with the audio frames (for word timestamps)
+        # what is the shape of all_heads? a: it is (n_text_layer, n_text_head)
+        # why is it of this shape? a: because there is one flag per (decoder layer, attention head) pair
        all_heads = torch.zeros(
            self.dims.n_text_layer, self.dims.n_text_head, dtype=torch.bool
        )
+        # what does the next line mean? a: it marks every head in the second half of the decoder layers for alignment
        all_heads[self.dims.n_text_layer // 2 :] = True
+        # what is register_buffer? a: it is a method that registers a tensor as a buffer
+        # what is a buffer? a: it is part of the module's state, but it is not a trainable parameter
+        # why do we need a buffer here? a: because the alignment heads are not updated during training
        self.register_buffer("alignment_heads", all_heads.to_sparse(), persistent=False)
 
+    # what is the usage of this function? a: it sets the alignment heads
+    # what are alignment heads? a: they are the attention heads used for alignment
     def set_alignment_heads(self, dump: bytes):
         array = np.frombuffer(
             gzip.decompress(base64.b85decode(dump)), dtype=bool
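The alignment-heads comments describe a (n_text_layer, n_text_head) boolean mask stored with register_buffer. A toy sketch of just that bookkeeping, with made-up layer and head counts:

import torch
from torch import nn


class TinyDecoderStub(nn.Module):
    """Toy module that only reproduces the alignment-heads bookkeeping described above."""

    def __init__(self, n_text_layer: int = 6, n_text_head: int = 8):
        super().__init__()
        # one boolean flag per (decoder layer, attention head) pair
        all_heads = torch.zeros(n_text_layer, n_text_head, dtype=torch.bool)
        # by default, mark every head in the second half of the layers for alignment
        all_heads[n_text_layer // 2:] = True
        # a buffer travels with the module (device moves, state_dict when persistent)
        # but is not a trainable parameter and receives no gradients
        self.register_buffer("alignment_heads", all_heads.to_sparse(), persistent=False)


m = TinyDecoderStub()
print(m.alignment_heads.to_dense())
print(any(p is m.alignment_heads for p in m.parameters()))  # False: a buffer, not a parameter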
@@ -264,6 +301,7 @@ class Whisper(nn.Module):
     ) -> Dict[str, torch.Tensor]:
         return self.decoder(tokens, self.encoder(mel))
 
+    # q: what is the usage of @property? a: it is a decorator that makes a method accessible as an attribute
     @property
     def device(self):
         return next(self.parameters()).device
@@ -276,6 +314,7 @@ class Whisper(nn.Module):
     def num_languages(self):
         return self.dims.n_vocab - 51765 - int(self.is_multilingual)
 
+    # q: what is the usage of this function? a: it installs hooks to save the intermediate tensors
     def install_kv_cache_hooks(self, cache: Optional[dict] = None):
         """
         The `MultiHeadAttention` module optionally accepts `kv_cache` which stores the key and value
@@ -293,16 +332,33 @@ class Whisper(nn.Module):
         cache = {**cache} if cache is not None else {}
         hooks = []
 
+        # what does output.shape[1] > self.dims.n_text_ctx mean? a: the output tensor has more tokens than the text context size
+        # what is the purpose of this condition? a: to save the output tensor as-is for the first token or for cross attention
+        # what is the usage of _ here? a: it is a placeholder for the hook's input argument
+        # but _ is not used in the function? a: right, the hook signature requires (module, input, output), but the input is not needed here
+        # what is the text context size? a: it is the maximum number of tokens in the text tensor
+        """
+        Specifically, this method does the following:
+        It checks whether the module (the key or value projection) is already in the cache. If it is not,
+        or if the second dimension of the output tensor (the number of tokens) is larger than the text
+        context size, it stores the output tensor in the cache as-is.
+        If the module is already in the cache and the second dimension of the output is not larger than
+        the text context size, it appends the output tensor to the end of the cached tensor and detaches
+        the result from the computation graph (using detach()).
+        Finally, the method returns the updated cached tensor.
+        This method is used by install_kv_cache_hooks(), which installs forward hooks on the key and value
+        projection modules so that save_to_cache() is called on every forward pass.
+        """
         def save_to_cache(module, _, output):
             if module not in cache or output.shape[1] > self.dims.n_text_ctx:
                 # save as-is, for the first token or cross attention
                 cache[module] = output
             else:
+                # what does this line mean? a: it concatenates the output tensor to the cached tensor
+                # why do we need to concatenate? a: so that keys/values from earlier tokens are reused instead of recomputed
+                # what does detach() mean? a: it detaches the tensor from the computation graph
                 cache[module] = torch.cat([cache[module], output], dim=1).detach()
             return cache[module]
 
 
         def install_hooks(layer: nn.Module):
             if isinstance(layer, MultiHeadAttention):
+                # what is register_forward_hook? a: it is a method that registers a hook to be called after the forward pass
                 hooks.append(layer.key.register_forward_hook(save_to_cache))
                 hooks.append(layer.value.register_forward_hook(save_to_cache))
 
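The save_to_cache discussion, including the translated note above, is easiest to see on a toy module: a forward hook stores the first output as-is, concatenates later outputs along the token dimension, and, by returning a value, replaces the module's output with the cached tensor. A minimal sketch (the module and sizes are made up):

import torch
from torch import nn

proj = nn.Linear(4, 4)  # toy stand-in for one key/value projection inside MultiHeadAttention
cache = {}
MAX_CTX = 8             # stands in for self.dims.n_text_ctx


def save_to_cache(module, _, output):
    # hook signature is (module, input, output); the input is unused, hence the underscore
    if module not in cache or output.shape[1] > MAX_CTX:
        cache[module] = output  # first token (or cross attention): store as-is
    else:
        # later tokens: append along the token dimension and detach from the graph
        cache[module] = torch.cat([cache[module], output], dim=1).detach()
    # returning a value from a forward hook replaces the module's output,
    # which is how the accumulated keys/values get fed back into attention
    return cache[module]


hook = proj.register_forward_hook(save_to_cache)
proj(torch.randn(1, 1, 4))  # decoding step 1
proj(torch.randn(1, 1, 4))  # decoding step 2
proj(torch.randn(1, 1, 4))  # decoding step 3
print(cache[proj].shape)    # torch.Size([1, 3, 4]): keys/values accumulate instead of being recomputed
hook.remove()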
whisper/normalizers/english.py

@@ -1,3 +1,12 @@
+"""
+q: what is the usage of this file? a: this file implements the English text normalizers
+q: do you think english.json is complicated? a: no, it's a simple mapping of british-american spellings
+q: do you have a simpler way to realize this? a: yes, we can use a dictionary to map the words
+q: so why doesn't english.json use a dictionary? a: it's easier to read and write the mappings in a json file
+q: how to use the dictionary you mentioned, i mean what function to call?
+a: we can use the dictionary in the EnglishSpellingNormalizer class
+"""
+
 import json
 import os
 import re
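The english.json Q&A above describes a plain word-for-word mapping. Here is a minimal sketch of a dictionary-based spelling normalizer in that spirit; the mapping entries are illustrative, and the real EnglishSpellingNormalizer loads the full mapping from english.json.

# a few illustrative entries in the spirit of english.json (british -> american)
SPELLING = {"colour": "color", "labelled": "labeled", "realise": "realize"}


class SpellingNormalizerSketch:
    """Minimal sketch of a dictionary-based spelling normalizer."""

    def __init__(self, mapping: dict):
        self.mapping = mapping

    def __call__(self, s: str) -> str:
        return " ".join(self.mapping.get(word, word) for word in s.split())


normalizer = SpellingNormalizerSketch(SPELLING)
print(normalizer("they labelled the colour chart"))  # they labeled the color chart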
whisper/timing.py

@@ -1,3 +1,11 @@
+"""
+q: what is the usage of this file?
+a: This file contains the implementation of the `find_alignment` function,
+which is used to align the text tokens with the audio frames.
+The `add_word_timestamps` function is used to add timestamps to the words in the segments.
+"""
+
 import itertools
 import subprocess
 import warnings

whisper/tokenizer.py

@@ -1,3 +1,6 @@
+"""
+q: what is the usage of this file? a: this file is used to tokenize the text data
+"""
 import base64
 import os
 import string
whisper/transcribe.py

@@ -1,3 +1,6 @@
+"""
+q: what is the usage of this file? a: this file implements the transcribe() pipeline and the command-line interface
+"""
 import argparse
 import os
 import traceback
@@ -34,7 +37,8 @@ from .utils import (
 if TYPE_CHECKING:
     from .model import Whisper
 
+# hard-coded audio hyperparameters
+#
 def transcribe(
     model: "Whisper",
     audio: Union[str, np.ndarray, torch.Tensor],
@@ -118,7 +122,9 @@ def transcribe(
         A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
         the spoken language ("language"), which is detected when `decode_options["language"]` is None.
     """
-    dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
+    # what is fp16? a: half-precision floating-point format
+    # is fp16 better than fp32? a: fp16 is faster but less accurate
+    dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32  # type: ignore
     if model.device == torch.device("cpu"):
         if torch.cuda.is_available():
             warnings.warn("Performing inference on CPU when CUDA is available")
@@ -130,8 +136,10 @@ def transcribe(
             decode_options["fp16"] = False
 
     # Pad 30-seconds of silence to the input audio, for slicing
-    mel = log_mel_spectrogram(audio, model.dims.n_mels, padding=N_SAMPLES)
-    content_frames = mel.shape[-1] - N_FRAMES
+    # why? a: to make sure the audio is long enough to be processed
+    mel = log_mel_spectrogram(audio, model.dims.n_mels, padding=N_SAMPLES)  # type: ignore
+    # why does it need to subtract N_FRAMES? a: to get the number of frames in the actual content
+    content_frames = mel.shape[-1] - N_FRAMES  # number of frames in the content
     content_duration = float(content_frames * HOP_LENGTH / SAMPLE_RATE)
 
     if decode_options.get("language", None) is None:
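A back-of-envelope restatement of why content_frames = mel.shape[-1] - N_FRAMES: the input is padded with 30 seconds of silence, so subtracting one chunk's worth of frames leaves the frames that belong to the real audio. Exact frame counts from log_mel_spectrogram can differ by one because of STFT framing.

# the constants from whisper/audio.py, restated to make the arithmetic explicit
SAMPLE_RATE = 16000
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE  # 480_000 samples per 30-second chunk
N_FRAMES = N_SAMPLES // HOP_LENGTH      # 3_000 mel frames per 30-second chunk

# suppose the input audio is 45 seconds long
audio_samples = 45 * SAMPLE_RATE
padded_frames = (audio_samples + N_SAMPLES) // HOP_LENGTH  # mel frames after padding 30 s of silence
content_frames = padded_frames - N_FRAMES                  # frames that belong to the real audio
content_duration = content_frames * HOP_LENGTH / SAMPLE_RATE

print(padded_frames, content_frames, content_duration)     # 7500 4500 45.0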
@@ -498,6 +506,7 @@ def transcribe(
     )
 
 
+# what does cli stand for? command line interface
 def cli():
     from . import available_models
 
@@ -1,3 +1,6 @@
+"""
+q: what is the usage of this file? a: this file contains the audio processing functions
+"""
 from functools import lru_cache
 
 import numpy as np
@@ -1,3 +1,10 @@
+"""
+q: what is the usage of this file? a: this file contains the utility functions
+
+q: what is the usage of meanwhile.json? a: it is used to store the results of the meanwhile tests
+q: what is the meanwhile tests? a: it is a test suite for the whisper project
+"""
+
 import json
 import os
 import re