Use ndimage.median_filter instead of signal.medfilt (#812)

For a 30-second audio file that contained no silence, ndimage.median_filter took 7s whereas signal.medfilt took 30s. Co-authored-by: Umar Farooqi <umar@paystash.com> Co-authored-by: Jong Wook Kim <jongwook@nyu.edu>
2025-07-08 20:52:29 +00:00 · 2023-01-17 17:43:05 -05:00 · 2023-01-17 17:43:05 -05:00 · f0083e7eb2
commit f0083e7eb2
parent a84191faae
1 changed files with 2 additions and 2 deletions
--- a/notebooks/Multilingual_ASR.ipynb
+++ b/notebooks/Multilingual_ASR.ipynb
@ -874,7 +874,7 @@
        "from IPython.display import display, HTML\n",
        "from whisper.tokenizer import get_tokenizer\n",
        "from dtw import dtw\n",
-        "from scipy.signal import medfilt\n",
+        "from scipy.ndimage import median_filter\n",
        "\n",
        "%matplotlib inline\n",
        "%config InlineBackend.figure_format = \"retina\""
@ -3610,7 +3610,7 @@
        "\n",
        "    weights = torch.cat(QKs)  # layers * heads * tokens * frames    \n",
        "    weights = weights[:, :, :, : duration // AUDIO_SAMPLES_PER_TOKEN].cpu()\n",
-        "    weights = medfilt(weights, (1, 1, 1, medfilt_width))\n",
+        "    weights = median_filter(weights, (1, 1, 1, medfilt_width))\n",
        "    weights = torch.tensor(weights * qk_scale).softmax(dim=-1)\n",
        "    \n",
        "    w = weights / weights.norm(dim=-2, keepdim=True)\n",