From 68e44bd83ce6c3e352f74b266aa39d8b649af9e3 Mon Sep 17 00:00:00 2001
From: Jong Wook Kim
Date: Thu, 29 Dec 2022 23:02:52 -0700
Subject: [PATCH] saving the qk matrix in the attention module for convenience

---
 whisper/model.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/whisper/model.py b/whisper/model.py
index ca3928e..820d3c1 100644
--- a/whisper/model.py
+++ b/whisper/model.py
@@ -62,6 +62,7 @@ class MultiHeadAttention(nn.Module):
         self.key = Linear(n_state, n_state, bias=False)
         self.value = Linear(n_state, n_state)
         self.out = Linear(n_state, n_state)
+        self.last_qk = None
 
     def forward(
         self,
@@ -96,6 +97,8 @@ class MultiHeadAttention(nn.Module):
         if mask is not None:
             qk = qk + mask[:n_ctx, :n_ctx]
 
+        self.last_qk = qk.detach()
+
         w = F.softmax(qk.float(), dim=-1).to(q.dtype)
         return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
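
For reference, a minimal sketch (not part of the patch) of how the saved matrix might be inspected after a forward pass. It assumes the openai-whisper package with this patch applied; the model size ("base") and audio path ("audio.wav") are placeholders.

import torch
import whisper

model = whisper.load_model("base")

# Standard whisper preprocessing: load audio, pad/trim to 30 seconds,
# and compute the log-mel spectrogram the encoder expects.
audio = whisper.pad_or_trim(whisper.load_audio("audio.wav"))
mel = whisper.log_mel_spectrogram(audio).to(model.device)

with torch.no_grad():
    model.encoder(mel.unsqueeze(0))

# With the patch applied, each attention module holds the raw (pre-softmax,
# post-mask) attention logits from its most recent forward pass, shaped
# (batch, n_head, n_ctx, n_ctx).
for i, block in enumerate(model.encoder.blocks):
    qk = block.attn.last_qk
    weights = qk.float().softmax(dim=-1)  # recover the attention weights
    print(f"block {i}: qk shape {tuple(qk.shape)}")

Because the patch stores qk with detach(), the saved tensor is kept out of the autograd graph and adds no backward-pass cost; each forward pass simply overwrites the previous value.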