Mirror of https://github.com/openai/whisper.git, synced 2025-11-26 23:46:09 +00:00
saving the qk matrix in the attention module for convenience

parent 0b5dcfdef7
commit 68e44bd83c
@@ -62,6 +62,7 @@ class MultiHeadAttention(nn.Module):
         self.key = Linear(n_state, n_state, bias=False)
         self.value = Linear(n_state, n_state)
         self.out = Linear(n_state, n_state)
+        self.last_qk = None
 
     def forward(
         self,
@@ -96,6 +97,8 @@ class MultiHeadAttention(nn.Module):
         if mask is not None:
             qk = qk + mask[:n_ctx, :n_ctx]
 
+        self.last_qk = qk.detach()
+
         w = F.softmax(qk.float(), dim=-1).to(q.dtype)
         return (w @ v).permute(0, 2, 1, 3).flatten(start_dim=2)
 
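With this change applied, every MultiHeadAttention module keeps its most recent pre-softmax attention scores in last_qk. Storing qk.detach() rather than qk keeps the saved tensor out of the autograd graph, so code that inspects it does not accumulate gradient history. Below is a minimal sketch of reading the matrix back after an encoder pass; load_model, dims.n_mels, encoder, blocks, and attn are standard whisper API, while the "tiny" checkpoint, the dummy silent input, and the choice of block are illustrative assumptions.

# Sketch: recover the stored qk matrix after a forward pass.
# Assumes a whisper checkout with this patch applied.
import torch
import whisper

model = whisper.load_model("tiny", device="cpu")
mel = torch.zeros(1, model.dims.n_mels, 3000)  # one batch of 30 s of silence

with torch.no_grad():
    model.encoder(mel)  # each attention block records its qk as a side effect

qk = model.encoder.blocks[0].attn.last_qk  # shape: (batch, n_head, n_ctx, n_ctx)
w = torch.softmax(qk.float(), dim=-1)      # reconstruct the attention weights
print(qk.shape, w.shape)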