clarify transcription parameter

2025-11-24 14:35:57 +00:00 · 2024-07-22 16:14:30 -07:00 · 2024-07-22 16:14:30 -07:00 · 1caba7d5d4
commit 1caba7d5d4
parent 092cb3409e
6 changed files with 17 additions and 3 deletions
--- a/jfk.json
+++ b/jfk.json
@@ -0,0 +1 @@
 {"segments": [{"id": 0, "seek": 0, "start": 0.0, "end": 11.0, "text": " And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.", "tokens": [50364, 400, 370, 452, 7177, 6280, 11, 1029, 406, 437, 428, 1941, 393, 360, 337, 291, 11, 1029, 437, 291, 393, 360, 337, 428, 1941, 13, 50914], "temperature": 0.0, "avg_logprob": -0.20427462032863072, "compression_ratio": 1.3544303797468353, "no_speech_prob": 0.04382958635687828}], "language": "en", "text": " And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country."}
--- a/jfk.srt
+++ b/jfk.srt
@@ -0,0 +1,4 @@
 1
 00:00:00,000 --> 00:00:11,000
 And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
--- a/jfk.tsv
+++ b/jfk.tsv
@@ -0,0 +1,2 @@
 start	end	text
 0	11000	And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
--- a/jfk.txt
+++ b/jfk.txt
@@ -0,0 +1 @@
 And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
--- a/jfk.vtt
+++ b/jfk.vtt
@@ -0,0 +1,5 @@
 WEBVTT
 00:00.000 --> 00:11.000
 And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -769,14 +769,15 @@ def audio_tensor(audio: Union[str, np.ndarray, torch.Tensor]) -> torch.Tensor:
 class MinimalTranscriber(Transcriber):
    exact: bool = True
-    chlen: float = CHUNK_LENGTH
+    # amount of time per chunk that is considered in-context
    contextualized: float = CHUNK_LENGTH
    async def process(self, stream: ArrayStream, **kw) -> dict:
-        data = await stream.request(self.chlen, self.exact)
+        data = await stream.request(CHUNK_LENGTH, self.exact)
        while data.shape[-1] > 0:
            self(data, stream.offset, True)
            t = (
-                self.chlen
+                self.contextualized
                - (stream.offset + data.shape[-1] - self.seek) / FRAMES_PER_SECOND
                + CHUNK_LENGTH
            )
		`@@ -0,0 +1 @@`
							{"segments": [{"id": 0, "seek": 0, "start": 0.0, "end": 11.0, "text": " And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.", "tokens": [50364, 400, 370, 452, 7177, 6280, 11, 1029, 406, 437, 428, 1941, 393, 360, 337, 291, 11, 1029, 437, 291, 393, 360, 337, 428, 1941, 13, 50914], "temperature": 0.0, "avg_logprob": -0.20427462032863072, "compression_ratio": 1.3544303797468353, "no_speech_prob": 0.04382958635687828}], "language": "en", "text": " And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country."}
		`@@ -0,0 +1,4 @@`
							`1`
							`00:00:00,000 --> 00:00:11,000`
							`And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.`
		`@@ -0,0 +1,2 @@`
							`start end text`
							`0 11000 And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.`