mirror of
https://github.com/openai/whisper.git
synced 2025-11-24 14:35:57 +00:00
Fix timestamps and strip extraneous whitespace in WebVTT output (#219)
* Use two-digit hours in WebVTT timestamps

  Per the WebVTT specification [0]:

  > A WebVTT timestamp consists of the following components, in the given
  > order:
  >
  > 1. Optionally (required if hours is non-zero):
  >    1. Two or more ASCII digits, representing the hours as a base ten
  >       integer.
  >    2. A U+003A COLON character (:)

  YouTube won’t accept timestamps containing single-digit hours.

  [0] https://www.w3.org/TR/webvtt1/#webvtt-timestamp

* Strip segment text in WebVTT output

  We already do this for plain text and SubRip output, so we should do it for WebVTT too.
This commit is contained in:
parent
0b1ba3d46e
commit
02b74308ff
@@ -40,7 +40,7 @@ def format_timestamp(seconds: float, always_include_hours: bool = False, decimal
|
|||||||
seconds = milliseconds // 1_000
|
seconds = milliseconds // 1_000
|
||||||
milliseconds -= seconds * 1_000
|
milliseconds -= seconds * 1_000
|
||||||
|
|
||||||
hours_marker = f"{hours}:" if always_include_hours or hours > 0 else ""
|
hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
|
||||||
return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
|
return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
|
||||||
|
|
||||||
|
|
||||||
@@ -54,7 +54,7 @@ def write_vtt(transcript: Iterator[dict], file: TextIO):
|
|||||||
for segment in transcript:
|
for segment in transcript:
|
||||||
print(
|
print(
|
||||||
f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
|
f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
|
||||||
f"{segment['text'].replace('-->', '->')}\n",
|
f"{segment['text'].strip().replace('-->', '->')}\n",
|
||||||
file=file,
|
file=file,
|
||||||
flush=True,
|
flush=True,
|
||||||
)
|
)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user