word timing tweaks (#1559)

* word timing tweaks

* comment on eot

* clearer comments
This commit is contained in:
taylorchu 2023-08-07 14:48:56 -07:00 committed by GitHub
parent b91c907694
commit e8622f9afc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -214,6 +214,13 @@ def find_alignment(
text_indices, time_indices = dtw(-matrix)
words, word_tokens = tokenizer.split_to_word_tokens(text_tokens + [tokenizer.eot])
if len(word_tokens) <= 1:
# return on eot only
# >>> np.pad([], (1, 0))
# array([0.])
# This results in crashes when we lookup jump_times with float, like
# IndexError: arrays used as indices must be of integer (or boolean) type
return []
word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
@ -297,8 +304,6 @@ def add_word_timestamps(
# hack: truncate long words at sentence boundaries.
# a better segmentation algorithm based on VAD should be able to replace this.
if len(word_durations) > 0:
median_duration = np.median(word_durations)
max_duration = median_duration * 2
sentence_end_marks = ".。!?"
# ensure words at sentence boundaries are not longer than twice the median word duration.
for i in range(1, len(alignment)):