Merge 1d9e6af04d9fdfd8228a785c066986eaf190bdc8 into 517a43ecd132a2089d85f4ebc044728a71d49f6e

2025-03-30 14:28:27 +00:00 · 2025-01-21 12:01:30 +05:30 · 2025-01-21 12:01:30 +05:30 · 8b3f19c82c
commit 8b3f19c82c
parent 517a43ecd1 1d9e6af04d
1 changed file with 8 additions and 8 deletions
--- a/whisper/normalizers/english.py
+++ b/whisper/normalizers/english.py
@@ -406,11 +406,11 @@ class EnglishNumberNormalizer:
        s = " ".join(results)

        # put a space at number/letter boundary
-        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
-        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
+        s = re.sub(r"([a-z])(\d)", r"\1 \2", s)
+        s = re.sub(r"(\d)([a-z])", r"\1 \2", s)

        # but remove spaces which could be a suffix
-        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)
+        s = re.sub(r"(\d)\s+(st|nd|rd|th|s)\b", r"\1\2", s)

        return s

@@ -431,8 +431,8 @@ class EnglishNumberNormalizer:
                return m.string

        # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
-        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
-        s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)
+        s = re.sub(r"([€£$])(\d+) (?:and )?¢(\d{1,2})\b", combine_cents, s)
+        s = re.sub(r"[€£$]0.(\d{1,2})\b", extract_cents, s)

        # write "one(s)" instead of "1(s)", just for the readability
        s = re.sub(r"\b1(s?)\b", r"one\1", s)
@@ -535,15 +535,15 @@ class EnglishTextNormalizer:
            s = re.sub(pattern, replacement, s)

        s = re.sub(r"(\d),(\d)", r"\1\2", s)  # remove commas between digits
-        s = re.sub(r"\.([^0-9]|$)", r" \1", s)  # remove periods not followed by numbers
+        s = re.sub(r"\.(\D|$)", r" \1", s)  # remove periods not followed by numbers
        s = remove_symbols_and_diacritics(s, keep=".%$¢€£")  # keep numeric symbols

        s = self.standardize_numbers(s)
        s = self.standardize_spellings(s)

        # now remove prefix/suffix symbols that are not preceded/followed by numbers
-        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
-        s = re.sub(r"([^0-9])%", r"\1 ", s)
+        s = re.sub(r"[.$¢€£](\D)", r" \1", s)
+        s = re.sub(r"(\D)%", r"\1 ", s)

        s = re.sub(r"\s+", " ", s)  # replace any successive whitespaces with a space