added comments to EnglishTextNormalizer

This commit is contained in:
ethangearey 2024-05-05 22:35:06 -04:00
parent ba3f3cd54b
commit 5809f7d176

View File

@ -464,9 +464,10 @@ class EnglishSpellingNormalizer:
class EnglishTextNormalizer: class EnglishTextNormalizer:
def __init__(self): def __init__(self):
self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b" self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b" # NOTE: any more here?
self.replacers = { self.replacers = {
# common contractions # common contractions
# NOTE: valid to ignore ALL of these contractions? test different ones here
r"\bwon't\b": "will not", r"\bwon't\b": "will not",
r"\bcan't\b": "can not", r"\bcan't\b": "can not",
r"\blet's\b": "let us", r"\blet's\b": "let us",
@ -504,6 +505,7 @@ class EnglishTextNormalizer:
r"\bsr\b": "senior ", r"\bsr\b": "senior ",
r"\besq\b": "esquire ", r"\besq\b": "esquire ",
# perfect tenses, ideally it should be any past participles, but it's harder.. # perfect tenses, ideally it should be any past participles, but it's harder..
# NOTE: the comment above says covering all past participles is "harder" -> implement that ourselves
r"'d been\b": " had been", r"'d been\b": " had been",
r"'s been\b": " has been", r"'s been\b": " has been",
r"'d gone\b": " had gone", r"'d gone\b": " had gone",
@ -520,6 +522,11 @@ class EnglishTextNormalizer:
r"'ve\b": " have", r"'ve\b": " have",
r"'m\b": " am", r"'m\b": " am",
} }
# NOTE: I think the number and spelling normalizers don't leave much room for change.
# We can definitely handle the past participles and remove the clearly ambiguous cases.
# Remove some of the contractions.
# Try to think of additional rules; ask GPT for help.
self.standardize_numbers = EnglishNumberNormalizer() self.standardize_numbers = EnglishNumberNormalizer()
self.standardize_spellings = EnglishSpellingNormalizer() self.standardize_spellings = EnglishSpellingNormalizer()