diff --git a/whisper/decoding.py b/whisper/decoding.py
index 49485d0..fa43c63 100644
--- a/whisper/decoding.py
+++ b/whisper/decoding.py
@@ -112,7 +112,13 @@ class DecodingOptions:
 
     # implementation details
     fp16: bool = True  # use fp16 for most of the calculation
+    bf16: bool = False  # use bf16 for most of the calculation
 
+    def __post_init__(self):
+        if self.fp16 and self.bf16:
+            raise ValueError("fp16 and bf16 cannot both be True")
+        if self.bf16:
+            object.__setattr__(self, "fp16", False)
 
 @dataclass(frozen=True)
 class DecodingResult:
@@ -655,7 +661,9 @@ class DecodingTask:
             audio_features = self.model.encoder(mel)
 
         if audio_features.dtype != (
-            torch.float16 if self.options.fp16 else torch.float32
+            torch.float16 if self.options.fp16 else
+            torch.bfloat16 if self.options.bf16 else
+            torch.float32
         ):
-            return TypeError(
+            raise TypeError(
                 f"audio_features has an incorrect dtype: {audio_features.dtype}"
diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index 0a4cc36..dc3dc00 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -132,7 +132,7 @@ def transcribe(
             warnings.warn("FP16 is not supported on CPU; using FP32 instead")
             dtype = torch.float32
 
-    if dtype == torch.float32:
+    if dtype == torch.float32 or dtype == torch.bfloat16:
         decode_options["fp16"] = False
 
     # Pad 30-seconds of silence to the input audio, for slicing
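
For reference, a minimal usage sketch of the new flag. The checkpoint name, the explicit `.to(torch.bfloat16)` conversion, and the audio path below are illustrative assumptions, not part of this diff. Note that because `fp16` defaults to `True`, enabling bf16 requires passing `fp16=False` explicitly, otherwise `__post_init__` raises.

```python
import torch
import whisper
from whisper.decoding import DecodingOptions

# Assumed setup: any checkpoint works; converting the weights to bfloat16
# is a prerequisite that this diff does not handle itself.
model = whisper.load_model("base")
model = model.to(torch.bfloat16)

# fp16 defaults to True, so it must be disabled explicitly alongside
# bf16=True; DecodingOptions(bf16=True) alone raises the new ValueError.
options = DecodingOptions(fp16=False, bf16=True)

# Standard Whisper preprocessing, with the mel cast to bfloat16 so the
# encoder output passes the dtype check patched above.
audio = whisper.pad_or_trim(whisper.load_audio("audio.wav"))
mel = whisper.log_mel_spectrogram(audio).to(model.device, torch.bfloat16)

result = whisper.decode(model, mel, options)
print(result.text)
```

The `object.__setattr__` call in `__post_init__` is required because `DecodingOptions` is a frozen dataclass, so its fields cannot be reassigned through normal attribute access.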