descriptinc · iyaja · Mar 6, 2024 · Mar 6, 2024
diff --git a/audiotools/core/whisper.py b/audiotools/core/whisper.py
@@ -21,7 +21,7 @@ def setup_whisper(
  ).to(self.whisper_device)
  self.is_initialized = True
 
- def get_whisper_features(self) -> torch.Tensor:
+ def get_whisper_features(self, **kwargs) -> torch.Tensor:
  """Preprocess audio signal as per the whisper model's training config.
 
  Returns
@@ -49,11 +49,12 @@ def get_whisper_features(self) -> torch.Tensor:
  raw_speech,
  sampling_rate=self.whisper_processor.feature_extractor.sampling_rate,
  return_tensors="pt",
+ **kwargs
  ).input_features
 
  return input_features
 
- def get_whisper_transcript(self) -> str:
+ def get_whisper_transcript(self, **kwargs) -> str:
  """Get the transcript of the audio signal using the whisper model.
 
  Returns
@@ -69,12 +70,12 @@ def get_whisper_transcript(self) -> str:
 
  with torch.inference_mode():
  input_features = input_features.to(self.whisper_device)
- generated_ids = self.whisper_model.generate(inputs=input_features)
+ generated_ids = self.whisper_model.generate(input_features=input_features, **kwargs)
 
- transcription = self.whisper_processor.batch_decode(generated_ids)
+ transcription = self.whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)
  return transcription[0]
 
- def get_whisper_embeddings(self) -> torch.Tensor:
+ def get_whisper_embeddings(self, **kwargs) -> torch.Tensor:
  """Get the last hidden state embeddings of the audio signal using the whisper model.
 
  Returns
@@ -92,6 +93,6 @@ def get_whisper_embeddings(self) -> torch.Tensor:
 
  with torch.inference_mode():
  input_features = input_features.to(self.whisper_device)
- embeddings = encoder(input_features)
+ embeddings = encoder(input_features, **kwargs)
 
  return embeddings.last_hidden_state