
New PR for Faster Whisper: Batching Support, Speed Boosts, and Quality Enhancements #856

Merged 145 commits on Jul 18, 2024

Changes from 70 commits
fc54cb9
seed, multilingual and fixes
Jiltseb Jun 9, 2023
84d58fa
added languages in tokenizer
Jiltseb Jun 14, 2023
63bea66
multilingual fixes
Jiltseb Jun 21, 2023
b95d694
vocabulary extension fix for downloads
Jiltseb Jun 21, 2023
a8626bb
code fixes for multilingual
Jiltseb Jun 28, 2023
c2ca8d4
Squash long words at window and sentence boundaries
Jiltseb Jul 4, 2023
9edf960
added commits specifying changes to original package
Jiltseb Jul 26, 2023
d008650
seed, multilingual and fixes
Jiltseb Jun 9, 2023
2573982
added languages in tokenizer
Jiltseb Jun 14, 2023
8add326
multilingual fixes
Jiltseb Jun 21, 2023
afc3f5c
vocabulary extension fix for downloads
Jiltseb Jun 21, 2023
dd55c03
code fixes for multilingual
Jiltseb Jun 28, 2023
d34780e
Squash long words at window and sentence boundaries
Jiltseb Jul 4, 2023
9fab8d9
added commits specifying changes to original package
Jiltseb Jul 26, 2023
162fbf0
modifications based on review
Jiltseb Jul 28, 2023
ca6a2ba
removed LANGUAGES from tokenizer and added numpy requirements
Jiltseb Oct 6, 2023
0df6953
Merge remote-tracking branch 'upstream/master'
Jiltseb Oct 9, 2023
988c528
Merge local master to 'updated_js_v2.1'
Jiltseb Oct 9, 2023
443eb86
Merge pull request #1 from mobiusml/js_asr_v2.1_pr
Jiltseb Oct 9, 2023
6a51407
Update requirements.txt
Jiltseb Oct 9, 2023
4138e16
Merge pull request #2 from SYSTRAN/master
Jiltseb Dec 12, 2023
b906a98
changes to README.md
Jiltseb Dec 13, 2023
0464122
Added BatchedInferencePipeline
Jiltseb Dec 13, 2023
78b5cd7
Added language detection from multiple segments and batched inference…
Jiltseb Dec 13, 2023
f397e37
added additional packages
Jiltseb Dec 13, 2023
83895ac
changes to batched inference based on the review
Jiltseb Dec 20, 2023
e1c1699
change in silence detection
Jiltseb Dec 21, 2023
b516bc8
Merge pull request #3 from mobiusml/batched_asr
Jiltseb Dec 22, 2023
3477d86
Merge pull request #4 from SYSTRAN/master
Jiltseb Jan 22, 2024
95df9eb
added logic for torchaudio based feature extraction
Jiltseb Jan 23, 2024
0cc2d1d
added requirements
Jiltseb Jan 23, 2024
d6624ff
added feature extraction in README
Jiltseb Jan 23, 2024
fa69694
Merge pull request #5 from mobiusml/add_new_feat_extract
Jiltseb Jan 23, 2024
6698a9a
removing unwanted dataclasses and non-generator transcribe function, …
Jiltseb Mar 19, 2024
1b6376f
Merge remote-tracking branch systran/faster_whisper 'upstream/master'…
Jiltseb Mar 19, 2024
92867e3
uses same type annotation as faster_whisper for batched transcribe, c…
Jiltseb Mar 25, 2024
8452cf2
added jsons for dict conversion
Jiltseb Mar 25, 2024
4535963
made vad_segments as optional parameter, modified docstring
Jiltseb Mar 25, 2024
95671d2
made default batched asr options optional as this can be taken care d…
Jiltseb Mar 25, 2024
5fa21b8
Merge pull request #7 from mobiusml/fixes_and_update
Jiltseb Mar 26, 2024
b421086
Update requirements.txt
Jiltseb Mar 26, 2024
16d54e5
Update requirements.txt
Jiltseb Mar 26, 2024
827df36
Update requirements.txt
Jiltseb Mar 27, 2024
911c62d
Update requirements.txt
Jiltseb Mar 27, 2024
fcf8519
merging with systran fw
Jiltseb Apr 8, 2024
e288337
adding vad model and defaults for language detection
Jiltseb Apr 8, 2024
9c85222
adding utility functions for vad model
Jiltseb Apr 8, 2024
21f4640
add pyannote dependency
Jiltseb Apr 8, 2024
eff5e23
adding VAD model, tests and update README
Jiltseb Apr 9, 2024
caaa593
update requirements
Jiltseb Apr 10, 2024
538366b
Merge pull request #8 from mobiusml/fw_pr
Jiltseb Apr 11, 2024
c41e4f2
added 'use_vad_model' to better handle vad segments
Jiltseb Apr 12, 2024
0e8fa00
Update error message
Jiltseb Apr 12, 2024
0d6c62e
Merge pull request #9 from mobiusml/fw_pr
Jiltseb Apr 12, 2024
56d68a1
added gpu implementation for vad by default
Jiltseb Apr 28, 2024
2812d99
adding a vad_device, modifying vad_url
Jiltseb Apr 29, 2024
1cd3c60
adding get_device function
Jiltseb Apr 29, 2024
3f27636
Merge pull request #10 from mobiusml/fw_pr_compliance
Jiltseb Apr 29, 2024
93c327d
updating the fork
Jiltseb May 17, 2024
2152d11
Merge remote-tracking branch 'upstream/master' into pr_expt
Jiltseb May 22, 2024
10242fc
updated version, credits to whisper-x, model made optional
Jiltseb May 22, 2024
2dde3c9
Merge branch 'master' into fw_compliance
Jiltseb May 22, 2024
8fd2ec0
Merge pull request #11 from mobiusml/fw_compliance
Jiltseb May 24, 2024
0fd5003
added compatibility for python 3.8
Jiltseb May 24, 2024
9d70f0f
Reformatted the code
Jiltseb May 24, 2024
d263cbd
Merge pull request #12 from mobiusml/fw_compliance
Jiltseb May 24, 2024
c9e5f3b
making default vad_device same as asr model device
Jiltseb May 24, 2024
883be4d
added docstring
Jiltseb May 24, 2024
18bdaa8
added docstring
Jiltseb May 24, 2024
b10b8cb
Merge pull request #13 from mobiusml/fw_compliance
Jiltseb May 24, 2024
ce21fc7
Merge remote-tracking branch 'upstream/master'
Jiltseb Jun 11, 2024
afcc0f6
changes after review suggestions: remove redundant info, add vad mode…
Jiltseb Jun 11, 2024
0b63e22
modified timings for edge padding
Jiltseb Jun 12, 2024
e3dc61d
adding word_timestamps fir batched version
Jiltseb Jun 12, 2024
c694174
remove the input dictionary in place modification
Jiltseb Jun 13, 2024
a0d3891
adding model file
Jiltseb Jun 13, 2024
3c22842
Merge pull request #14 from mobiusml/fw_changes
Jiltseb Jun 13, 2024
d30b377
removing clip_timestamps and redundant info, minor typos
Jiltseb Jun 17, 2024
9937ab7
Merge pull request #15 from mobiusml/fw_changes
Jiltseb Jun 17, 2024
5c3e6f2
test scripts for word level timestamps, audios less than chunk_length…
Jiltseb Jun 18, 2024
d1f4a7e
added code validation
Jiltseb Jun 18, 2024
46310af
Merge pull request #16 from mobiusml/fw_changes
Jiltseb Jun 18, 2024
7498451
Update MANIFEST.in to include pyannote asset
hargunmujral Jun 20, 2024
307de38
Merge pull request #17 from hargunmujral/patch-1
Jiltseb Jun 20, 2024
17e30a4
.
MahmoudAshraf97 Jun 20, 2024
46532fc
Merge branch 'mobiusml:master' into master
MahmoudAshraf97 Jun 20, 2024
ad2379b
remove tokenizer reinitialization
MahmoudAshraf97 Jun 20, 2024
abcbedd
remove the need for a separate `encode_batched` function
MahmoudAshraf97 Jun 21, 2024
f584a6c
fix flake8 error
MahmoudAshraf97 Jun 21, 2024
1bd1bf7
Added punctuation changes in word_timestamps, removed jsons requirement
Jiltseb Jun 21, 2024
ebf7b65
enable word timestamps using original functions
MahmoudAshraf97 Jun 21, 2024
7f84e34
* remove `PyAV` and use `torchaudio` instead, this fixes the memory l…
MahmoudAshraf97 Jun 22, 2024
b54d828
added back `np.ndarray` support for `transcribe`
MahmoudAshraf97 Jun 24, 2024
2c617c2
fix wrong padding scheme leading to very high WER
MahmoudAshraf97 Jun 24, 2024
99d61e0
remove `num_workers` argument from batched `transcribe`
MahmoudAshraf97 Jun 24, 2024
aef4b97
generalized word timestamps function
MahmoudAshraf97 Jun 24, 2024
5fc5fca
remove redundant parameters related to `num_workers`
MahmoudAshraf97 Jun 25, 2024
389da33
fix word timestamps for non-batched inference
MahmoudAshraf97 Jun 25, 2024
2b0a252
support `without_timestamps` in batched mode
MahmoudAshraf97 Jun 25, 2024
f03d8ca
adjust tests
MahmoudAshraf97 Jun 25, 2024
7c38429
fix typing hints for older python versions
MahmoudAshraf97 Jun 25, 2024
579da0e
correct timestamps
MahmoudAshraf97 Jun 26, 2024
8642f1d
use original `Segment` instead of `BatchedSegment`
MahmoudAshraf97 Jun 27, 2024
6e47bd3
* added `duration_after_vad`, `all_language_probs` to `info`
MahmoudAshraf97 Jun 27, 2024
537317f
formatting changes
MahmoudAshraf97 Jun 27, 2024
74db8be
.
MahmoudAshraf97 Jun 27, 2024
fcf0e82
remove `float16` conversion in feature extractor as it led to halluci…
MahmoudAshraf97 Jun 27, 2024
9f78b36
enable running benchmark from anywhere
MahmoudAshraf97 Jun 29, 2024
d95c7a6
review feature extraction implementation
MahmoudAshraf97 Jun 29, 2024
968057e
formatting fixes
MahmoudAshraf97 Jun 29, 2024
eff81f5
Merge pull request #18 from MahmoudAshraf97/master
Jiltseb Jul 1, 2024
71fca47
Merge remote-tracking branch 'origin/master' into final_changes
Jiltseb Jul 1, 2024
369f297
black tool reformats
Jiltseb Jul 1, 2024
248d517
Merge remote-tracking branch 'upstream/master' into final_changes
Jiltseb Jul 1, 2024
647c092
revert silero change to master
Jiltseb Jul 1, 2024
923c5d9
moving language_id functions to WhisperModel class and removing other…
Jiltseb Jul 1, 2024
70346ca
evaluate lang_detect to a false boolean if not found
Jiltseb Jul 1, 2024
3235640
review changes
MahmoudAshraf97 Jul 1, 2024
781c051
Merge branch 'mobiusml:master' into master
MahmoudAshraf97 Jul 1, 2024
aea77b1
Merge pull request #21 from MahmoudAshraf97/master
Jiltseb Jul 1, 2024
c26e4e2
Merge remote-tracking branch 'origin/master' into final_changes
Jiltseb Jul 1, 2024
059d849
rename detect_language to detect_langauge_function in WhisperModel
Jiltseb Jul 1, 2024
5c6f6b5
Merge pull request #20 from mobiusml/fw_changes
Jiltseb Jul 1, 2024
3a63df0
fix conflicts with systran master
MahmoudAshraf97 Jul 5, 2024
3271a4a
Merge pull request #23 from MahmoudAshraf97/master
Jiltseb Jul 5, 2024
e57b5ca
Merge remote-tracking branch 'systran_master/master'
MahmoudAshraf97 Jul 5, 2024
2fc6c50
.
MahmoudAshraf97 Jul 5, 2024
8bdbca0
rename `chunk_size` to `chunk_length` for consistency
MahmoudAshraf97 Jul 5, 2024
b94bd93
Merge branch 'master' into master
Jiltseb Jul 5, 2024
fec8c4e
Merge pull request #24 from MahmoudAshraf97/master
Jiltseb Jul 5, 2024
ad080cd
review comments
MahmoudAshraf97 Jul 5, 2024
aef5869
.
MahmoudAshraf97 Jul 5, 2024
9b39b73
fixing docstring
MahmoudAshraf97 Jul 5, 2024
1dcf0c9
Merge pull request #25 from MahmoudAshraf97/master
Jiltseb Jul 5, 2024
e988ac6
fix usage with english-only models
MahmoudAshraf97 Jul 6, 2024
b3c1ace
Merge pull request #26 from MahmoudAshraf97/master
Jiltseb Jul 8, 2024
c51b877
added licensing comments inthe doc and the code
Jiltseb Jul 10, 2024
7a90ab8
Merge pull request #27 from mobiusml/fw_changes
Jiltseb Jul 10, 2024
3fd6f7c
added formatting checks
Jiltseb Jul 10, 2024
6a87d85
Merge pull request #28 from mobiusml/fw_changes
Jiltseb Jul 10, 2024
4681caa
update license info
Jiltseb Jul 11, 2024
62bb5f0
Merge pull request #29 from mobiusml/fw_changes
Jiltseb Jul 11, 2024
bb6696b
.
MahmoudAshraf97 Oct 2, 2024
5e6a426
remove duplicate `detect_language` function
MahmoudAshraf97 Oct 2, 2024
3ffb18f
Merge pull request #22 from MahmoudAshraf97/master
Jiltseb Jul 2, 2024
31 changes: 30 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
[![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper)

# Faster Whisper transcription with CTranslate2
# Mobius Faster Whisper transcription with CTranslate2

**faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.

@@ -166,6 +166,35 @@ for segment in segments:
segments, _ = model.transcribe("audio.mp3")
segments = list(segments) # The transcription will actually run here.
```

### Multi-segment language detection

To use the model directly for improved language detection, the following code snippet can be used:

```python
from faster_whisper import WhisperModel
model = WhisperModel("medium", device="cuda", compute_type="float16")
language_info = model.detect_language_multi_segment("audio.mp3")
```
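For intuition, multi-segment detection aggregates language probabilities across several audio windows instead of trusting the first 30-second window alone. A minimal numpy sketch of that idea — the segment probabilities and language list below are invented for illustration and this is not the library's internal code:

```python
import numpy as np

# Hypothetical per-segment language probabilities from three audio windows.
# Rows: segments; columns: candidate languages.
languages = ["en", "de", "nl"]
segment_probs = np.array([
    [0.70, 0.20, 0.10],  # segment 1 leans English
    [0.40, 0.15, 0.45],  # segment 2 is ambiguous
    [0.65, 0.10, 0.25],  # segment 3 leans English
])

# Averaging across segments is more robust than using any single window,
# where a stretch of music or code-switching could flip the decision.
mean_probs = segment_probs.mean(axis=0)
detected = languages[int(np.argmax(mean_probs))]
print(detected, mean_probs.round(3))  # → en [0.583 0.15  0.267]
```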

### Batched faster-whisper


The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-4-Clause license. This product includes software developed by Max Bain. We modified that implementation and added kaldi-based feature extraction. It transcribes semantically meaningful audio chunks as batches, improving speed by up to 10-12x over the OpenAI implementation and 3-4x over the sequential faster-whisper version.

The following code snippet illustrates how to run batched inference on an example audio file. Please also refer to the test scripts for batched faster-whisper.

```python
from faster_whisper import WhisperModel, BatchedInferencePipeline

model = WhisperModel("medium", device="cuda", compute_type="float16")
batched_model = BatchedInferencePipeline(model=model)
segments, info = batched_model.transcribe("audio.mp3", batch_size=16)

for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
```
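The chunk-and-batch idea behind the speedup can be sketched in plain numpy: voice-activity segments are cut out of the waveform, padded to a fixed chunk length, and stacked so one forward pass handles many segments. The sample rate, segment boundaries, and chunk length below are invented for illustration and do not reflect the library's internals:

```python
import numpy as np

SAMPLE_RATE = 16000
CHUNK_SECONDS = 30
chunk_len = SAMPLE_RATE * CHUNK_SECONDS

# One minute of fake audio and some hypothetical VAD segments (start, end) in seconds.
audio = np.random.randn(60 * SAMPLE_RATE).astype(np.float32)
vad_segments = [(0.5, 12.0), (15.2, 29.8), (31.0, 55.5)]

def to_batch(audio, segments, chunk_len):
    """Cut VAD segments out of the waveform and zero-pad each to chunk_len samples."""
    batch = []
    for start, end in segments:
        chunk = audio[int(start * SAMPLE_RATE): int(end * SAMPLE_RATE)]
        padded = np.zeros(chunk_len, dtype=audio.dtype)
        padded[: len(chunk)] = chunk[:chunk_len]
        batch.append(padded)
    return np.stack(batch)  # shape: (num_segments, chunk_len)

batch = to_batch(audio, vad_segments, chunk_len)
print(batch.shape)  # → (3, 480000)
```

A real pipeline would feed this stacked batch through the encoder in one call, which is where the throughput gain over sequential chunk-by-chunk decoding comes from.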

### Faster Distil-Whisper

The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3)
3 changes: 2 additions & 1 deletion faster_whisper/__init__.py
@@ -1,12 +1,13 @@
from faster_whisper.audio import decode_audio
from faster_whisper.transcribe import WhisperModel
from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel
from faster_whisper.utils import available_models, download_model, format_timestamp
from faster_whisper.version import __version__

__all__ = [
    "available_models",
    "decode_audio",
    "WhisperModel",
    "BatchedInferencePipeline",
    "download_model",
    "format_timestamp",
    "__version__",
55 changes: 41 additions & 14 deletions faster_whisper/feature_extractor.py
@@ -1,4 +1,6 @@
import numpy as np
import torch
import torchaudio.compliance.kaldi as ta_kaldi


# Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py # noqa: E501
@@ -21,6 +23,7 @@ def __init__(
        self.mel_filters = self.get_mel_filters(
            sampling_rate, n_fft, n_mels=feature_size
        )
        self.n_mels = feature_size

    def get_mel_filters(self, sr, n_fft, n_mels=128, dtype=np.float32):
        # Initialize the weights
@@ -142,29 +145,53 @@ def stft(self, frames, window):
            data[f] = np.fft.fft(fft_signal, axis=0)[:num_fft_bins]
        return data.T

    def __call__(self, waveform, padding=True, chunk_length=None):
    def __call__(self, waveform, enable_ta=False, padding=True, chunk_length=None):
        """
        Compute the log-Mel spectrogram of the provided audio. This gives results
        similar to whisper's original torch implementation, with 1e-5 tolerance.
        Additionally, a faster feature extraction option using kaldi fbank features
        is available if torchaudio is installed.
        """
        if enable_ta:
            waveform = waveform.astype(np.float32)

        if chunk_length is not None:
            self.n_samples = chunk_length * self.sampling_rate
            self.nb_max_frames = self.n_samples // self.hop_length

        if padding:
            waveform = np.pad(waveform, [(0, self.n_samples)])

        window = np.hanning(self.n_fft + 1)[:-1]

        frames = self.fram_wave(waveform)
        stft = self.stft(frames, window=window)
        magnitudes = np.abs(stft[:, :-1]) ** 2

        filters = self.mel_filters
        mel_spec = filters @ magnitudes

        log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None))
        log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
        log_spec = (log_spec + 4.0) / 4.0
        if enable_ta:
            audio = torch.from_numpy(waveform).unsqueeze(0)
            fbank = ta_kaldi.fbank(
                audio,
                sample_frequency=self.sampling_rate,
                window_type="hanning",
                num_mel_bins=self.n_mels,
            )
            log_spec = fbank.numpy().T.astype(np.float32)  # CTranslate2 does not accept float64

            # Normalize using AudioSet values as default mean and std for audio
            mean_val = -4.2677393
            std_val = 4.5689974
            log_spec = (log_spec - mean_val) / (std_val * 2)

        else:
            window = np.hanning(self.n_fft + 1)[:-1]

            frames = self.fram_wave(waveform)
            stft = self.stft(frames, window=window)
            magnitudes = np.abs(stft[:, :-1]) ** 2

            filters = self.mel_filters
            mel_spec = filters @ magnitudes

            log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None))
            log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
            log_spec = (log_spec + 4.0) / 4.0

        return log_spec
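The two branches in this diff normalize differently: the classic Whisper path clips, takes log10, clamps the dynamic range to 8 decades, and maps the result toward [-1, 1] via `(log_spec + 4.0) / 4.0`, while the kaldi path standardizes with fixed AudioSet statistics. A self-contained numpy sketch of both normalizations applied to a dummy mel spectrogram (the input values are arbitrary; this only illustrates the arithmetic):

```python
import numpy as np

# Dummy mel power spectrogram (80 mel bins x 100 frames, arbitrary values).
mel_spec = np.abs(np.random.randn(80, 100)).astype(np.float32) ** 2

# Classic Whisper normalization, as in the numpy branch of the diff.
log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None))
log_spec = np.maximum(log_spec, log_spec.max() - 8.0)  # clamp to 8 decades below the peak
whisper_features = (log_spec + 4.0) / 4.0

# AudioSet-style standardization, as in the kaldi (enable_ta) branch.
mean_val, std_val = -4.2677393, 4.5689974
kaldi_features = (log_spec - mean_val) / (std_val * 2)

# After clamping, the Whisper features span at most 8 / 4 = 2 units.
print(float(whisper_features.max() - whisper_features.min()))
```

The clamp-then-scale step guarantees a bounded input range for the model regardless of recording loudness, which is why the kaldi path needs its own fixed-statistics normalization to land in a comparable range.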