mobiusml · Jiltseb · Dec 12, 2023 · Nov 24, 2023 · Nov 24, 2023 · Nov 27, 2023
diff --git a/README.md b/README.md
@@ -68,9 +68,9 @@ pip install nvidia-cublas-cu11 nvidia-cudnn-cu11
 export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
 ```
 
-#### Download the libraries from Purfview's repository (Windows only)
+#### Download the libraries from Purfview's repository (Windows & Linux)
 
-Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`.
+Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows & Linux in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`.
 
 </details>
 
@@ -104,7 +104,7 @@ pip install --force-reinstall "faster-whisper @ https:/guillaumekln/
 ```python
 from faster_whisper import WhisperModel
 
-model_size = "large-v2"
+model_size = "large-v3"
 
 # Run on GPU with FP16
 model = WhisperModel(model_size, device="cuda", compute_type="float16")
@@ -185,17 +185,17 @@ Here is a non exhaustive list of open-source projects using faster-whisper. Feel
 
 ## Model conversion
 
-When loading a model from its size such as `WhisperModel("large-v2")`, the correspondig CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/guillaumekln).
+When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
 
 We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
 
-For example the command below converts the [original "large-v2" Whisper model](https://huggingface.co/openai/whisper-large-v2) and saves the weights in FP16:
+For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
 
 ```bash
 pip install "transformers[torch]>=4.23"
 
-ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
- --copy_files tokenizer.json --quantization float16
+ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
+--copy_files tokenizer.json preprocessor_config.json --quantization float16
 ```
 
 * The option `--model` accepts a model name on the Hub or a path to a model directory.
@@ -207,12 +207,12 @@ Models can also be converted from the code. See the [conversion API](https://ope
 
 1. Directly load the model from a local directory:
 ```python
-model = faster_whisper.WhisperModel("whisper-large-v2-ct2")
+model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
 ```
 
 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
 ```python
-model = faster_whisper.WhisperModel("username/whisper-large-v2-ct2")
+model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
 ```
 
 ## Comparing performance against other implementations

diff --git a/faster_whisper/audio.py b/faster_whisper/audio.py
@@ -43,7 +43,7 @@ def decode_audio(
  raw_buffer = io.BytesIO()
  dtype = None
 
- with av.open(input_file, metadata_errors="ignore") as container:
+ with av.open(input_file, mode="r", metadata_errors="ignore") as container:
  frames = container.decode(audio=0)
  frames = _ignore_invalid_frames(frames)
  frames = _group_frames(frames, 500000)

diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py
@@ -108,7 +108,7 @@ def decode_with_timestamps(self, tokens: List[int]) -> str:
  def split_to_word_tokens(
  self, tokens: List[int]
  ) -> Tuple[List[str], List[List[int]]]:
- if self.language_code in {"zh", "ja", "th", "lo", "my"}:
+ if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
  # These languages don't typically use spaces, so it is difficult to split words
  # without morpheme analysis. Here, we instead split words at any
  # position where the tokens are decoded as valid unicode points
@@ -274,4 +274,5 @@ def split_tokens_on_spaces(
  "yi",
  "yo",
  "zh",
+ "yue",
 )
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
@@ -1,8 +1,10 @@
 import itertools
+import json
 import logging
 import os
 import zlib
 
+from inspect import signature
 from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
 
 import ctranslate2
@@ -92,8 +94,8 @@ def __init__(
 
  Args:
  model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
- small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted
- model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub.
+ small, small.en, medium, medium.en, large-v1, large-v2, large-v3, or large), a path to a
+ converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub.
  When a size or a model ID is configured, the converted model is downloaded
  from the Hugging Face Hub.
  device: Device to use for computation ("cpu", "cuda", "auto").
@@ -142,7 +144,8 @@ def __init__(
  "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
  )
 
- self.feature_extractor = FeatureExtractor()
+ self.feat_kwargs = self._get_feature_kwargs(model_path)
+ self.feature_extractor = FeatureExtractor(**self.feat_kwargs)
  self.num_samples_per_token = self.feature_extractor.hop_length * 2
  self.frames_per_second = (
  self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
@@ -159,6 +162,22 @@ def supported_languages(self) -> List[str]:
  """The languages supported by the model."""
  return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"]
 
+ def _get_feature_kwargs(self, model_path) -> dict:  # kwargs for FeatureExtractor, read from the model directory
+ preprocessor_config_file = os.path.join(model_path, "preprocessor_config.json")
+ config = {}  # stays empty when the file is absent or is not valid JSON
+ if os.path.isfile(preprocessor_config_file):
+ try:
+ with open(preprocessor_config_file, "r", encoding="utf-8") as json_file:
+ config = json.load(json_file)
+ valid_keys = signature(FeatureExtractor.__init__).parameters.keys()
+ config = {k: v for k, v in config.items() if k in valid_keys}  # drop keys that are not __init__ parameter names
+ except json.JSONDecodeError as e:  # malformed JSON is logged as a warning, not raised
+ self.logger.warning(
+ "Could not load preprocessor_config.json: %s", str(e)
+ )
+
+ return config
+
  def transcribe(
  self,
  audio: Union[str, BinaryIO, np.ndarray],

diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py
@@ -10,17 +10,18 @@
 from tqdm.auto import tqdm
 
 _MODELS = {
- "tiny.en": "guillaumekln/faster-whisper-tiny.en",
- "tiny": "guillaumekln/faster-whisper-tiny",
- "base.en": "guillaumekln/faster-whisper-base.en",
- "base": "guillaumekln/faster-whisper-base",
- "small.en": "guillaumekln/faster-whisper-small.en",
- "small": "guillaumekln/faster-whisper-small",
- "medium.en": "guillaumekln/faster-whisper-medium.en",
- "medium": "guillaumekln/faster-whisper-medium",
- "large-v1": "guillaumekln/faster-whisper-large-v1",
- "large-v2": "guillaumekln/faster-whisper-large-v2",
- "large": "guillaumekln/faster-whisper-large-v2",
+ "tiny.en": "Systran/faster-whisper-tiny.en",
+ "tiny": "Systran/faster-whisper-tiny",
+ "base.en": "Systran/faster-whisper-base.en",
+ "base": "Systran/faster-whisper-base",
+ "small.en": "Systran/faster-whisper-small.en",
+ "small": "Systran/faster-whisper-small",
+ "medium.en": "Systran/faster-whisper-medium.en",
+ "medium": "Systran/faster-whisper-medium",
+ "large-v1": "Systran/faster-whisper-large-v1",
+ "large-v2": "Systran/faster-whisper-large-v2",
+ "large-v3": "Systran/faster-whisper-large-v3",
+ "large": "Systran/faster-whisper-large-v3",
 }
 
 
@@ -50,8 +51,8 @@ def download_model(
  Args:
  size_or_id: Size of the model to download from https://huggingface.co/Systran
  (tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2,
- large), or a CTranslate2-converted model ID from the Hugging Face Hub
- (e.g. guillaumekln/faster-whisper-large-v2).
+ large-v3, large), or a CTranslate2-converted model ID from the Hugging Face Hub
+ (e.g. Systran/faster-whisper-large-v3).
  output_dir: Directory where the model should be saved. If not set, the model is saved in
  the cache directory.
  local_files_only: If True, avoid downloading the file and return the path to the local
@@ -76,6 +77,7 @@ def download_model(
 
  allow_patterns = [
  "config.json",
+ "preprocessor_config.json",
  "model.bin",
  "tokenizer.json",
  "vocabulary.*",

diff --git a/faster_whisper/version.py b/faster_whisper/version.py
@@ -1,3 +1,3 @@
 """Version information."""
 
-__version__ = "0.9.0"
+__version__ = "0.10.0"
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 av==10.*
-ctranslate2>=3.17,<4
+ctranslate2>=3.22,<4
 huggingface_hub>=0.13
-tokenizers>=0.13,<0.15
+tokenizers>=0.13,<0.16
 onnxruntime>=1.14,<2