huggingface · sgugger · Apr 8, 2022 · Apr 8, 2022
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
@@ -1792,7 +1792,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
 
  # load pt weights early so that we know which dtype to init the model under
  if from_pt:
- if not is_sharded:
+ if not is_sharded and state_dict is None:
  # Time to load the checkpoint
  state_dict = load_state_dict(resolved_archive_file)
  # set dtype to instantiate the model under: