diff --git a/project/configs/layer-weighting.cfg b/project/configs/layer-weighting.cfg index bcd5198..8b694d4 100644 --- a/project/configs/layer-weighting.cfg +++ b/project/configs/layer-weighting.cfg @@ -119,7 +119,7 @@ factory = "curated_transformer" all_layer_outputs = True [components.transformer.model] -@architectures = "spacy-curated-transformers.XlmrTransformer.v1" +@architectures = "spacy-curated-transformers.XlmrTransformer.v2" vocab_size = 250002 num_hidden_layers = 12 hidden_width = 768 diff --git a/project/configs/no-layer-weighting.cfg b/project/configs/no-layer-weighting.cfg index e5b677a..058d51b 100644 --- a/project/configs/no-layer-weighting.cfg +++ b/project/configs/no-layer-weighting.cfg @@ -98,7 +98,7 @@ upstream = "*" factory = "curated_transformer" [components.transformer.model] -@architectures = "spacy-curated-transformers.XlmrTransformer.v1" +@architectures = "spacy-curated-transformers.XlmrTransformer.v2" vocab_size = 250002 piece_encoder = {"@architectures": "spacy-curated-transformers.XlmrSentencepieceEncoder.v1"} diff --git a/requirements.txt b/requirements.txt index 1d37378..dd3e0af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -curated-transformers>=2.0.0.dev1,<3.0.0 +curated-transformers>=2.0.0.dev2,<3.0.0 curated-tokenizers>=0.9.2,<1.0.0 fsspec>=2023.5.0 spacy>=4.0.0.dev2,<5.0.0 diff --git a/setup.cfg b/setup.cfg index 7f69da5..7cdeffd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -14,7 +14,7 @@ zip_safe = true include_package_data = true python_requires = >=3.8 install_requires = - curated-transformers>=2.0.0.dev1,<3.0.0 + curated-transformers>=2.0.0.dev2,<3.0.0 curated-tokenizers>=0.9.2,<1.0.0 fsspec>=2023.5.0 spacy>=4.0.0.dev2,<5.0.0 @@ -27,10 +27,15 @@ spacy_factories = spacy_architectures = spacy-curated-transformers.AlbertTransformer.v1 = spacy_curated_transformers.models:build_albert_transformer_model_v1 + spacy-curated-transformers.AlbertTransformer.v2 = 
spacy_curated_transformers.models:build_albert_transformer_model_v2 spacy-curated-transformers.BertTransformer.v1 = spacy_curated_transformers.models:build_bert_transformer_model_v1 + spacy-curated-transformers.BertTransformer.v2 = spacy_curated_transformers.models:build_bert_transformer_model_v2 spacy-curated-transformers.CamembertTransformer.v1 = spacy_curated_transformers.models:build_camembert_transformer_model_v1 + spacy-curated-transformers.CamembertTransformer.v2 = spacy_curated_transformers.models:build_camembert_transformer_model_v2 spacy-curated-transformers.RobertaTransformer.v1 = spacy_curated_transformers.models:build_roberta_transformer_model_v1 + spacy-curated-transformers.RobertaTransformer.v2 = spacy_curated_transformers.models:build_roberta_transformer_model_v2 spacy-curated-transformers.XlmrTransformer.v1 = spacy_curated_transformers.models:build_xlmr_transformer_model_v1 + spacy-curated-transformers.XlmrTransformer.v2 = spacy_curated_transformers.models:build_xlmr_transformer_model_v2 spacy-curated-transformers.WithStridedSpans.v1 = spacy_curated_transformers.models:build_with_strided_spans_v1 spacy-curated-transformers.ScalarWeight.v1 = spacy_curated_transformers.models:build_scalar_weight_v1 spacy-curated-transformers.TransformerLayersListener.v1 = spacy_curated_transformers.models.listeners:build_transformer_layers_listener_v1 diff --git a/spacy_curated_transformers/cli/fill_config_transformer.py b/spacy_curated_transformers/cli/fill_config_transformer.py index d8c58e4..7122fd1 100644 --- a/spacy_curated_transformers/cli/fill_config_transformer.py +++ b/spacy_curated_transformers/cli/fill_config_transformer.py @@ -67,12 +67,14 @@ def init_fill_curated_transformer_cli( class HfParamSource(Enum): MODEL_CONFIG = 1 - TOKENIZER_CONFIG = 2 + MODEL_CONFIG_OPTIONAL = 2 + TOKENIZER_CONFIG = 3 # Entrypoint parameters that are common to all curated transformer models. 
COMMON_ENTRYPOINT_PARAMS: Dict[str, HfParamSource] = { "attention_probs_dropout_prob": HfParamSource.MODEL_CONFIG, + "dtype": HfParamSource.MODEL_CONFIG_OPTIONAL, "hidden_act": HfParamSource.MODEL_CONFIG, "hidden_dropout_prob": HfParamSource.MODEL_CONFIG, "hidden_width": HfParamSource.MODEL_CONFIG, @@ -99,6 +101,7 @@ class HfParamSource(Enum): "intermediate_width": "intermediate_size", "padding_idx": "pad_token_id", "embedding_width": "embedding_size", + "dtype": "torch_dtype", } @@ -328,9 +331,9 @@ def _fill_parameters( filled_params = {} for param_name, source in params_to_fill.items(): hf_key = ENTRYPOINT_PARAMS_TO_HF_CONFIG_KEYS.get(param_name, param_name) - if source == HfParamSource.MODEL_CONFIG: + if source in (HfParamSource.MODEL_CONFIG, HfParamSource.MODEL_CONFIG_OPTIONAL): value = hf_config.get(hf_key) - if value is None: + if value is None and source == HfParamSource.MODEL_CONFIG: msg.fail( f"Hugging Face model config has a missing key '{hf_key}'", exits=1 ) @@ -341,8 +344,9 @@ def _fill_parameters( f"Hugging Face tokenizer config has a missing key '{hf_key}'", exits=1, ) - assert value is not None - filled_params[param_name] = value + assert value is not None or source == HfParamSource.MODEL_CONFIG_OPTIONAL + if value is not None: + filled_params[param_name] = value msg.info(title="Filled-in model parameters:") msg.table(filled_params) diff --git a/spacy_curated_transformers/models/__init__.py b/spacy_curated_transformers/models/__init__.py index e7a9a69..0be2485 100644 --- a/spacy_curated_transformers/models/__init__.py +++ b/spacy_curated_transformers/models/__init__.py @@ -1,10 +1,15 @@ from .architectures import ( build_albert_transformer_model_v1, + build_albert_transformer_model_v2, build_bert_transformer_model_v1, + build_bert_transformer_model_v2, build_camembert_transformer_model_v1, + build_camembert_transformer_model_v2, build_pytorch_checkpoint_loader_v1, build_roberta_transformer_model_v1, + build_roberta_transformer_model_v2, 
build_xlmr_transformer_model_v1, + build_xlmr_transformer_model_v2, ) from .hf_loader import build_hf_transformer_encoder_loader_v1 from .scalar_weight import build_scalar_weight_v1 diff --git a/spacy_curated_transformers/models/architectures.py b/spacy_curated_transformers/models/architectures.py index 04c2862..649e7e9 100644 --- a/spacy_curated_transformers/models/architectures.py +++ b/spacy_curated_transformers/models/architectures.py @@ -131,7 +131,111 @@ def build_albert_transformer_model_v1( Optional listener to wrap. Only used when replacing listeners in downstream components. """ + return build_albert_transformer_model_v2( + vocab_size=vocab_size, + with_spans=with_spans, + piece_encoder=piece_encoder, + attention_probs_dropout_prob=attention_probs_dropout_prob, + embedding_width=embedding_width, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + hidden_width=hidden_width, + intermediate_width=intermediate_width, + layer_norm_eps=layer_norm_eps, + max_position_embeddings=max_position_embeddings, + model_max_length=model_max_length, + num_attention_heads=num_attention_heads, + num_hidden_groups=num_hidden_groups, + num_hidden_layers=num_hidden_layers, + padding_idx=padding_idx, + type_vocab_size=type_vocab_size, + torchscript=torchscript, + mixed_precision=mixed_precision, + grad_scaler_config=grad_scaler_config, + wrapped_listener=wrapped_listener, + ) + + +def build_albert_transformer_model_v2( + *, + vocab_size: int, + with_spans: Callable[ + [TorchTransformerModelT], + SpanExtractorModelT, + ], + piece_encoder: Tok2PiecesModelT, + attention_probs_dropout_prob: float = 0.0, + dtype: str = "float32", + embedding_width: int = 128, + hidden_act: str = "gelu_new", + hidden_dropout_prob: float = 0.0, + hidden_width: int = 768, + intermediate_width: int = 3072, + layer_norm_eps: float = 1e-12, + max_position_embeddings: int = 512, + model_max_length: int = 512, + num_attention_heads: int = 12, + num_hidden_groups: int = 1, + 
num_hidden_layers: int = 12, + padding_idx: int = 0, + type_vocab_size: int = 2, + torchscript: bool = False, + mixed_precision: bool = False, + grad_scaler_config: dict = SimpleFrozenDict(), + wrapped_listener: Optional[TransformerListenerModelT] = None, +) -> Union[TransformerModelT, WrappedTransformerAndListenerModelT]: + """Construct an ALBERT transformer model. + + vocab_size (int): + Vocabulary size. + with_spans (Callable): + Callback that constructs a span generator model. + piece_encoder (Model): + The piece encoder to segment input tokens. + attention_probs_dropout_prob (float): + Dropout probability of the self-attention layers. + dtype (str): + Torch parameter data type. + embedding_width (int): + Width of the embedding representations. + hidden_act (str): + Activation used by the point-wise feed-forward layers. + hidden_dropout_prob (float): + Dropout probability of the point-wise feed-forward and + embedding layers. + hidden_width (int): + Width of the final representations. + intermediate_width (int): + Width of the intermediate projection layer in the + point-wise feed-forward layer. + layer_norm_eps (float): + Epsilon for layer normalization. + max_position_embeddings (int): + Maximum length of position embeddings. + model_max_length (int): + Maximum length of model inputs. + num_attention_heads (int): + Number of self-attention heads. + num_hidden_groups (int): + Number of layer groups whose constituents share parameters. + num_hidden_layers (int): + Number of hidden layers. + padding_idx (int): + Index of the padding meta-token. + type_vocab_size (int): + Type vocabulary size. + torchscript (bool): + Set to `True` when loading TorchScript models, `False` otherwise. + mixed_precision (bool): + Use mixed-precision training. + grad_scaler_config (dict): + Configuration passed to the PyTorch gradient scaler. + wrapped_listener (Optional[TransformerListenerModelT]): + Optional listener to wrap. 
Only used when replacing listeners + in downstream components. + """ config = ALBERTConfig( + dtype=_torch_dtype_from_str(dtype), embedding_width=embedding_width, hidden_width=hidden_width, intermediate_width=intermediate_width, @@ -240,7 +344,103 @@ def build_bert_transformer_model_v1( Optional listener to wrap. Only used when replacing listeners in downstream components. """ + return build_bert_transformer_model_v2( + vocab_size=vocab_size, + with_spans=with_spans, + piece_encoder=piece_encoder, + attention_probs_dropout_prob=attention_probs_dropout_prob, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + hidden_width=hidden_width, + intermediate_width=intermediate_width, + layer_norm_eps=layer_norm_eps, + max_position_embeddings=max_position_embeddings, + model_max_length=model_max_length, + num_attention_heads=num_attention_heads, + num_hidden_layers=num_hidden_layers, + padding_idx=padding_idx, + type_vocab_size=type_vocab_size, + torchscript=torchscript, + mixed_precision=mixed_precision, + grad_scaler_config=grad_scaler_config, + wrapped_listener=wrapped_listener, + ) + + +def build_bert_transformer_model_v2( + *, + vocab_size: int, + with_spans: Callable[ + [TorchTransformerModelT], + SpanExtractorModelT, + ], + piece_encoder: Tok2PiecesModelT, + attention_probs_dropout_prob: float = 0.1, + dtype: str = "float32", + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + hidden_width: int = 768, + intermediate_width: int = 3072, + layer_norm_eps: float = 1e-12, + max_position_embeddings: int = 512, + model_max_length: int = 512, + num_attention_heads: int = 12, + num_hidden_layers: int = 12, + padding_idx: int = 0, + type_vocab_size: int = 2, + torchscript: bool = False, + mixed_precision: bool = False, + grad_scaler_config: dict = SimpleFrozenDict(), + wrapped_listener: Optional[TransformerListenerModelT] = None, +) -> Union[TransformerModelT, WrappedTransformerAndListenerModelT]: + """Construct a BERT transformer model. 
+ + vocab_size (int): + Vocabulary size. + with_spans (Callable): + Callback that constructs a span generator model. + piece_encoder (Model): + The piece encoder to segment input tokens. + attention_probs_dropout_prob (float): + Dropout probability of the self-attention layers. + dtype (str): + Data type of the model parameters. + hidden_act (str): + Activation used by the point-wise feed-forward layers. + hidden_dropout_prob (float): + Dropout probability of the point-wise feed-forward and + embedding layers. + hidden_width (int): + Width of the final representations. + intermediate_width (int): + Width of the intermediate projection layer in the + point-wise feed-forward layer. + layer_norm_eps (float): + Epsilon for layer normalization. + max_position_embeddings (int): + Maximum length of position embeddings. + model_max_length (int): + Maximum length of model inputs. + num_attention_heads (int): + Number of self-attention heads. + num_hidden_layers (int): + Number of hidden layers. + padding_idx (int): + Index of the padding meta-token. + type_vocab_size (int): + Type vocabulary size. + torchscript (bool): + Set to `True` when loading TorchScript models, `False` otherwise. + mixed_precision (bool): + Use mixed-precision training. + grad_scaler_config (dict): + Configuration passed to the PyTorch gradient scaler. + wrapped_listener (Optional[TransformerListenerModelT]): + Optional listener to wrap. Only used when replacing listeners + in downstream components. + """ config = BERTConfig( + dtype=_torch_dtype_from_str(dtype), embedding_width=hidden_width, hidden_width=hidden_width, intermediate_width=intermediate_width, @@ -348,7 +548,103 @@ def build_camembert_transformer_model_v1( Optional listener to wrap. Only used when replacing listeners in downstream components. 
""" + return build_camembert_transformer_model_v2( + vocab_size=vocab_size, + with_spans=with_spans, + piece_encoder=piece_encoder, + attention_probs_dropout_prob=attention_probs_dropout_prob, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + hidden_width=hidden_width, + intermediate_width=intermediate_width, + layer_norm_eps=layer_norm_eps, + max_position_embeddings=max_position_embeddings, + model_max_length=model_max_length, + num_attention_heads=num_attention_heads, + num_hidden_layers=num_hidden_layers, + padding_idx=padding_idx, + type_vocab_size=type_vocab_size, + mixed_precision=mixed_precision, + torchscript=torchscript, + grad_scaler_config=grad_scaler_config, + wrapped_listener=wrapped_listener, + ) + + +def build_camembert_transformer_model_v2( + *, + vocab_size: int, + with_spans: Callable[ + [TorchTransformerModelT], + SpanExtractorModelT, + ], + piece_encoder: Tok2PiecesModelT, + attention_probs_dropout_prob: float = 0.1, + dtype: str = "float32", + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + hidden_width: int = 768, + intermediate_width: int = 3072, + layer_norm_eps: float = 1e-5, + max_position_embeddings: int = 514, + model_max_length: int = 512, + num_attention_heads: int = 12, + num_hidden_layers: int = 12, + padding_idx: int = 1, + type_vocab_size: int = 1, + mixed_precision: bool = False, + torchscript=False, + grad_scaler_config: dict = SimpleFrozenDict(), + wrapped_listener: Optional[TransformerListenerModelT] = None, +) -> Union[TransformerModelT, WrappedTransformerAndListenerModelT]: + """Construct a CamemBERT transformer model. + + vocab_size (int): + Vocabulary size. + with_spans (Callable): + Callback that constructs a span generator model. + piece_encoder (Model): + The piece encoder to segment input tokens. + attention_probs_dropout_prob (float): + Dropout probability of the self-attention layers. + dtype (str): + Data type of the model parameters. 
+ hidden_act (str): + Activation used by the point-wise feed-forward layers. + hidden_dropout_prob (float): + Dropout probability of the point-wise feed-forward and + embedding layers. + hidden_width (int): + Width of the final representations. + intermediate_width (int): + Width of the intermediate projection layer in the + point-wise feed-forward layer. + layer_norm_eps (float): + Epsilon for layer normalization. + max_position_embeddings (int): + Maximum length of position embeddings. + model_max_length (int): + Maximum length of model inputs. + num_attention_heads (int): + Number of self-attention heads. + num_hidden_layers (int): + Number of hidden layers. + padding_idx (int): + Index of the padding meta-token. + type_vocab_size (int): + Type vocabulary size. + torchscript (bool): + Set to `True` when loading TorchScript models, `False` otherwise. + mixed_precision (bool): + Use mixed-precision training. + grad_scaler_config (dict): + Configuration passed to the PyTorch gradient scaler. + wrapped_listener (Optional[TransformerListenerModelT]): + Optional listener to wrap. Only used when replacing listeners + in downstream components. + """ config = RoBERTaConfig( + dtype=_torch_dtype_from_str(dtype), embedding_width=hidden_width, hidden_width=hidden_width, intermediate_width=intermediate_width, @@ -456,7 +752,103 @@ def build_roberta_transformer_model_v1( Optional listener to wrap. Only used when replacing listeners in downstream components. 
""" + return build_roberta_transformer_model_v2( + vocab_size=vocab_size, + with_spans=with_spans, + piece_encoder=piece_encoder, + attention_probs_dropout_prob=attention_probs_dropout_prob, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + hidden_width=hidden_width, + intermediate_width=intermediate_width, + layer_norm_eps=layer_norm_eps, + max_position_embeddings=max_position_embeddings, + model_max_length=model_max_length, + num_attention_heads=num_attention_heads, + num_hidden_layers=num_hidden_layers, + padding_idx=padding_idx, + type_vocab_size=type_vocab_size, + torchscript=torchscript, + mixed_precision=mixed_precision, + grad_scaler_config=grad_scaler_config, + wrapped_listener=wrapped_listener, + ) + + +def build_roberta_transformer_model_v2( + *, + vocab_size: int, + with_spans: Callable[ + [TorchTransformerModelT], + SpanExtractorModelT, + ], + piece_encoder: Tok2PiecesModelT, + attention_probs_dropout_prob: float = 0.1, + dtype: str = "float32", + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + hidden_width: int = 768, + intermediate_width: int = 3072, + layer_norm_eps: float = 1e-5, + max_position_embeddings: int = 514, + model_max_length: int = 512, + num_attention_heads: int = 12, + num_hidden_layers: int = 12, + padding_idx: int = 1, + type_vocab_size: int = 1, + torchscript: bool = False, + mixed_precision: bool = False, + grad_scaler_config: dict = SimpleFrozenDict(), + wrapped_listener: Optional[TransformerListenerModelT] = None, +) -> Union[TransformerModelT, WrappedTransformerAndListenerModelT]: + """Construct a RoBERTa transformer model. + + vocab_size (int): + Vocabulary size. + with_spans (Callable): + Callback that constructs a span generator model. + piece_encoder (Model): + The piece encoder to segment input tokens. + attention_probs_dropout_prob (float): + Dropout probability of the self-attention layers. + dtype (str): + Data type of the model parameters. 
+ hidden_act (str): + Activation used by the point-wise feed-forward layers. + hidden_dropout_prob (float): + Dropout probability of the point-wise feed-forward and + embedding layers. + hidden_width (int): + Width of the final representations. + intermediate_width (int): + Width of the intermediate projection layer in the + point-wise feed-forward layer. + layer_norm_eps (float): + Epsilon for layer normalization. + max_position_embeddings (int): + Maximum length of position embeddings. + model_max_length (int): + Maximum length of model inputs. + num_attention_heads (int): + Number of self-attention heads. + num_hidden_layers (int): + Number of hidden layers. + padding_idx (int): + Index of the padding meta-token. + type_vocab_size (int): + Type vocabulary size. + torchscript (bool): + Set to `True` when loading TorchScript models, `False` otherwise. + mixed_precision (bool): + Use mixed-precision training. + grad_scaler_config (dict): + Configuration passed to the PyTorch gradient scaler. + wrapped_listener (Optional[TransformerListenerModelT]): + Optional listener to wrap. Only used when replacing listeners + in downstream components. + """ config = RoBERTaConfig( + dtype=_torch_dtype_from_str(dtype), embedding_width=hidden_width, hidden_width=hidden_width, intermediate_width=intermediate_width, @@ -564,7 +956,103 @@ def build_xlmr_transformer_model_v1( Optional listener to wrap. Only used when replacing listeners in downstream components. 
""" + return build_xlmr_transformer_model_v2( + vocab_size=vocab_size, + with_spans=with_spans, + piece_encoder=piece_encoder, + attention_probs_dropout_prob=attention_probs_dropout_prob, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + hidden_width=hidden_width, + intermediate_width=intermediate_width, + layer_norm_eps=layer_norm_eps, + max_position_embeddings=max_position_embeddings, + model_max_length=model_max_length, + num_attention_heads=num_attention_heads, + num_hidden_layers=num_hidden_layers, + padding_idx=padding_idx, + type_vocab_size=type_vocab_size, + torchscript=torchscript, + mixed_precision=mixed_precision, + grad_scaler_config=grad_scaler_config, + wrapped_listener=wrapped_listener, + ) + + +def build_xlmr_transformer_model_v2( + *, + vocab_size: int, + with_spans: Callable[ + [TorchTransformerModelT], + SpanExtractorModelT, + ], + piece_encoder: Tok2PiecesModelT, + attention_probs_dropout_prob: float = 0.1, + dtype: str = "float32", + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + hidden_width: int = 768, + intermediate_width: int = 3072, + layer_norm_eps: float = 1e-5, + max_position_embeddings: int = 514, + model_max_length: int = 512, + num_attention_heads: int = 12, + num_hidden_layers: int = 12, + padding_idx: int = 1, + type_vocab_size: int = 1, + torchscript: bool = False, + mixed_precision: bool = False, + grad_scaler_config: dict = SimpleFrozenDict(), + wrapped_listener: Optional[TransformerListenerModelT] = None, +) -> Union[TransformerModelT, WrappedTransformerAndListenerModelT]: + """Construct an XLM-RoBERTa transformer model. + + vocab_size (int): + Vocabulary size. + with_spans (Callable): + Callback that constructs a span generator model. + piece_encoder (Model): + The piece encoder to segment input tokens. + attention_probs_dropout_prob (float): + Dropout probability of the self-attention layers. + dtype (str): + Data type of the model parameters. 
+ hidden_act (str): + Activation used by the point-wise feed-forward layers. + hidden_dropout_prob (float): + Dropout probability of the point-wise feed-forward and + embedding layers. + hidden_width (int): + Width of the final representations. + intermediate_width (int): + Width of the intermediate projection layer in the + point-wise feed-forward layer. + layer_norm_eps (float): + Epsilon for layer normalization. + max_position_embeddings (int): + Maximum length of position embeddings. + model_max_length (int): + Maximum length of model inputs. + num_attention_heads (int): + Number of self-attention heads. + num_hidden_layers (int): + Number of hidden layers. + padding_idx (int): + Index of the padding meta-token. + type_vocab_size (int): + Type vocabulary size. + torchscript (bool): + Set to `True` when loading TorchScript models, `False` otherwise. + mixed_precision (bool): + Use mixed-precision training. + grad_scaler_config (dict): + Configuration passed to the PyTorch gradient scaler. + wrapped_listener (Optional[TransformerListenerModelT]): + Optional listener to wrap. Only used when replacing listeners + in downstream components. + """ config = RoBERTaConfig( + dtype=_torch_dtype_from_str(dtype), embedding_width=hidden_width, hidden_width=hidden_width, intermediate_width=intermediate_width, @@ -825,17 +1313,15 @@ def load(model, X=None, Y=None): device = get_torch_default_device() encoder = model.shims[0]._model assert isinstance(encoder, FromHFHub) - from_fsspec = type(encoder).from_fsspec - - # We can discard the previously initialized model entirely - # and use the Curated Transformers API to load it from the - # hub. 
- model.shims[0]._model = None - del encoder - fs = LocalFileSystem() - encoder = from_fsspec(fs=fs, model_path=path, device=device) - model.shims[0]._model = encoder + encoder.from_fsspec_(fs=fs, model_path=path, device=device) return model return load + + +def _torch_dtype_from_str(dtype_as_str: str): + dtype = getattr(torch, dtype_as_str, None) + if not isinstance(dtype, torch.dtype): + raise ValueError(f"Invalid torch dtype `{dtype_as_str}`") + return dtype diff --git a/spacy_curated_transformers/models/hf_loader.py b/spacy_curated_transformers/models/hf_loader.py index da1fa61..3f58d2e 100644 --- a/spacy_curated_transformers/models/hf_loader.py +++ b/spacy_curated_transformers/models/hf_loader.py @@ -27,15 +27,7 @@ def load(model, X=None, Y=None): encoder = model.shims[0]._model assert isinstance(encoder, FromHFHub) device = model.shims[0].device - from_hf_hub = type(encoder).from_hf_hub - - # We can discard the previously initialized model entirely - # and use the Curated Transformers API to load it from the - # hub. 
- model.shims[0]._model = None - del encoder - encoder = from_hf_hub(name=name, revision=revision, device=device) - model.shims[0]._model = encoder + encoder.from_hf_hub_(name=name, revision=revision, device=device) return model return load diff --git a/spacy_curated_transformers/pipeline/transformer.py b/spacy_curated_transformers/pipeline/transformer.py index 361536c..4e43d9f 100644 --- a/spacy_curated_transformers/pipeline/transformer.py +++ b/spacy_curated_transformers/pipeline/transformer.py @@ -29,7 +29,7 @@ [transformer] [transformer.model] - @architectures = "spacy-curated-transformers.XlmrTransformer.v1" + @architectures = "spacy-curated-transformers.XlmrTransformer.v2" vocab_size = 250002 [transformer.model.piece_encoder] diff --git a/spacy_curated_transformers/tests/models/test_listeners.py b/spacy_curated_transformers/tests/models/test_listeners.py index dc4f519..66e0fae 100644 --- a/spacy_curated_transformers/tests/models/test_listeners.py +++ b/spacy_curated_transformers/tests/models/test_listeners.py @@ -22,7 +22,7 @@ all_layer_outputs = True [components.transformer.model] - @architectures = "spacy-curated-transformers.BertTransformer.v1" + @architectures = "spacy-curated-transformers.BertTransformer.v2" vocab_size = 28996 num_hidden_layers = 2 hidden_width = 60 diff --git a/spacy_curated_transformers/tests/pipeline/test_transformer.py b/spacy_curated_transformers/tests/pipeline/test_transformer.py index cedc962..ac0818b 100644 --- a/spacy_curated_transformers/tests/pipeline/test_transformer.py +++ b/spacy_curated_transformers/tests/pipeline/test_transformer.py @@ -87,7 +87,7 @@ all_layer_outputs = False [components.transformer.model] - @architectures = "spacy-curated-transformers.BertTransformer.v1" + @architectures = "spacy-curated-transformers.BertTransformer.v2" vocab_size = 28996 num_hidden_layers = 1 hidden_width = 60 @@ -135,7 +135,7 @@ all_layer_outputs = True [components.transformer.model] - @architectures = 
"spacy-curated-transformers.BertTransformer.v1" + @architectures = "spacy-curated-transformers.BertTransformer.v2" vocab_size = 28996 num_hidden_layers = 1 hidden_width = 60 @@ -661,7 +661,7 @@ def test_replace_listeners(cfg_string, listener_name, listener_entrypoint): assert transformer.model.name == "transformer_model" assert ( nlp.config["components"]["transformer"]["model"]["@architectures"] - == "spacy-curated-transformers.BertTransformer.v1" + == "spacy-curated-transformers.BertTransformer.v2" ) assert ( nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"] @@ -688,7 +688,7 @@ def test_replace_listeners(cfg_string, listener_name, listener_entrypoint): assert tagger_tok2vec.layers[1].name == listener_name assert ( nlp.config["components"]["tagger"]["model"]["tok2vec"]["@architectures"] - == "spacy-curated-transformers.BertTransformer.v1" + == "spacy-curated-transformers.BertTransformer.v2" ) assert ( nlp.config["components"]["tagger"]["model"]["tok2vec"]["wrapped_listener"][ diff --git a/spacy_curated_transformers/tests/test_cli_app.py b/spacy_curated_transformers/tests/test_cli_app.py index e28d4bf..7f4152f 100644 --- a/spacy_curated_transformers/tests/test_cli_app.py +++ b/spacy_curated_transformers/tests/test_cli_app.py @@ -29,7 +29,7 @@ def test_debug_pieces(): [components.transformer] factory = "curated_transformer" [components.transformer.model] -@architectures = "spacy-curated-transformers.BertTransformer.v1" +@architectures = "spacy-curated-transformers.BertTransformer.v2" piece_encoder = {"@architectures":"spacy-curated-transformers.BertWordpieceEncoder.v1"} [components.transformer.model.with_spans] @architectures = "spacy-curated-transformers.WithStridedSpans.v1" @@ -46,7 +46,7 @@ def test_debug_pieces(): factory = "curated_transformer" [components.transformer.model] -@architectures = "spacy-curated-transformers.BertTransformer.v1" +@architectures = "spacy-curated-transformers.BertTransformer.v2" piece_encoder = 
{"@architectures":"spacy-curated-transformers.BertWordpieceEncoder.v1"} with_spans = {"@architectures":"spacy-curated-transformers.WithStridedSpans.v1"} attention_probs_dropout_prob = 0.1 @@ -72,15 +72,15 @@ def test_debug_pieces(): [initialize.components.transformer.encoder_loader] @model_loaders = "spacy-curated-transformers.HFTransformerEncoderLoader.v1" name = "hf-internal-testing/tiny-random-bert" -revision = "main" +revision = "8fc97e155588266e09c9f37d4a9608e1a65a279e" [initialize.components.transformer.piecer_loader] @model_loaders = "spacy-curated-transformers.HFPieceEncoderLoader.v1" name = "hf-internal-testing/tiny-random-bert" -revision = "main" +revision = "8fc97e155588266e09c9f37d4a9608e1a65a279e" """, -["--model-name", "hf-internal-testing/tiny-random-bert", "--model-revision", "main"], +["--model-name", "hf-internal-testing/tiny-random-bert", "--model-revision", "8fc97e155588266e09c9f37d4a9608e1a65a279e"], ), ( @@ -92,7 +92,7 @@ def test_debug_pieces(): [components.transformer] factory = "curated_transformer" [components.transformer.model] -@architectures = "spacy-curated-transformers.CamembertTransformer.v1" +@architectures = "spacy-curated-transformers.CamembertTransformer.v2" piece_encoder = {"@architectures":"spacy-curated-transformers.ByteBpeEncoder.v1"} [components.transformer.model.with_spans] @architectures = "spacy-curated-transformers.WithStridedSpans.v1" @@ -109,10 +109,11 @@ def test_debug_pieces(): factory = "curated_transformer" [components.transformer.model] -@architectures = "spacy-curated-transformers.CamembertTransformer.v1" +@architectures = "spacy-curated-transformers.CamembertTransformer.v2" piece_encoder = {"@architectures":"spacy-curated-transformers.ByteBpeEncoder.v1"} with_spans = {"@architectures":"spacy-curated-transformers.WithStridedSpans.v1"} attention_probs_dropout_prob = 0.1 +dtype = "float32" hidden_act = "gelu" hidden_dropout_prob = 0.1 hidden_width = 32 @@ -155,7 +156,7 @@ def test_debug_pieces(): 
[components.transformer] factory = "curated_transformer" [components.transformer.model] -@architectures = "spacy-curated-transformers.RobertaTransformer.v1" +@architectures = "spacy-curated-transformers.RobertaTransformer.v2" piece_encoder = {"@architectures":"spacy-curated-transformers.ByteBpeEncoder.v1"} [components.transformer.model.with_spans] @architectures = "spacy-curated-transformers.WithStridedSpans.v1" @@ -182,7 +183,7 @@ def test_debug_pieces(): factory = "curated_transformer" [components.transformer.model] -@architectures = "spacy-curated-transformers.RobertaTransformer.v1" +@architectures = "spacy-curated-transformers.RobertaTransformer.v2" piece_encoder = {"@architectures":"spacy-curated-transformers.ByteBpeEncoder.v1"} with_spans = {"@architectures":"spacy-curated-transformers.WithStridedSpans.v1"} attention_probs_dropout_prob = 0.1 @@ -208,15 +209,15 @@ def test_debug_pieces(): [initialize.components.transformer.encoder_loader] @model_loaders = "spacy-curated-transformers.HFTransformerEncoderLoader.v1" name = "hf-internal-testing/tiny-random-roberta" -revision = "main" +revision = "73def02fc9f13169a1ce21ad4602aae38d7cbd5a" [initialize.components.transformer.piecer_loader] @model_loaders = "spacy-curated-transformers.HFPieceEncoderLoader.v1" name = "hf-internal-testing/tiny-random-roberta" -revision = "main" +revision = "73def02fc9f13169a1ce21ad4602aae38d7cbd5a" """, -["--model-name", "hf-internal-testing/tiny-random-roberta", "--model-revision", "main"], +["--model-name", "hf-internal-testing/tiny-random-roberta", "--model-revision", "73def02fc9f13169a1ce21ad4602aae38d7cbd5a"], ), ( @@ -228,7 +229,7 @@ def test_debug_pieces(): [components.transformer] factory = "curated_transformer" [components.transformer.model] -@architectures = "spacy-curated-transformers.XlmrTransformer.v1" +@architectures = "spacy-curated-transformers.XlmrTransformer.v2" piece_encoder = {"@architectures":"spacy-curated-transformers.XlmrSentencepieceEncoder.v1"} 
[components.transformer.model.with_spans] @architectures = "spacy-curated-transformers.WithStridedSpans.v1" @@ -256,8 +257,9 @@ def test_debug_pieces(): factory = "curated_transformer" [components.transformer.model] -@architectures = "spacy-curated-transformers.XlmrTransformer.v1" +@architectures = "spacy-curated-transformers.XlmrTransformer.v2" attention_probs_dropout_prob = 0.1 +dtype = "float32" hidden_act = "gelu" hidden_dropout_prob = 0.1 hidden_width = 32 diff --git a/spacy_curated_transformers/tests/test_registry.py b/spacy_curated_transformers/tests/test_registry.py index da67ff8..edd7f9d 100644 --- a/spacy_curated_transformers/tests/test_registry.py +++ b/spacy_curated_transformers/tests/test_registry.py @@ -8,10 +8,15 @@ "model_name", [ "spacy-curated-transformers.AlbertTransformer.v1", + "spacy-curated-transformers.AlbertTransformer.v2", "spacy-curated-transformers.BertTransformer.v1", + "spacy-curated-transformers.BertTransformer.v2", "spacy-curated-transformers.CamembertTransformer.v1", + "spacy-curated-transformers.CamembertTransformer.v2", "spacy-curated-transformers.RobertaTransformer.v1", + "spacy-curated-transformers.RobertaTransformer.v2", "spacy-curated-transformers.XlmrTransformer.v1", + "spacy-curated-transformers.XlmrTransformer.v2", "spacy-curated-transformers.WithStridedSpans.v1", "spacy-curated-transformers.ScalarWeight.v1", "spacy-curated-transformers.TransformerLayersListener.v1",