Merge main into v4 #31

Merged 3 commits on Feb 7, 2024
requirements.txt (2 changes: 1 addition & 1 deletion)

@@ -1,5 +1,5 @@
 curated-transformers>=0.1.0,<0.2.0
-curated-tokenizers>=0.0.8,<0.1.0
+curated-tokenizers>=0.0.9,<0.1.0
 spacy>=4.0.0.dev2,<5.0.0
 thinc>=9.0.0.dev4,<9.1.0
 srsly
setup.cfg (4 changes: 2 additions & 2 deletions)

@@ -1,5 +1,5 @@
 [metadata]
-version = 0.2.1
+version = 0.2.2
 description = Curated transformer models for spaCy pipelines
 url = https://github.com/explosion/spacy-curated-transformers
 author = Explosion
@@ -15,7 +15,7 @@ include_package_data = true
 python_requires = >=3.6
 install_requires =
     curated-transformers>=0.1.0,<0.2.0
-    curated-tokenizers>=0.0.7,<0.1.0
+    curated-tokenizers>=0.0.9,<0.1.0
     spacy>=4.0.0.dev2,<5.0.0
     thinc>=9.0.0.dev4,<9.1.0
     torch>=1.12.0
spacy_curated_transformers/models/output.py (34 changes: 33 additions & 1 deletion)

@@ -1,6 +1,7 @@
 from dataclasses import dataclass
-from typing import Generic, List, Optional, TypeVar
+from typing import Any, Dict, Generic, List, Optional, TypeVar

+import srsly
 from thinc.types import Floats2d, Ragged

 TrfOutputT = TypeVar("TrfOutputT", Floats2d, Ragged)
@@ -82,3 +83,34 @@ def all_hidden_layer_states(self) -> List[Ragged]:
     @property
     def num_outputs(self) -> int:
         return len(self.all_outputs)
+
+    def from_dict(self, msg: Dict[str, Any]) -> "DocTransformerOutput":
+        self.all_outputs = [
+            Ragged(dataXd, lengths) for (dataXd, lengths) in msg["all_outputs"]
+        ]
+        self.last_layer_only = msg["last_layer_only"]
+        return self
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "all_outputs": [
+                (layer.dataXd, layer.lengths) for layer in self.all_outputs
+            ],
+            "last_layer_only": self.last_layer_only,
+        }
+
+
+@srsly.msgpack_encoders("doc_transformer_output")
+def serialize_transformer_data(obj: DocTransformerOutput, chain=None):
+    if isinstance(obj, DocTransformerOutput):
+        return {"__doc_transformer_output__": obj.to_dict()}
+    return obj if chain is None else chain(obj)
+
+
+@srsly.msgpack_decoders("doc_transformer_output")
+def deserialize_transformer_data(obj, chain=None):
+    if "__doc_transformer_output__" in obj:
+        return DocTransformerOutput(all_outputs=[], last_layer_only=False).from_dict(
+            obj["__doc_transformer_output__"]
+        )
+    return obj if chain is None else chain(obj)
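The `srsly` hooks above are what let `DocTransformerOutput` survive a msgpack round trip, and hence `Doc.to_bytes()`/`Doc.from_bytes()`, as the test changes below exercise. A minimal standalone sketch of the mechanism; the `trf_data` dict key and the array contents are made up for illustration:

```python
import numpy
import srsly
from thinc.types import Ragged

from spacy_curated_transformers.models.output import DocTransformerOutput

# One Ragged layer covering two docs of two tokens each.
layer = Ragged(
    numpy.zeros((4, 8), dtype="float32"), numpy.asarray([2, 2], dtype="int32")
)
output = DocTransformerOutput(all_outputs=[layer], last_layer_only=True)

# Importing the module registers the encoder/decoder above, so any msgpack
# (de)serialization that goes through srsly now round-trips the dataclass.
data = srsly.msgpack_dumps({"trf_data": output})
restored = srsly.msgpack_loads(data)["trf_data"]
assert restored.last_layer_only
assert restored.all_outputs[0].lengths.tolist() == [2, 2]
```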
spacy_curated_transformers/tests/pipeline/test_transformer.py (64 changes: 62 additions & 2 deletions)

@@ -1,3 +1,4 @@
+import multiprocessing
 from functools import partial
 from typing import Any, Dict

@@ -7,10 +8,13 @@
 import torch
 from spacy import Config, util
 from spacy.language import Language
+from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.training.initialize import init_nlp
 from spacy.training.loop import train
 from spacy.util import registry as spacy_registry
 from thinc.api import CupyOps, get_current_ops
+from thinc.backends import get_array_ops
+from thinc.model import Model

 from spacy_curated_transformers._compat import has_hf_transformers, transformers
@@ -49,6 +53,10 @@

 from ..util import make_tempdir, torch_assertclose, xp_assert_array_equal

+# Torch currently interacts badly with the fork method:
+# https://github.com/pytorch/pytorch/issues/17199
+multiprocessing.set_start_method("spawn")
+
 cfg_string_last_layer_listener = """
 # LastTransformerLayerListener

@@ -153,9 +161,10 @@
 ]


-def create_and_train_tagger(cfg_string):
+def create_tagger(cfg_string):
     config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
+
     tagger = nlp.get_pipe("tagger")

     train_examples = []
@@ -164,7 +173,19 @@ def create_and_train_tagger(cfg_string):
         for tag in t[1]["tags"]:
             tagger.add_label(tag)

-    optimizer = nlp.initialize(lambda: train_examples)
+    nlp.initialize(lambda: train_examples)
+
+    return nlp
+
+
+def create_and_train_tagger(cfg_string):
+    nlp = create_tagger(cfg_string)
+
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    optimizer = nlp.create_optimizer()

     for _ in range(10):
         losses = {}
@@ -195,6 +216,22 @@ def test_tagger(cfg_string):
     evaluate_tagger_on_train_data(model)


+@pytest.mark.slow
+@pytest.mark.skipif(not has_hf_transformers, reason="requires huggingface transformers")
+@pytest.mark.parametrize(
+    "cfg_string",
+    [cfg_string_last_layer_listener, cfg_string_scalar_weighting_layer_listener],
+)
+@pytest.mark.skipif(
+    isinstance(get_current_ops(), CupyOps),
+    reason="multiprocessing and GPU support are incompatible",
+)
+def test_tagger_multiprocessing(cfg_string):
+    model = create_tagger(cfg_string)
+    for _ in model.pipe(["This is a test..."] * 100, n_process=2):
+        pass
+
+
 def _hf_tokenize_per_token(tokenizer, docs, *, roberta=False):
     if roberta:
         hf_encoding = [
@@ -478,11 +515,21 @@ def test_transformer_pipe_outputs():
     assert all([doc._.trf_data.last_layer_only for doc in docs]) == True
     assert all([len(doc._.trf_data.all_outputs) == 1 for doc in docs]) == True

+    serialized = [doc.to_bytes() for doc in docs]
+    deserialized = [Doc(nlp.vocab).from_bytes(doc_bytes) for doc_bytes in serialized]
+    for doc, doc_deserialized in zip(docs, deserialized):
+        _assert_doc_model_output_equal(doc, doc_deserialized)
+
     pipe = make_transformer(nlp, "transformer", model, all_layer_outputs=True)
     docs = list(pipe.pipe(docs))
     assert all([not doc._.trf_data.last_layer_only for doc in docs]) == True
     assert all([len(doc._.trf_data.all_outputs) == 12 + 1 for doc in docs]) == True
+
+    serialized = [doc.to_bytes() for doc in docs]
+    deserialized = [Doc(nlp.vocab).from_bytes(doc_bytes) for doc_bytes in serialized]
+    for doc, doc_deserialized in zip(docs, deserialized):
+        _assert_doc_model_output_equal(doc, doc_deserialized)


 cfg_string_gradual_unfreezing = (
     cfg_string_last_layer_listener
@@ -717,3 +764,16 @@ def test_transformer_add_pipe():
         ]
         == DEFAULT_CONFIG["transformer"]["model"]["with_spans"]["@architectures"]
     )
+
+
+def _assert_doc_model_output_equal(doc1: Doc, doc2: Doc):
+    output1 = doc1._.trf_data
+    output2 = doc2._.trf_data
+
+    assert output1.last_layer_only == output2.last_layer_only
+    assert len(output1.all_outputs) == len(output2.all_outputs)
+
+    for layer1, layer2 in zip(output1.all_outputs, output2.all_outputs):
+        ops = get_array_ops(layer1.dataXd)
+        ops.xp.testing.assert_allclose(layer1.dataXd, layer2.dataXd)
+        ops.xp.testing.assert_array_equal(layer1.lengths, layer2.lengths)
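The new `test_tagger_multiprocessing` test relies on the module-level `multiprocessing.set_start_method("spawn")` call above, since forking after torch has initialized can deadlock. Outside of pytest the same pattern also needs a `__main__` guard, because spawned workers re-import the entry module. A sketch under those assumptions; the pipeline name is hypothetical:

```python
import multiprocessing

import spacy

if __name__ == "__main__":
    # "spawn" starts fresh interpreter processes instead of forking,
    # sidestepping torch's fork issues (pytorch/pytorch#17199).
    multiprocessing.set_start_method("spawn")

    # Hypothetical pipeline name; any pipeline built with these
    # components would do.
    nlp = spacy.load("my_curated_trf_pipeline")
    texts = ["This is a test..."] * 100
    # Spawned workers receive pickled state, which is presumably why the
    # loader closure in hf_loader.py below had to become a proper class.
    for doc in nlp.pipe(texts, n_process=2):
        pass
```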
spacy_curated_transformers/tokenization/hf_loader.py (32 changes: 21 additions & 11 deletions)

@@ -19,6 +19,26 @@
 SUPPORTED_TOKENIZERS = ()  # type: ignore


+class _HFPieceEncoderLoader:
+    """This was formerly an inline function. However, only proper objects
+    can be pickled."""
+
+    def __init__(self, *, name: str, revision: str):
+        self.name = name
+        self.revision = revision
+
+    def __call__(self, model, X=None, Y=None):
+        if not has_hf_transformers:
+            raise ValueError(
+                "`HFPieceEncoderLoader` requires the Hugging Face `transformers` package to be installed"
+            )
+
+        tokenizer = transformers.AutoTokenizer.from_pretrained(
+            self.name, revision=self.revision
+        )
+        return _convert_encoder(model, tokenizer)
+
+
 def build_hf_piece_encoder_loader_v1(*, name: str, revision: str = "main") -> Callable[
     [Tok2PiecesModelT, Optional[Tok2PiecesInT], Optional[Tok2PiecesInT]],
     Tok2PiecesModelT,
@@ -31,17 +51,7 @@ def build_hf_piece_encoder_loader_v1(*, name: str, revision: str = "main") -> Ca
     revision (str):
         Name of the model revision/branch.
     """
-
-    def load(model, X=None, Y=None):
-        if not has_hf_transformers:
-            raise ValueError(
-                "`HFPieceEncoderLoader` requires the Hugging Face `transformers` package to be installed"
-            )
-
-        tokenizer = transformers.AutoTokenizer.from_pretrained(name, revision=revision)
-        return _convert_encoder(model, tokenizer)
-
-    return load
+    return _HFPieceEncoderLoader(name=name, revision=revision)


 def _convert_encoder(
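The docstring on `_HFPieceEncoderLoader` is terse, but the constraint it refers to is easy to demonstrate: `pickle` can only serialize callables that are importable by qualified name, so a function defined inside `build_hf_piece_encoder_loader_v1` cannot be pickled, while an instance of a module-level class can. A standalone illustration (standard library only; all names are made up):

```python
import pickle


def build_loader_closure(name: str):
    def load(model, X=None, Y=None):  # local function: not importable by name
        return name

    return load


class LoaderObject:
    def __init__(self, name: str):
        self.name = name

    def __call__(self, model, X=None, Y=None):
        return self.name


# Works: the class lives at module level, and its state is plain data.
pickle.dumps(LoaderObject("bert-base-cased"))

try:
    pickle.dumps(build_loader_closure("bert-base-cased"))
except (AttributeError, pickle.PicklingError) as err:
    print(err)  # Can't pickle local object 'build_loader_closure.<locals>.load'
```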