From a2e178f157b49aae69fd41c118fc69927f2e1c69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Tue, 12 Dec 2023 10:06:01 +0100 Subject: [PATCH 1/7] Add ParametricAttention.v2 This layer is an extension of the existing `ParametricAttention` layer, adding support for transformations (such as a non-linear layer) of the key representation. This brings the model closer to the paper that suggested it (Yang et al, 2016) and gave slightly better results in experiments. --- thinc/api.py | 3 +- thinc/layers/__init__.py | 2 + thinc/layers/parametricattention_v2.py | 107 +++++++++++++++++++++++++ thinc/tests/layers/test_layers_api.py | 3 + website/docs/api-layers.md | 38 +++++++++ 5 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 thinc/layers/parametricattention_v2.py diff --git a/thinc/api.py b/thinc/api.py index b2bc346a0..204aa386e 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -41,6 +41,7 @@ MultiSoftmax, MXNetWrapper, ParametricAttention, + ParametricAttention_v2, PyTorchLSTM, PyTorchRNNWrapper, PyTorchWrapper, @@ -207,7 +208,7 @@ "PyTorchWrapper", "PyTorchRNNWrapper", "PyTorchLSTM", "TensorFlowWrapper", "keras_subclass", "MXNetWrapper", "PyTorchWrapper_v2", "Softmax_v2", "PyTorchWrapper_v3", - "SparseLinear_v2", "TorchScriptWrapper_v1", + "SparseLinear_v2", "TorchScriptWrapper_v1", "ParametricAttention_v2", "add", "bidirectional", "chain", "clone", "concatenate", "noop", "residual", "uniqued", "siamese", "list2ragged", "ragged2list", diff --git a/thinc/layers/__init__.py b/thinc/layers/__init__.py index 032af5fde..841e6c072 100644 --- a/thinc/layers/__init__.py +++ b/thinc/layers/__init__.py @@ -35,6 +35,7 @@ from .noop import noop from .padded2list import padded2list from .parametricattention import ParametricAttention +from .parametricattention_v2 import ParametricAttention_v2 from .premap_ids import premap_ids from .pytorchwrapper import ( PyTorchRNNWrapper, @@ -94,6 +95,7 @@ "Mish", "MultiSoftmax", "ParametricAttention", + "ParametricAttention_v2", "PyTorchLSTM", "PyTorchWrapper", "PyTorchWrapper_v2", diff --git a/thinc/layers/parametricattention_v2.py b/thinc/layers/parametricattention_v2.py new file mode 100644 index 000000000..04b09d5a1 --- /dev/null +++ b/thinc/layers/parametricattention_v2.py @@ -0,0 +1,107 @@ +from typing import Callable, Optional, Tuple, cast + +from ..config import registry +from ..model import Model +from ..types import Floats2d, Ragged +from ..util import get_width + +InT = Ragged +OutT = Ragged + + +@registry.layers("ParametricAttention.v2") +def ParametricAttention_v2( + *, + key_transform: Optional[Model[Floats2d, Floats2d]] = None, + nO: Optional[int] = None +) -> Model[InT, OutT]: + if key_transform is None: + layers = [] + refs = {} + else: + layers = [key_transform] + refs = {"key_transform": cast(Optional[Model], key_transform)} + + layers = [key_transform] if key_transform is not None else [] + + """Weight inputs by similarity to a learned vector""" + return Model( + "para-attn", + forward, + init=init, + params={"Q": None}, + dims={"nO": nO}, + layers=layers, + refs=refs, + ) + + +def forward(model: Model[InT, OutT], Xr: InT, is_train: bool) -> Tuple[OutT, Callable]: + Q = model.get_param("Q") + key_transform = model.maybe_get_ref("key_transform") + + attention, bp_attention = _get_attention( + model.ops, Q, key_transform, Xr.dataXd, Xr.lengths, is_train + ) + output, bp_output = _apply_attention(model.ops, attention, Xr.dataXd, Xr.lengths) + + def backprop(dYr: OutT) -> InT: + dX, d_attention = 
bp_output(dYr.dataXd) + dQ, dX2 = bp_attention(d_attention) + model.inc_grad("Q", dQ.ravel()) + dX += dX2 + return Ragged(dX, dYr.lengths) + + return Ragged(output, Xr.lengths), backprop + + +def init( + model: Model[InT, OutT], X: Optional[InT] = None, Y: Optional[OutT] = None +) -> None: + key_transform = model.maybe_get_ref("key_transform") + width = get_width(X) if X is not None else None + if width: + model.set_dim("nO", width) + if key_transform is not None: + key_transform.set_dim("nO", width) + + # Randomly initialize the parameter, as though it were an embedding. + Q = model.ops.alloc1f(model.get_dim("nO")) + Q += model.ops.xp.random.uniform(-0.1, 0.1, Q.shape) + model.set_param("Q", Q) + + X_array = X.dataXd if X is not None else None + Y_array = Y.dataXd if Y is not None else None + + if key_transform is not None: + key_transform.initialize(X_array, Y_array) + + +def _get_attention(ops, Q, key_transform, X, lengths, is_train): + if key_transform is None: + K, K_bp = X, lambda dY: dY + else: + K, K_bp = key_transform(X, is_train=is_train) + + attention = ops.gemm(K, ops.reshape2f(Q, -1, 1)) + attention = ops.softmax_sequences(attention, lengths) + + def get_attention_bwd(d_attention): + d_attention = ops.backprop_softmax_sequences(d_attention, attention, lengths) + dQ = ops.gemm(K, d_attention, trans1=True) + dY = ops.xp.outer(d_attention, Q) + dX = K_bp(dY) + return dQ, dX + + return attention, get_attention_bwd + + +def _apply_attention(ops, attention, X, lengths): + output = X * attention + + def apply_attention_bwd(d_output): + d_attention = (X * d_output).sum(axis=1, keepdims=True) + dX = d_output * attention + return dX, d_attention + + return output, apply_attention_bwd diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 0ef559d96..62be60dc4 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -8,6 +8,7 @@ from thinc.api import Dropout, Model, NumpyOps, registry, with_padded from thinc.backends import NumpyOps from thinc.compat import has_torch +from thinc.layers.relu import Relu from thinc.types import Array2d, Floats2d, FloatsXd, Padded, Ragged, Shape from thinc.util import data_validation, get_width @@ -129,6 +130,8 @@ def assert_data_match(Y, out_data): ("MultiSoftmax.v1", {"nOs": (1, 3)}, array2d, array2d), # ("CauchySimilarity.v1", {}, (array2d, array2d), array1d), ("ParametricAttention.v1", {}, ragged, ragged), + ("ParametricAttention.v2", {}, ragged, ragged), + ("ParametricAttention.v2", {"key_transform": {"@layers": "Gelu.v1"}}, ragged, ragged), ("SparseLinear.v1", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d), ("SparseLinear.v2", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d), ("remap_ids.v1", {"dtype": "f"}, ["a", 1, 5.0], array2dint), diff --git a/website/docs/api-layers.md b/website/docs/api-layers.md index dbdde5b20..ef2fa0a03 100644 --- a/website/docs/api-layers.md +++ b/website/docs/api-layers.md @@ -686,6 +686,44 @@ attention mechanism. https://github.com/explosion/thinc/blob/master/thinc/layers/parametricattention.py ``` +### ParametricAttention_v2 {#parametricattention_v2 tag="function"} + + + +- **Input:** Ragged +- **Output:** Ragged +- **Parameters:** Q + + + +A layer that uses the parametric attention scheme described by +[Yang et al. (2016)](https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf). 
+The layer learns a parameter vector that is used as the keys in a single-headed +attention mechanism. + + + +The original `ParametricAttention` layer uses the hidden representation as-is +for the keys in the attention. This differs from the paper that introduces +parametric attention (Equation 5). `ParametricAttention_v2` adds the option to +transform the key representation in line with the paper by passing such a +transformation through the `key_transform` parameter. + + + + +| Argument | Type | Description | +|-----------------|----------------------------------------------|------------------------------------------------------------------------| +| `key_transform` | Optional[Model[Floats2d, Floats2d]] | Transformation to apply to the key representations. Defaults to `None` | +| `nO` | Optional[int] | The size of the output vectors. | +| **RETURNS** | Model[Ragged, Ragged] | The created attention layer. | + +```python +https://github.com/explosion/thinc/blob/master/thinc/layers/parametricattention_v2.py +``` + + + ### Relu {#relu tag="function"} From f394c848894e20fbf4396f2f7361c4339da750cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 13 Dec 2023 13:55:58 +0100 Subject: [PATCH 2/7] Use `noop` for when `key_transform` is `None` --- thinc/layers/parametricattention_v2.py | 28 +++++++++----------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/thinc/layers/parametricattention_v2.py b/thinc/layers/parametricattention_v2.py index 04b09d5a1..58e0c3668 100644 --- a/thinc/layers/parametricattention_v2.py +++ b/thinc/layers/parametricattention_v2.py @@ -5,6 +5,8 @@ from ..types import Floats2d, Ragged from ..util import get_width +from .noop import noop + InT = Ragged OutT = Ragged @@ -16,13 +18,7 @@ def ParametricAttention_v2( nO: Optional[int] = None ) -> Model[InT, OutT]: if key_transform is None: - layers = [] - refs = {} - else: - layers = [key_transform] - refs = {"key_transform": cast(Optional[Model], key_transform)} - - layers = [key_transform] if key_transform is not None else [] + key_transform = noop() """Weight inputs by similarity to a learned vector""" return Model( @@ -31,14 +27,14 @@ def ParametricAttention_v2( init=init, params={"Q": None}, dims={"nO": nO}, - layers=layers, - refs=refs, + refs={"key_transform": key_transform}, + layers=[key_transform], ) def forward(model: Model[InT, OutT], Xr: InT, is_train: bool) -> Tuple[OutT, Callable]: Q = model.get_param("Q") - key_transform = model.maybe_get_ref("key_transform") + key_transform = model.get_ref("key_transform") attention, bp_attention = _get_attention( model.ops, Q, key_transform, Xr.dataXd, Xr.lengths, is_train @@ -58,11 +54,11 @@ def backprop(dYr: OutT) -> InT: def init( model: Model[InT, OutT], X: Optional[InT] = None, Y: Optional[OutT] = None ) -> None: - key_transform = model.maybe_get_ref("key_transform") + key_transform = model.get_ref("key_transform") width = get_width(X) if X is not None else None if width: model.set_dim("nO", width) - if key_transform is not None: + if key_transform.has_dim("nO"): key_transform.set_dim("nO", width) # Randomly initialize the parameter, as though it were an embedding. 
@@ -73,15 +69,11 @@ def init( X_array = X.dataXd if X is not None else None Y_array = Y.dataXd if Y is not None else None - if key_transform is not None: - key_transform.initialize(X_array, Y_array) + key_transform.initialize(X_array, Y_array) def _get_attention(ops, Q, key_transform, X, lengths, is_train): - if key_transform is None: - K, K_bp = X, lambda dY: dY - else: - K, K_bp = key_transform(X, is_train=is_train) + K, K_bp = key_transform(X, is_train=is_train) attention = ops.gemm(K, ops.reshape2f(Q, -1, 1)) attention = ops.softmax_sequences(attention, lengths) From 137a457dd505fdd1fcd1807a49bcf326a4928ba2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 13 Dec 2023 13:56:54 +0100 Subject: [PATCH 3/7] Remove stray import --- thinc/tests/layers/test_layers_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 62be60dc4..046d98940 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -8,7 +8,6 @@ from thinc.api import Dropout, Model, NumpyOps, registry, with_padded from thinc.backends import NumpyOps from thinc.compat import has_torch -from thinc.layers.relu import Relu from thinc.types import Array2d, Floats2d, FloatsXd, Padded, Ragged, Shape from thinc.util import data_validation, get_width From e92d581710c5f8964c903fa25ecb57836d294f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 13 Dec 2023 13:58:47 +0100 Subject: [PATCH 4/7] Add constant for key transform ref --- thinc/layers/parametricattention_v2.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/thinc/layers/parametricattention_v2.py b/thinc/layers/parametricattention_v2.py index 58e0c3668..7c70d43e1 100644 --- a/thinc/layers/parametricattention_v2.py +++ b/thinc/layers/parametricattention_v2.py @@ -10,6 +10,8 @@ InT = Ragged OutT = Ragged +KEY_TRANSFORM_REF: str = "key_transform" + @registry.layers("ParametricAttention.v2") def ParametricAttention_v2( @@ -27,14 +29,14 @@ def ParametricAttention_v2( init=init, params={"Q": None}, dims={"nO": nO}, - refs={"key_transform": key_transform}, + refs={KEY_TRANSFORM_REF: key_transform}, layers=[key_transform], ) def forward(model: Model[InT, OutT], Xr: InT, is_train: bool) -> Tuple[OutT, Callable]: Q = model.get_param("Q") - key_transform = model.get_ref("key_transform") + key_transform = model.get_ref(KEY_TRANSFORM_REF) attention, bp_attention = _get_attention( model.ops, Q, key_transform, Xr.dataXd, Xr.lengths, is_train @@ -54,7 +56,7 @@ def backprop(dYr: OutT) -> InT: def init( model: Model[InT, OutT], X: Optional[InT] = None, Y: Optional[OutT] = None ) -> None: - key_transform = model.get_ref("key_transform") + key_transform = model.get_ref(KEY_TRANSFORM_REF) width = get_width(X) if X is not None else None if width: model.set_dim("nO", width) From 06be6dd2206f0b58a9f4b7da2041aac91e67c93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 13 Dec 2023 14:01:05 +0100 Subject: [PATCH 5/7] Check that we correctly set the key transform --- thinc/tests/layers/test_parametric_attention_v2.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 thinc/tests/layers/test_parametric_attention_v2.py diff --git a/thinc/tests/layers/test_parametric_attention_v2.py b/thinc/tests/layers/test_parametric_attention_v2.py new file mode 100644 index 000000000..9f8286a0b --- /dev/null +++ b/thinc/tests/layers/test_parametric_attention_v2.py @@ -0,0 
+1,10 @@ +from thinc.layers.parametricattention_v2 import ( + KEY_TRANSFORM_REF, + ParametricAttention_v2, +) +from thinc.layers.gelu import Gelu + + +def test_key_transform_used(): + attn = ParametricAttention_v2(key_transform=Gelu()) + assert attn.get_ref(KEY_TRANSFORM_REF).name == "gelu" From 2369b340733e7c6189297a80d1e68be17582cd16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 13 Dec 2023 14:16:01 +0100 Subject: [PATCH 6/7] isooooooort --- thinc/layers/parametricattention_v2.py | 1 - thinc/tests/layers/test_parametric_attention_v2.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/thinc/layers/parametricattention_v2.py b/thinc/layers/parametricattention_v2.py index 7c70d43e1..e252dd7d2 100644 --- a/thinc/layers/parametricattention_v2.py +++ b/thinc/layers/parametricattention_v2.py @@ -4,7 +4,6 @@ from ..model import Model from ..types import Floats2d, Ragged from ..util import get_width - from .noop import noop InT = Ragged diff --git a/thinc/tests/layers/test_parametric_attention_v2.py b/thinc/tests/layers/test_parametric_attention_v2.py index 9f8286a0b..fd88880f4 100644 --- a/thinc/tests/layers/test_parametric_attention_v2.py +++ b/thinc/tests/layers/test_parametric_attention_v2.py @@ -1,8 +1,8 @@ +from thinc.layers.gelu import Gelu from thinc.layers.parametricattention_v2 import ( KEY_TRANSFORM_REF, ParametricAttention_v2, ) -from thinc.layers.gelu import Gelu def test_key_transform_used(): From 80f47b793be92f82d533b5c244ede9be4b62965e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 13 Dec 2023 15:25:05 +0100 Subject: [PATCH 7/7] Update citation to ACL link Co-authored-by: Adriane Boyd --- website/docs/api-layers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api-layers.md b/website/docs/api-layers.md index ef2fa0a03..442ecb463 100644 --- a/website/docs/api-layers.md +++ b/website/docs/api-layers.md @@ -697,7 +697,7 @@ https://github.com/explosion/thinc/blob/master/thinc/layers/parametricattention. A layer that uses the parametric attention scheme described by -[Yang et al. (2016)](https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf). +[Yang et al. (2016)](https://aclanthology.org/N16-1174). The layer learns a parameter vector that is used as the keys in a single-headed attention mechanism.
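
---

A minimal usage sketch for the layer added by this series, mirroring the `key_transform={"@layers": "Gelu.v1"}` test case registered above. The surrounding pooling pipeline (`list2ragged` → attention → `reduce_sum`) and the explicit `Gelu` dimensions are illustrative assumptions about typical use, not something the patches prescribe.

```python
# Sketch: ParametricAttention_v2 with a GELU key transform, pooling a batch of
# variable-length sequences into one vector per sequence. Pipeline shape and
# explicit dims are assumptions; only the layer itself comes from this series.
import numpy
from thinc.api import ParametricAttention_v2, chain, list2ragged, reduce_sum
from thinc.layers.gelu import Gelu  # imported as in the PR's test module

width = 64
model = chain(
    list2ragged(),  # List[Floats2d] -> Ragged
    ParametricAttention_v2(key_transform=Gelu(nO=width, nI=width), nO=width),
    reduce_sum(),   # sum the attention-weighted rows per sequence -> Floats2d
)
model.initialize()  # dims are given explicitly, so no sample data is needed

# Two "documents" of 4 and 3 token vectors each.
Xs = [
    numpy.random.uniform(-1, 1, (4, width)).astype("f"),
    numpy.random.uniform(-1, 1, (3, width)).astype("f"),
]
Y = model.predict(Xs)
print(Y.shape)  # (2, 64): one pooled vector per input sequence
```

The same construction can presumably be expressed in a Thinc config with `@layers = "ParametricAttention.v2"` and a nested `key_transform` block referencing `Gelu.v1`, which is how the updated `test_layers_api.py` parametrization exercises it.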