From a2e178f157b49aae69fd41c118fc69927f2e1c69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Tue, 12 Dec 2023 10:06:01 +0100 Subject: [PATCH 1/7] Add ParametricAttention.v2 This layer is an extension of the existing `ParametricAttention` layer, adding support for transformations (such as a non-linear layer) of the key representation. This brings the model closer to the paper that suggested it (Yang et al, 2016) and gave slightly better results in experiments. --- thinc/api.py | 3 +- thinc/layers/__init__.py | 2 + thinc/layers/parametricattention_v2.py | 107 +++++++++++++++++++++++++ thinc/tests/layers/test_layers_api.py | 3 + website/docs/api-layers.md | 38 +++++++++ 5 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 thinc/layers/parametricattention_v2.py diff --git a/thinc/api.py b/thinc/api.py index b2bc346a0..204aa386e 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -41,6 +41,7 @@ MultiSoftmax, MXNetWrapper, ParametricAttention, + ParametricAttention_v2, PyTorchLSTM, PyTorchRNNWrapper, PyTorchWrapper, @@ -207,7 +208,7 @@ "PyTorchWrapper", "PyTorchRNNWrapper", "PyTorchLSTM", "TensorFlowWrapper", "keras_subclass", "MXNetWrapper", "PyTorchWrapper_v2", "Softmax_v2", "PyTorchWrapper_v3", - "SparseLinear_v2", "TorchScriptWrapper_v1", + "SparseLinear_v2", "TorchScriptWrapper_v1", "ParametricAttention_v2", "add", "bidirectional", "chain", "clone", "concatenate", "noop", "residual", "uniqued", "siamese", "list2ragged", "ragged2list", diff --git a/thinc/layers/__init__.py b/thinc/layers/__init__.py index 032af5fde..841e6c072 100644 --- a/thinc/layers/__init__.py +++ b/thinc/layers/__init__.py @@ -35,6 +35,7 @@ from .noop import noop from .padded2list import padded2list from .parametricattention import ParametricAttention +from .parametricattention_v2 import ParametricAttention_v2 from .premap_ids import premap_ids from .pytorchwrapper import ( PyTorchRNNWrapper, @@ -94,6 +95,7 @@ "Mish", "MultiSoftmax", "ParametricAttention", + "ParametricAttention_v2", "PyTorchLSTM", "PyTorchWrapper", "PyTorchWrapper_v2", diff --git a/thinc/layers/parametricattention_v2.py b/thinc/layers/parametricattention_v2.py new file mode 100644 index 000000000..04b09d5a1 --- /dev/null +++ b/thinc/layers/parametricattention_v2.py @@ -0,0 +1,107 @@ +from typing import Callable, Optional, Tuple, cast + +from ..config import registry +from ..model import Model +from ..types import Floats2d, Ragged +from ..util import get_width + +InT = Ragged +OutT = Ragged + + +@registry.layers("ParametricAttention.v2") +def ParametricAttention_v2( + *, + key_transform: Optional[Model[Floats2d, Floats2d]] = None, + nO: Optional[int] = None +) -> Model[InT, OutT]: + if key_transform is None: + layers = [] + refs = {} + else: + layers = [key_transform] + refs = {"key_transform": cast(Optional[Model], key_transform)} + + layers = [key_transform] if key_transform is not None else [] + + """Weight inputs by similarity to a learned vector""" + return Model( + "para-attn", + forward, + init=init, + params={"Q": None}, + dims={"nO": nO}, + layers=layers, + refs=refs, + ) + + +def forward(model: Model[InT, OutT], Xr: InT, is_train: bool) -> Tuple[OutT, Callable]: + Q = model.get_param("Q") + key_transform = model.maybe_get_ref("key_transform") + + attention, bp_attention = _get_attention( + model.ops, Q, key_transform, Xr.dataXd, Xr.lengths, is_train + ) + output, bp_output = _apply_attention(model.ops, attention, Xr.dataXd, Xr.lengths) + + def backprop(dYr: OutT) -> InT: + dX, d_attention = 
bp_output(dYr.dataXd) + dQ, dX2 = bp_attention(d_attention) + model.inc_grad("Q", dQ.ravel()) + dX += dX2 + return Ragged(dX, dYr.lengths) + + return Ragged(output, Xr.lengths), backprop + + +def init( + model: Model[InT, OutT], X: Optional[InT] = None, Y: Optional[OutT] = None +) -> None: + key_transform = model.maybe_get_ref("key_transform") + width = get_width(X) if X is not None else None + if width: + model.set_dim("nO", width) + if key_transform is not None: + key_transform.set_dim("nO", width) + + # Randomly initialize the parameter, as though it were an embedding. + Q = model.ops.alloc1f(model.get_dim("nO")) + Q += model.ops.xp.random.uniform(-0.1, 0.1, Q.shape) + model.set_param("Q", Q) + + X_array = X.dataXd if X is not None else None + Y_array = Y.dataXd if Y is not None else None + + if key_transform is not None: + key_transform.initialize(X_array, Y_array) + + +def _get_attention(ops, Q, key_transform, X, lengths, is_train): + if key_transform is None: + K, K_bp = X, lambda dY: dY + else: + K, K_bp = key_transform(X, is_train=is_train) + + attention = ops.gemm(K, ops.reshape2f(Q, -1, 1)) + attention = ops.softmax_sequences(attention, lengths) + + def get_attention_bwd(d_attention): + d_attention = ops.backprop_softmax_sequences(d_attention, attention, lengths) + dQ = ops.gemm(K, d_attention, trans1=True) + dY = ops.xp.outer(d_attention, Q) + dX = K_bp(dY) + return dQ, dX + + return attention, get_attention_bwd + + +def _apply_attention(ops, attention, X, lengths): + output = X * attention + + def apply_attention_bwd(d_output): + d_attention = (X * d_output).sum(axis=1, keepdims=True) + dX = d_output * attention + return dX, d_attention + + return output, apply_attention_bwd diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 0ef559d96..62be60dc4 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -8,6 +8,7 @@ from thinc.api import Dropout, Model, NumpyOps, registry, with_padded from thinc.backends import NumpyOps from thinc.compat import has_torch +from thinc.layers.relu import Relu from thinc.types import Array2d, Floats2d, FloatsXd, Padded, Ragged, Shape from thinc.util import data_validation, get_width @@ -129,6 +130,8 @@ def assert_data_match(Y, out_data): ("MultiSoftmax.v1", {"nOs": (1, 3)}, array2d, array2d), # ("CauchySimilarity.v1", {}, (array2d, array2d), array1d), ("ParametricAttention.v1", {}, ragged, ragged), + ("ParametricAttention.v2", {}, ragged, ragged), + ("ParametricAttention.v2", {"key_transform": {"@layers": "Gelu.v1"}}, ragged, ragged), ("SparseLinear.v1", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d), ("SparseLinear.v2", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d), ("remap_ids.v1", {"dtype": "f"}, ["a", 1, 5.0], array2dint), diff --git a/website/docs/api-layers.md b/website/docs/api-layers.md index dbdde5b20..ef2fa0a03 100644 --- a/website/docs/api-layers.md +++ b/website/docs/api-layers.md @@ -686,6 +686,44 @@ attention mechanism. https://github.com/explosion/thinc/blob/master/thinc/layers/parametricattention.py ``` +### ParametricAttention_v2 {#parametricattention_v2 tag="function"} + + + +- **Input:** Ragged +- **Output:** Ragged +- **Parameters:** Q + + + +A layer that uses the parametric attention scheme described by +[Yang et al. (2016)](https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf). 
+The layer learns a parameter vector that is used as the keys in a single-headed +attention mechanism. + + + +The original `ParametricAttention` layer uses the hidden representation as-is +for the keys in the attention. This differs from the paper that introduces +parametric attention (Equation 5). `ParametricAttention_v2` adds the option to +transform the key representation in line with the paper by passing such a +transformation through the `key_transform` parameter. + + + + +| Argument | Type | Description | +|-----------------|----------------------------------------------|------------------------------------------------------------------------| +| `key_transform` | Optional[Model[Floats2d, Floats2d]] | Transformation to apply to the key representations. Defaults to `None` | +| `nO` | Optional[int] | The size of the output vectors. | +| **RETURNS** | Model[Ragged, Ragged] | The created attention layer. | + +```python +https://github.com/explosion/thinc/blob/master/thinc/layers/parametricattention_v2.py +``` + + + ### Relu {#relu tag="function"} From f394c848894e20fbf4396f2f7361c4339da750cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 13 Dec 2023 13:55:58 +0100 Subject: [PATCH 2/7] Use `noop` for when `key_transform` is `None` --- thinc/layers/parametricattention_v2.py | 28 +++++++++----------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/thinc/layers/parametricattention_v2.py b/thinc/layers/parametricattention_v2.py index 04b09d5a1..58e0c3668 100644 --- a/thinc/layers/parametricattention_v2.py +++ b/thinc/layers/parametricattention_v2.py @@ -5,6 +5,8 @@ from ..types import Floats2d, Ragged from ..util import get_width +from .noop import noop + InT = Ragged OutT = Ragged @@ -16,13 +18,7 @@ def ParametricAttention_v2( nO: Optional[int] = None ) -> Model[InT, OutT]: if key_transform is None: - layers = [] - refs = {} - else: - layers = [key_transform] - refs = {"key_transform": cast(Optional[Model], key_transform)} - - layers = [key_transform] if key_transform is not None else [] + key_transform = noop() """Weight inputs by similarity to a learned vector""" return Model( @@ -31,14 +27,14 @@ def ParametricAttention_v2( init=init, params={"Q": None}, dims={"nO": nO}, - layers=layers, - refs=refs, + refs={"key_transform": key_transform}, + layers=[key_transform], ) def forward(model: Model[InT, OutT], Xr: InT, is_train: bool) -> Tuple[OutT, Callable]: Q = model.get_param("Q") - key_transform = model.maybe_get_ref("key_transform") + key_transform = model.get_ref("key_transform") attention, bp_attention = _get_attention( model.ops, Q, key_transform, Xr.dataXd, Xr.lengths, is_train @@ -58,11 +54,11 @@ def backprop(dYr: OutT) -> InT: def init( model: Model[InT, OutT], X: Optional[InT] = None, Y: Optional[OutT] = None ) -> None: - key_transform = model.maybe_get_ref("key_transform") + key_transform = model.get_ref("key_transform") width = get_width(X) if X is not None else None if width: model.set_dim("nO", width) - if key_transform is not None: + if key_transform.has_dim("nO"): key_transform.set_dim("nO", width) # Randomly initialize the parameter, as though it were an embedding. 
@@ -73,15 +69,11 @@ def init( X_array = X.dataXd if X is not None else None Y_array = Y.dataXd if Y is not None else None - if key_transform is not None: - key_transform.initialize(X_array, Y_array) + key_transform.initialize(X_array, Y_array) def _get_attention(ops, Q, key_transform, X, lengths, is_train): - if key_transform is None: - K, K_bp = X, lambda dY: dY - else: - K, K_bp = key_transform(X, is_train=is_train) + K, K_bp = key_transform(X, is_train=is_train) attention = ops.gemm(K, ops.reshape2f(Q, -1, 1)) attention = ops.softmax_sequences(attention, lengths) From 137a457dd505fdd1fcd1807a49bcf326a4928ba2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 13 Dec 2023 13:56:54 +0100 Subject: [PATCH 3/7] Remove stray import --- thinc/tests/layers/test_layers_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 62be60dc4..046d98940 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -8,7 +8,6 @@ from thinc.api import Dropout, Model, NumpyOps, registry, with_padded from thinc.backends import NumpyOps from thinc.compat import has_torch -from thinc.layers.relu import Relu from thinc.types import Array2d, Floats2d, FloatsXd, Padded, Ragged, Shape from thinc.util import data_validation, get_width From e92d581710c5f8964c903fa25ecb57836d294f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 13 Dec 2023 13:58:47 +0100 Subject: [PATCH 4/7] Add constant for key transform ref --- thinc/layers/parametricattention_v2.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/thinc/layers/parametricattention_v2.py b/thinc/layers/parametricattention_v2.py index 58e0c3668..7c70d43e1 100644 --- a/thinc/layers/parametricattention_v2.py +++ b/thinc/layers/parametricattention_v2.py @@ -10,6 +10,8 @@ InT = Ragged OutT = Ragged +KEY_TRANSFORM_REF: str = "key_transform" + @registry.layers("ParametricAttention.v2") def ParametricAttention_v2( @@ -27,14 +29,14 @@ def ParametricAttention_v2( init=init, params={"Q": None}, dims={"nO": nO}, - refs={"key_transform": key_transform}, + refs={KEY_TRANSFORM_REF: key_transform}, layers=[key_transform], ) def forward(model: Model[InT, OutT], Xr: InT, is_train: bool) -> Tuple[OutT, Callable]: Q = model.get_param("Q") - key_transform = model.get_ref("key_transform") + key_transform = model.get_ref(KEY_TRANSFORM_REF) attention, bp_attention = _get_attention( model.ops, Q, key_transform, Xr.dataXd, Xr.lengths, is_train @@ -54,7 +56,7 @@ def backprop(dYr: OutT) -> InT: def init( model: Model[InT, OutT], X: Optional[InT] = None, Y: Optional[OutT] = None ) -> None: - key_transform = model.get_ref("key_transform") + key_transform = model.get_ref(KEY_TRANSFORM_REF) width = get_width(X) if X is not None else None if width: model.set_dim("nO", width) From 06be6dd2206f0b58a9f4b7da2041aac91e67c93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 13 Dec 2023 14:01:05 +0100 Subject: [PATCH 5/7] Check that we correctly set the key transform --- thinc/tests/layers/test_parametric_attention_v2.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 thinc/tests/layers/test_parametric_attention_v2.py diff --git a/thinc/tests/layers/test_parametric_attention_v2.py b/thinc/tests/layers/test_parametric_attention_v2.py new file mode 100644 index 000000000..9f8286a0b --- /dev/null +++ b/thinc/tests/layers/test_parametric_attention_v2.py @@ -0,0 
+1,10 @@ +from thinc.layers.parametricattention_v2 import ( + KEY_TRANSFORM_REF, + ParametricAttention_v2, +) +from thinc.layers.gelu import Gelu + + +def test_key_transform_used(): + attn = ParametricAttention_v2(key_transform=Gelu()) + assert attn.get_ref(KEY_TRANSFORM_REF).name == "gelu" From 2369b340733e7c6189297a80d1e68be17582cd16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Wed, 13 Dec 2023 14:16:01 +0100 Subject: [PATCH 6/7] isooooooort --- thinc/layers/parametricattention_v2.py | 1 - thinc/tests/layers/test_parametric_attention_v2.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/thinc/layers/parametricattention_v2.py b/thinc/layers/parametricattention_v2.py index 7c70d43e1..e252dd7d2 100644 --- a/thinc/layers/parametricattention_v2.py +++ b/thinc/layers/parametricattention_v2.py @@ -4,7 +4,6 @@ from ..model import Model from ..types import Floats2d, Ragged from ..util import get_width - from .noop import noop InT = Ragged diff --git a/thinc/tests/layers/test_parametric_attention_v2.py b/thinc/tests/layers/test_parametric_attention_v2.py index 9f8286a0b..fd88880f4 100644 --- a/thinc/tests/layers/test_parametric_attention_v2.py +++ b/thinc/tests/layers/test_parametric_attention_v2.py @@ -1,8 +1,8 @@ +from thinc.layers.gelu import Gelu from thinc.layers.parametricattention_v2 import ( KEY_TRANSFORM_REF, ParametricAttention_v2, ) -from thinc.layers.gelu import Gelu def test_key_transform_used(): From 80f47b793be92f82d533b5c244ede9be4b62965e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 13 Dec 2023 15:25:05 +0100 Subject: [PATCH 7/7] Update citation to ACL link Co-authored-by: Adriane Boyd --- website/docs/api-layers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api-layers.md b/website/docs/api-layers.md index ef2fa0a03..442ecb463 100644 --- a/website/docs/api-layers.md +++ b/website/docs/api-layers.md @@ -697,7 +697,7 @@ https://github.com/explosion/thinc/blob/master/thinc/layers/parametricattention. A layer that uses the parametric attention scheme described by -[Yang et al. (2016)](https://www.cs.cmu.edu/~./hovy/papers/16HLT-hierarchical-attention-networks.pdf). +[Yang et al. (2016)](https://aclanthology.org/N16-1174). The layer learns a parameter vector that is used as the keys in a single-headed attention mechanism.
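
---

A minimal usage sketch for the layer added by this series, mirroring the `key_transform={"@layers": "Gelu.v1"}` test case registered above. The surrounding pooling pipeline (`list2ragged` → attention → `reduce_sum`) and the explicit `Gelu` dimensions are illustrative assumptions about typical use, not something the patches prescribe.

```python
# Sketch: ParametricAttention_v2 with a GELU key transform, pooling a batch of
# variable-length sequences into one vector per sequence. Pipeline shape and
# explicit dims are assumptions; only the layer itself comes from this series.
import numpy
from thinc.api import ParametricAttention_v2, chain, list2ragged, reduce_sum
from thinc.layers.gelu import Gelu  # imported as in the PR's test module

width = 64
model = chain(
    list2ragged(),  # List[Floats2d] -> Ragged
    ParametricAttention_v2(key_transform=Gelu(nO=width, nI=width), nO=width),
    reduce_sum(),   # sum the attention-weighted rows per sequence -> Floats2d
)
model.initialize()  # dims are given explicitly, so no sample data is needed

# Two "documents" of 4 and 3 token vectors each.
Xs = [
    numpy.random.uniform(-1, 1, (4, width)).astype("f"),
    numpy.random.uniform(-1, 1, (3, width)).astype("f"),
]
Y = model.predict(Xs)
print(Y.shape)  # (2, 64): one pooled vector per input sequence
```

The same construction can presumably be expressed in a Thinc config with `@layers = "ParametricAttention.v2"` and a nested `key_transform` block referencing `Gelu.v1`, which is how the updated `test_layers_api.py` parametrization exercises it.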