Update thinc.ai to v8.1.1 #762

Merged Sep 9, 2022 (22 commits)

Commits
2da00a3
Fix typo
cclauss Jul 13, 2022
b4b37ce
Merge pull request #727 from cclauss/patch-1
polm Jul 14, 2022
40c129f
Update build constraints for arm64 and aarch64 wheels (#716)
adrianeboyd Jul 18, 2022
5a4f868
Ops: replace FloatsType by constrained typevar (#720)
danieldk Jul 28, 2022
8e5c743
Unroll `argmax` in `maxout` for small sizes of `P` (#702)
danieldk Jul 28, 2022
42b73c9
Change Docker image tag to thinc-ai (#732)
danieldk Aug 3, 2022
69a280f
Add `with_signpost_interval` layer (#711)
danieldk Aug 3, 2022
1846855
Docs: Fix/update `label_smoothing` description, run prettier (#733)
shadeMe Aug 4, 2022
af0e3de
Add Dish activation (#719)
danieldk Aug 4, 2022
7fcdd0f
Auto-format code with black (#737)
github-actions[bot] Aug 5, 2022
d95b5fc
Increment `blis` version upper-bound to `0.10.0` (#736)
shadeMe Aug 5, 2022
01eb6b7
asarrayDf: take `Sequence[float]`, not `Sequence[int]` (#739)
danieldk Aug 5, 2022
a43635e
Use confection for configurations (#745)
rmitsch Aug 26, 2022
eda4c75
`PyTorchGradScaler`: Cache `_found_inf` on the CPU (#746)
shadeMe Aug 29, 2022
a7bbc48
More general remap_ids (#726)
kadarakos Sep 2, 2022
102d654
Auto-format code with black (#753)
github-actions[bot] Sep 5, 2022
fba3bf0
Switch to macos-latest (#755)
adrianeboyd Sep 6, 2022
fc323e1
`util`: Explicitly call `__dlpack__` built-in method in `xp2tensorflo…
shadeMe Sep 7, 2022
9836e9e
Set version to 8.1.1 (#758)
danieldk Sep 9, 2022
cda32eb
Merge remote-tracking branch 'upstream/master' into thinc.ai-v8.1.1
danieldk Sep 9, 2022
97a1a04
Remove references to FastAPI being an Explosion product (#761)
rmitsch Sep 9, 2022
2a4985b
Merge remote-tracking branch 'upstream/master' into thinc.ai-v8.1.1
danieldk Sep 9, 2022
Changes from all commits
2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

# Thinc: A refreshing functional take on deep learning, compatible with your favorite libraries

### From the makers of [spaCy](https://spacy.io), [Prodigy](https://prodi.gy) and [FastAPI](https://fastapi.tiangolo.com)
### From the makers of [spaCy](https://spacy.io) and [Prodigy](https://prodi.gy)

[Thinc](https://thinc.ai) is a **lightweight deep learning library** that offers an elegant,
type-checked, functional-programming API for **composing models**, with support
2 changes: 1 addition & 1 deletion azure-pipelines.yml
@@ -23,7 +23,7 @@ jobs:
imageName: 'windows-2019'
python.version: '3.6'
Python37Mac:
imageName: 'macos-10.15'
imageName: 'macos-latest'
python.version: '3.7'
Python38Linux:
imageName: 'ubuntu-latest'
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ requires = [
"murmurhash>=1.0.2,<1.1.0",
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"blis>=0.7.8,<0.8.0",
"blis>=0.7.8,<0.10.0",
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"
3 changes: 2 additions & 1 deletion requirements.txt
@@ -2,7 +2,7 @@
murmurhash>=1.0.2,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
blis>=0.7.8,<0.8.0
blis>=0.7.8,<0.10.0
srsly>=2.4.0,<3.0.0
wasabi>=0.8.1,<1.1.0
catalogue>=2.0.4,<2.1.0
@@ -34,3 +34,4 @@ nbformat>=5.0.4,<5.2.0
# Test to_disk/from_disk against pathlib.Path subclasses
pathy>=0.3.5
black>=22.0,<23.0
confection>=0.0.1,<1.0.0
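
The new `confection` pin comes from the "Use confection for configurations (#745)" commit: Thinc's config parsing now lives in the confection package. A minimal sketch of what this dependency is used for, assuming the familiar `thinc.api.Config`/`registry` interface that confection now backs (the config contents are illustrative):

```python
from thinc.api import Config, registry

CONFIG_STR = """
[model]
@layers = "Linear.v1"
nO = 10
nI = 4
"""

# Parsing and validation of the config string are delegated to confection.
config = Config().from_str(CONFIG_STR)
# Resolving builds the registered objects, here a Linear layer with nO=10, nI=4.
model = registry.resolve(config)["model"]
```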
5 changes: 3 additions & 2 deletions setup.cfg
@@ -35,16 +35,17 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=1.0.2,<1.1.0
blis>=0.7.8,<0.8.0
blis>=0.7.8,<0.10.0
install_requires =
# Explosion-provided dependencies
blis>=0.7.8,<0.8.0
blis>=0.7.8,<0.10.0
murmurhash>=1.0.2,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
wasabi>=0.8.1,<1.1.0
srsly>=2.4.0,<3.0.0
catalogue>=2.0.4,<2.1.0
confection>=0.0.1,<1.0.0
# Third-party dependencies
setuptools
numpy>=1.15.0
2 changes: 1 addition & 1 deletion setup.py
@@ -25,7 +25,7 @@
]
COMPILE_OPTIONS = {
"msvc": ["/Ox", "/EHsc"],
"other": ["-O3", "-Wno-strict-prototypes", "-Wno-unused-function"],
"other": ["-O3", "-Wno-strict-prototypes", "-Wno-unused-function", "-std=c++11"],
}
COMPILER_DIRECTIVES = {
"language_level": -3,
2 changes: 1 addition & 1 deletion thinc/about.py
@@ -1,2 +1,2 @@
__version__ = "8.1.0"
__version__ = "8.1.1"
__release__ = True
3 changes: 2 additions & 1 deletion thinc/api.py
@@ -27,7 +27,7 @@
from .layers import CauchySimilarity, ParametricAttention, Logistic
from .layers import resizable, sigmoid_activation, Sigmoid, SparseLinear
from .layers import ClippedLinear, ReluK, HardTanh, HardSigmoid
from .layers import HardSwish, HardSwishMobilenet, Swish, Gelu
from .layers import Dish, HardSwish, HardSwishMobilenet, Swish, Gelu
from .layers import PyTorchWrapper, PyTorchRNNWrapper, PyTorchLSTM
from .layers import TensorFlowWrapper, keras_subclass, MXNetWrapper
from .layers import PyTorchWrapper_v2, Softmax_v2
@@ -40,6 +40,7 @@
from .layers import with_reshape, with_getitem, strings2arrays, list2array
from .layers import list2ragged, ragged2list, list2padded, padded2list, remap_ids
from .layers import array_getitem, with_cpu, with_debug, with_nvtx_range
from .layers import with_signpost_interval
from .layers import tuplify

from .layers import reduce_first, reduce_last, reduce_max, reduce_mean, reduce_sum
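
The two import changes above correspond to the "Add Dish activation (#719)" and "Add `with_signpost_interval` layer (#711)" commits, which add `Dish` (a dense layer with the Dish activation) and `with_signpost_interval` (wrapping a layer to emit macOS signpost intervals) to the public API. A hypothetical sketch of composing a model with the new `Dish` layer; the layer sizes are illustrative, not from this PR:

```python
from thinc.api import Dish, Softmax, chain

# Compose a small feed-forward model: Dish hidden layer, softmax output.
model = chain(Dish(nO=64, nI=32), Softmax(nO=10, nI=64))
model.initialize()
```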
31 changes: 31 additions & 0 deletions thinc/backends/_custom_kernels.cu
@@ -161,6 +161,20 @@ __global__ void clipped_linear(T* Y, const T* X, double slope, double offset, do
}


template <typename T>
__global__ void dish(T* Y, const T* X, int N)
{
int _loop_start = blockIdx.x * blockDim.x + threadIdx.x;
int _loop_stride = blockDim.x * gridDim.x;

for (int i = _loop_start; i < N; i += _loop_stride)
{
T x = X[i];
Y[i] = 0.5 * x * (x / sqrt(1 + x * x) + 1);
}
}
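
For reference, the `dish` kernel above computes the Dish activation

```latex
\operatorname{dish}(x) = \frac{x}{2}\left(\frac{x}{\sqrt{1 + x^2}} + 1\right)
```

which has a Swish/GELU-like shape but needs no `exp` or `erf` call.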


template <typename T>
__global__ void gelu(T* Y, const T* X, double threshold, int N)
{
@@ -414,6 +428,23 @@ __global__ void backprop_hard_swish_mobilenet(T* dX, const T* dY, const T* X, in
}


template <typename T>
__global__ void backprop_dish(T* dX, const T* dY, const T* X, int N)
{

int _loop_start = blockIdx.x * blockDim.x + threadIdx.x;
int _loop_stride = blockDim.x * gridDim.x;

for (int i = _loop_start; i < N; i += _loop_stride)
{
T x = X[i];
T x_sq = x * x;
T x_sq_plus_one = x_sq + 1.0;
dX[i] = dY[i] * (x/sqrt(x_sq_plus_one) - (0.5 * x * x_sq)
/ pow(x_sq_plus_one, static_cast<T>(1.5)) + 0.5);
}
}
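
The backward kernel applies the corresponding derivative of Dish to the incoming gradient:

```latex
\operatorname{dish}'(x) = \frac{x}{\sqrt{x^2 + 1}} - \frac{x^3}{2\,(x^2 + 1)^{3/2}} + \frac{1}{2},
\qquad dX_i = dY_i \cdot \operatorname{dish}'(X_i)
```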


template <typename T>
__global__ void backprop_gelu(T* dX, const T* dY, const T* X,
48 changes: 48 additions & 0 deletions thinc/backends/_custom_kernels.py
@@ -10,6 +10,8 @@
KERNELS_LIST = [
"backprop_clipped_linear<double>",
"backprop_clipped_linear<float>",
"backprop_dish<double>",
"backprop_dish<float>",
"backprop_gelu<double>",
"backprop_gelu<float>",
"backprop_hard_swish<double>",
@@ -32,6 +34,8 @@
"backprop_swish<float>",
"clipped_linear<double>",
"clipped_linear<float>",
"dish<double>",
"dish<float>",
"gather_add<double>",
"gather_add<float>",
"gelu<double>",
@@ -78,6 +82,8 @@ def compile_mmh(src):

clipped_linear_kernel_float = _get_kernel("clipped_linear<float>")
clipped_linear_kernel_double = _get_kernel("clipped_linear<double>")
dish_kernel_float = _get_kernel("dish<float>")
dish_kernel_double = _get_kernel("dish<double>")
gather_add_kernel_float = _get_kernel("gather_add<float>")
gather_add_kernel_double = _get_kernel("gather_add<double>")
gelu_kernel_float = _get_kernel("gelu<float>")
@@ -98,6 +104,8 @@ def compile_mmh(src):

backprop_clipped_linear_kernel_double = _get_kernel("backprop_clipped_linear<double>")
backprop_clipped_linear_kernel_float = _get_kernel("backprop_clipped_linear<float>")
backprop_dish_kernel_double = _get_kernel("backprop_dish<double>")
backprop_dish_kernel_float = _get_kernel("backprop_dish<float>")
backprop_gelu_kernel_double = _get_kernel("backprop_gelu<double>")
backprop_gelu_kernel_float = _get_kernel("backprop_gelu<float>")
backprop_hard_swish_kernel_double = _get_kernel("backprop_hard_swish<double>")
@@ -199,6 +207,19 @@ def gather_add(table, indices, *, threads_per_block=128, num_blocks=128):
return out


def dish(X, *, inplace=False, threads_per_block=128, num_blocks=128):
_is_float_array(X)

out = X
if not inplace:
out = _alloc_like(X, zeros=False)
if X.dtype == "float32":
dish_kernel_float((num_blocks,), (threads_per_block,), (out, X, X.size))
else:
dish_kernel_double((num_blocks,), (threads_per_block,), (out, X, X.size))
return out


def gelu(X, *, inplace=False, threshold=6.0, threads_per_block=128, num_blocks=128):
_is_float_array(X)

@@ -483,6 +504,33 @@ def backprop_hard_swish_mobilenet(
return out


def backprop_dish(
dY,
X,
*,
inplace: bool = False,
threads_per_block=128,
num_blocks=128,
):
_is_float_array(dY)
_is_float_array(X, shape=dY.shape)

out = dY
if not inplace:
out = _alloc_like(dY, zeros=False)

if dY.dtype == "float32":
backprop_dish_kernel_float(
(num_blocks,), (threads_per_block,), (out, dY, X, out.size)
)
else:
backprop_dish_kernel_double(
(num_blocks,), (threads_per_block,), (out, dY, X, out.size)
)

return out
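
As a sanity check for the two wrappers above, here is a hypothetical NumPy reference for the same forward and backward computations (illustrative only, not part of this PR):

```python
import numpy as np

def dish_ref(x):
    # Forward pass: dish(x) = 0.5 * x * (x / sqrt(1 + x^2) + 1)
    return 0.5 * x * (x / np.sqrt(1.0 + x * x) + 1.0)

def backprop_dish_ref(d_y, x):
    # Backward pass: dX = dY * dish'(x)
    x_sq = x * x
    x_sq_p1 = x_sq + 1.0
    return d_y * (x / np.sqrt(x_sq_p1) - 0.5 * x * x_sq / x_sq_p1 ** 1.5 + 0.5)
```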


def backprop_gelu(
dY,
X,
74 changes: 64 additions & 10 deletions thinc/backends/cpu_kernels.hh
@@ -27,23 +27,58 @@ struct axpy {

// All elementwise functions, such as most activations, work in-place.

template <typename A, typename L>
L argmax(A* arr, L len)

template <typename T, typename L>
struct argmax_result {
T max;
L max_idx;
};

template <typename T, typename L>
argmax_result<T, L> argmax(T const *arr, L len)
{
static_assert(std::is_floating_point<A>::value,
static_assert(std::is_floating_point<T>::value,
"Array should be floating point");
static_assert(std::is_integral<L>::value, "Array length should be integral");

L max = 0;
argmax_result<T, L> r { arr[0], 0 };

for (L i = 1; i < len; ++i) {
if (arr[i] > arr[max]) {
max = i;
if (arr[i] > r.max) {
r.max = arr[i];
r.max_idx = i;
}
}

return max;
return r;
}

// The next two templates define argmax for a fixed number of elements.
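// For example, argmax<float, int>(0.1f, 0.7f, 0.3f) evaluates to
// {0.7f, 1} (max, max_idx); ties resolve to the earliest argument,
// matching the pointer-based version above.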

template <typename T, typename L>
argmax_result<T, L> argmax(T a) {
static_assert(std::is_floating_point<T>::value, "Argument should be floating point");
argmax_result<T, L> acc { a, 0 };
return acc;
}

template<typename T, typename L, typename... Args>
argmax_result<T, L> argmax(T a, Args... args) {
static_assert(std::is_floating_point<T>::value, "Arguments should be floating point");

auto acc = argmax<T, L>(args...);

if (acc.max > a) {
acc.max_idx += 1;
} else {
acc.max_idx = 0;
acc.max = a;
}

return acc;
}


template <typename A, typename L>
void vec_add(A* X, const A* Y, A scale, L N)
{
@@ -62,12 +97,31 @@ void cpu_maxout(A* best__bo, L* which__bo, const A* cands__bop, L B, L O, L P)
"Array should be floating point");
static_assert(std::is_integral<L>::value, "Array length should be integral");

for (int i = 0; i < B * O; ++i) {
which__bo[i] = argmax(cands__bop + i * P, P);
best__bo[i] = cands__bop[i * P + which__bo[i]];
// For small inputs, we use an unrolled argmax.
if (P == 2) {
for (int i = 0; i < B * O; ++i) {
A const *input = cands__bop + i * P;
auto r = argmax<A, L>(input[0], input[1]);
which__bo[i] = r.max_idx;
best__bo[i] = r.max;
}
} else if (P == 3) {
for (int i = 0; i < B * O; ++i) {
A const *input = cands__bop + i * P;
auto r = argmax<A, L>(input[0], input[1], input[2]);
which__bo[i] = r.max_idx;
best__bo[i] = r.max;
}
} else {
for (int i = 0; i < B * O; ++i) {
auto r = argmax<A, L>(cands__bop + i * P, P);
which__bo[i] = r.max_idx;
best__bo[i] = r.max;
}
}
}
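
To make the `__bo`/`__bop` naming concrete: the candidates have shape (B, O, P) and `cpu_maxout` reduces over the pool dimension P, writing the maximum value and its pool index per (batch, output) position. A rough NumPy equivalent (illustrative only):

```python
import numpy as np

def maxout_ref(cands_bop):
    # cands_bop: array of shape (B, O, P); reduce over the last (pool) axis.
    which_bo = cands_bop.argmax(axis=-1)                            # (B, O) indices
    best_bo = np.take_along_axis(cands_bop, which_bo[..., None], axis=-1)[..., 0]
    return best_bo, which_bo
```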


template <typename A, typename L>
void cpu_backprop_maxout(A* dX__bop, const A* dX__bo, const L* which__bo,
L B, L O, L P)
12 changes: 12 additions & 0 deletions thinc/backends/cupy_ops.py
@@ -36,6 +36,18 @@ def gather_add(self, table, indices):
else:
return super().gather_add(table, indices)

def dish(self, X, inplace=False):
if X.dtype in ("float32", "float64"):
return _custom_kernels.dish(X, inplace=inplace)
else:
return super().dish(X, inplace=inplace)

def backprop_dish(self, dY, X, inplace=False):
if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
return _custom_kernels.backprop_dish(dY, X, inplace=inplace)
else:
return super().backprop_dish(dY, X, inplace=inplace)

def gelu(self, X, inplace=False):
if X.dtype in ("float32", "float64"):
return _custom_kernels.gelu(X, inplace=inplace, threshold=6.0)
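
With these overrides, `CupyOps.dish` and `CupyOps.backprop_dish` dispatch to the custom CUDA kernels for float32/float64 arrays and fall back to the base `Ops` implementation otherwise. A minimal usage sketch through the current-ops API, assuming the base `Ops` gained `dish`/`backprop_dish` in the same release (as the `super()` fallbacks above imply); shapes and values are arbitrary:

```python
import numpy
from thinc.api import get_current_ops

ops = get_current_ops()                  # CupyOps on GPU, NumpyOps on CPU
X = ops.asarray2f(numpy.random.uniform(-3.0, 3.0, (2, 4)))

Y = ops.dish(X)                          # forward activation
dY = ops.alloc2f(2, 4) + 1.0             # stand-in upstream gradient
dX = ops.backprop_dish(dY, X)            # gradient w.r.t. X
```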