Update TextCatBOW to use the fixed SparseLinear layer (explosion#…

…13149) * Update `TextCatBOW` to use the fixed `SparseLinear` layer A while ago, we fixed the `SparseLinear` layer to use all available parameters: explosion/thinc#754 This change updates `TextCatBOW` to `v3` which uses the new `SparseLinear_v2` layer. This results in a sizeable improvement on a text categorization task that was tested. While at it, this `spacy.TextCatBOW.v3` also adds the `length_exponent` option to make it possible to change the hidden size. Ideally, we'd just have an option called `length`. But the way that `TextCatBOW` uses hashes results in a non-uniform distribution of parameters when the length is not a power of two. * Replace TexCatBOW `length_exponent` parameter by `length` We now round up the length to the next power of two if it isn't a power of two. * Remove some tests for TextCatBOW.v2 * Fix missing import
jordankanter · May 10, 2024 · 4149618 · 4149618
1 parent ad4da43
commit 4149618
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 7 deletions.
diff --git a/spacy/errors.py b/spacy/errors.py
@@ -974,9 +974,6 @@ class Errors(metaclass=ErrorsWithCodes):
  E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
  "but only callbacks with one or three parameters are supported")
  E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
- E1057 = ("The `TextCatReduce` architecture must be used with at least one "
- "reduction. Please enable one of `use_reduce_first`, "
- "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
  # v4 error strings
  E4000 = ("Expected a Doc as input, but got: '{type}'")

diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
@@ -499,9 +499,9 @@ def test_resize(name, textcat_config):
  ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
  ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
  ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
- # REDUCE
- ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
- ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+ # CNN
+ ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
+ ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
  ],
 )
 # fmt: on
@@ -749,7 +749,7 @@ def test_overfitting_IO_multi():
  # ENSEMBLE V2
  ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
  ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
- # CNN V2 (legacy)
+ # CNN V2
  ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
  ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
  # PARAMETRIC ATTENTION V1

diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
@@ -1020,6 +1020,46 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.TextCatCNN.v2"
+> exclusive_classes = false
+> nO = null
+>
+> [model.tok2vec]
+> @architectures = "spacy.HashEmbedCNN.v2"
+> pretrained_vectors = null
+> width = 96
+> depth = 4
+> embed_size = 2000
+> window_size = 1
+> maxout_pieces = 3
+> subword_features = true
+> ```
+
+A neural network model where token vectors are calculated using a CNN. The
+vectors are mean pooled and used as features in a feed-forward network. This
+architecture is usually less accurate than the ensemble, but runs faster.
+
+| Name | Description |
+| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
+| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
+| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
+| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
+
+<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
+
+[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
+not yet resizable. Since v2, new labels can be added to this component, even
+after training.
+
+</Accordion>
+
+### spacy.TextCatBOW.v3 {id="TextCatBOW"}
+
 > #### Example Config
 >
 > ```ini