Fix RoBERTa SST (#110)

* Only make tokens when we don't already have them
* Changelog
allenai · Aug 17, 2020 · 4fa5fc1 · 4fa5fc1
1 parent 0491690
commit 4fa5fc1
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Fixed `GraphParser.get_metrics` so that it expects a dict from `F1Measure.get_metric`.
 - `CopyNet` and `SimpleSeq2Seq` models now work with AMP.
+- Made the SST reader a little more strict in the kinds of input it accepts.
+
 
 ## [v1.1.0rc2](https://github.com/allenai/allennlp-models/releases/tag/v1.1.0rc2) - 2020-07-31
 

diff --git a/allennlp_models/classification/dataset_readers/stanford_sentiment_tree_bank.py b/allennlp_models/classification/dataset_readers/stanford_sentiment_tree_bank.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 import logging
 
 from allennlp.data import Tokenizer
@@ -111,9 +111,20 @@ def text_to_instance(self, tokens: List[str], sentiment: str = None) -> Optional
  label : `LabelField`
  The sentiment label of the sentence or phrase.
  """
-
+ assert isinstance(
+ tokens, list
+ ) # If tokens is a str, nothing breaks but the results are garbage, so we check.
  if self._tokenizer is None:
- tokens = [Token(x) for x in tokens]
+
+ def make_token(t: Union[str, Token]):
+ if isinstance(t, str):
+ return Token(t)
+ elif isinstance(t, Token):
+ return t
+ else:
+ raise ValueError("Tokens must be either str or Token.")
+
+ tokens = [make_token(x) for x in tokens]
  else:
  tokens = self._tokenizer.tokenize(" ".join(tokens))
  text_field = TextField(tokens, token_indexers=self._token_indexers)