Tidy up and auto-format

explosion · Aug 18, 2019 · 009280f
1 parent 89f2b87
commit 009280f
Show file tree

Hide file tree

Showing 12 changed files with 126 additions and 104 deletions.
diff --git a/spacy/_ml.py b/spacy/_ml.py
@@ -674,14 +674,14 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
  with Model.define_operators({">>": chain, "**": clone}):
  # context encoder
  tok2vec = Tok2Vec(
-  width=hidden_width,
-  embed_size=embed_width,
-  pretrained_vectors=pretrained_vectors,
-  cnn_maxout_pieces=cnn_maxout_pieces,
-  subword_features=True,
-  conv_depth=conv_depth,
-  bilstm_depth=0,
-  )
+ width=hidden_width,
+ embed_size=embed_width,
+ pretrained_vectors=pretrained_vectors,
+ cnn_maxout_pieces=cnn_maxout_pieces,
+ subword_features=True,
+ conv_depth=conv_depth,
+ bilstm_depth=0,
+ )
 
  model = (
  tok2vec

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
@@ -8,7 +8,7 @@
 import srsly
 from wasabi import Printer, MESSAGES
 
-from ..gold import GoldCorpus, read_json_object
+from ..gold import GoldCorpus
 from ..syntax import nonproj
 from ..util import load_model, get_lang_class
 
@@ -95,13 +95,19 @@ def debug_data(
  corpus = GoldCorpus(train_path, dev_path)
  try:
  train_docs = list(corpus.train_docs(nlp))
- train_docs_unpreprocessed = list(corpus.train_docs_without_preprocessing(nlp))
+ train_docs_unpreprocessed = list(
+ corpus.train_docs_without_preprocessing(nlp)
+ )
  except ValueError as e:
- loading_train_error_message = "Training data cannot be loaded: {}".format(str(e))
+ loading_train_error_message = "Training data cannot be loaded: {}".format(
+ str(e)
+ )
  try:
  dev_docs = list(corpus.dev_docs(nlp))
  except ValueError as e:
- loading_dev_error_message = "Development data cannot be loaded: {}".format(str(e))
+ loading_dev_error_message = "Development data cannot be loaded: {}".format(
+ str(e)
+ )
  if loading_train_error_message or loading_dev_error_message:
  if loading_train_error_message:
  msg.fail(loading_train_error_message)
@@ -158,11 +164,15 @@ def debug_data(
  )
  if gold_train_data["n_misaligned_words"] > 0:
  msg.warn(
- "{} misaligned tokens in the training data".format(gold_train_data["n_misaligned_words"])
+ "{} misaligned tokens in the training data".format(
+ gold_train_data["n_misaligned_words"]
+ )
  )
  if gold_dev_data["n_misaligned_words"] > 0:
  msg.warn(
- "{} misaligned tokens in the dev data".format(gold_dev_data["n_misaligned_words"])
+ "{} misaligned tokens in the dev data".format(
+ gold_dev_data["n_misaligned_words"]
+ )
  )
  most_common_words = gold_train_data["words"].most_common(10)
  msg.text(
@@ -184,7 +194,9 @@ def debug_data(
 
  if "ner" in pipeline:
  # Get all unique NER labels present in the data
- labels = set(label for label in gold_train_data["ner"] if label not in ("O", "-"))
+ labels = set(
+ label for label in gold_train_data["ner"] if label not in ("O", "-")
+ )
  label_counts = gold_train_data["ner"]
  model_labels = _get_labels_from_model(nlp, "ner")
  new_labels = [l for l in labels if l not in model_labels]
@@ -222,7 +234,9 @@ def debug_data(
  )
 
  if gold_train_data["ws_ents"]:
- msg.fail("{} invalid whitespace entity spans".format(gold_train_data["ws_ents"]))
+ msg.fail(
+ "{} invalid whitespace entity spans".format(gold_train_data["ws_ents"])
+ )
  has_ws_ents_error = True
 
  for label in new_labels:
@@ -323,33 +337,36 @@ def debug_data(
  "Found {} sentence{} with an average length of {:.1f} words.".format(
  gold_train_data["n_sents"],
  "s" if len(train_docs) > 1 else "",
- gold_train_data["n_words"] / gold_train_data["n_sents"]
+ gold_train_data["n_words"] / gold_train_data["n_sents"],
  )
  )
 
  # profile labels
  labels_train = [label for label in gold_train_data["deps"]]
- labels_train_unpreprocessed = [label for label in gold_train_unpreprocessed_data["deps"]]
+ labels_train_unpreprocessed = [
+ label for label in gold_train_unpreprocessed_data["deps"]
+ ]
  labels_dev = [label for label in gold_dev_data["deps"]]
 
  if gold_train_unpreprocessed_data["n_nonproj"] > 0:
  msg.info(
  "Found {} nonprojective train sentence{}".format(
  gold_train_unpreprocessed_data["n_nonproj"],
- "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else ""
+ "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
  )
  )
  if gold_dev_data["n_nonproj"] > 0:
  msg.info(
  "Found {} nonprojective dev sentence{}".format(
  gold_dev_data["n_nonproj"],
- "s" if gold_dev_data["n_nonproj"] > 1 else ""
+ "s" if gold_dev_data["n_nonproj"] > 1 else "",
  )
  )
 
  msg.info(
  "{} {} in train data".format(
- len(labels_train_unpreprocessed), "label" if len(labels_train) == 1 else "labels"
+ len(labels_train_unpreprocessed),
+ "label" if len(labels_train) == 1 else "labels",
  )
  )
  msg.info(
@@ -373,43 +390,45 @@ def debug_data(
  )
  has_low_data_warning = True
 
-
  # rare labels in projectivized train
  rare_projectivized_labels = []
  for label in gold_train_data["deps"]:
  if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
- rare_projectivized_labels.append("{}: {}".format(label, str(gold_train_data["deps"][label])))
+ rare_projectivized_labels.append(
+ "{}: {}".format(label, str(gold_train_data["deps"][label]))
+ )
 
  if len(rare_projectivized_labels) > 0:
- msg.warn(
- "Low number of examples for {} label{} in the "
- "projectivized dependency trees used for training. You may "
- "want to projectivize labels such as punct before "
- "training in order to improve parser performance.".format(
- len(rare_projectivized_labels),
- "s" if len(rare_projectivized_labels) > 1 else "")
- )
- msg.warn(
- "Projectivized labels with low numbers of examples: "
- "{}".format("\n".join(rare_projectivized_labels)),
- show=verbose
+ msg.warn(
+ "Low number of examples for {} label{} in the "
+ "projectivized dependency trees used for training. You may "
+ "want to projectivize labels such as punct before "
+ "training in order to improve parser performance.".format(
+ len(rare_projectivized_labels),
+ "s" if len(rare_projectivized_labels) > 1 else "",
  )
- has_low_data_warning = True
+ )
+ msg.warn(
+ "Projectivized labels with low numbers of examples: "
+ "{}".format("\n".join(rare_projectivized_labels)),
+ show=verbose,
+ )
+ has_low_data_warning = True
 
  # labels only in train
  if set(labels_train) - set(labels_dev):
  msg.warn(
  "The following labels were found only in the train data: "
  "{}".format(", ".join(set(labels_train) - set(labels_dev))),
- show=verbose
+ show=verbose,
  )
 
  # labels only in dev
  if set(labels_dev) - set(labels_train):
  msg.warn(
- "The following labels were found only in the dev data: " +
- ", ".join(set(labels_dev) - set(labels_train)),
- show=verbose
+ "The following labels were found only in the dev data: "
+ + ", ".join(set(labels_dev) - set(labels_train)),
+ show=verbose,
  )
 
  if has_low_data_warning:
@@ -422,8 +441,10 @@ def debug_data(
  # multiple root labels
  if len(gold_train_unpreprocessed_data["roots"]) > 1:
  msg.warn(
- "Multiple root labels ({}) ".format(", ".join(gold_train_unpreprocessed_data["roots"])) +
- "found in training data. spaCy's parser uses a single root "
+ "Multiple root labels ({}) ".format(
+ ", ".join(gold_train_unpreprocessed_data["roots"])
+ )
+ + "found in training data. spaCy's parser uses a single root "
  "label ROOT so this distinction will not be available."
  )
 
@@ -432,14 +453,14 @@ def debug_data(
  msg.fail(
  "Found {} nonprojective projectivized train sentence{}".format(
  gold_train_data["n_nonproj"],
- "s" if gold_train_data["n_nonproj"] > 1 else ""
+ "s" if gold_train_data["n_nonproj"] > 1 else "",
  )
  )
  if gold_train_data["n_cycles"] > 0:
  msg.fail(
  "Found {} projectivized train sentence{} with cycles".format(
  gold_train_data["n_cycles"],
- "s" if gold_train_data["n_cycles"] > 1 else ""
+ "s" if gold_train_data["n_cycles"] > 1 else "",
  )
  )
 

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
@@ -114,7 +114,7 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
  probs, _ = read_freqs(freqs_loc)
  msg.good("Counted frequencies")
  else:
- probs, _ = ({}, DEFAULT_OOV_PROB)
+ probs, _ = ({}, DEFAULT_OOV_PROB) # noqa: F841
  if clusters_loc:
  with msg.loading("Reading clusters..."):
  clusters = read_clusters(clusters_loc)

diff --git a/spacy/errors.py b/spacy/errors.py
@@ -429,6 +429,7 @@ class Errors(object):
  E155 = ("The `nlp` object should have access to pre-trained word vectors, cf. "
  "https://spacy.io/usage/models#languages.")
 
+
 @add_codes
 class TempErrors(object):
  T003 = ("Resizing pre-trained Tagger models is not currently supported.")

diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
@@ -1,10 +1,8 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-import re
 import sys
 
-
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from ...attrs import LANG
@@ -32,7 +30,7 @@ class Morpheme:
  from typing import NamedTuple
 
  class Morpheme(NamedTuple):
- 
+
  surface = str("")
  lemma = str("")
  tag = str("")

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
@@ -8,6 +8,7 @@
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 
+
 class ChineseDefaults(Language.Defaults):
  lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
  lex_attr_getters[LANG] = lambda text: "zh"
@@ -45,4 +46,4 @@ def make_doc(self, text):
  return Doc(self.vocab, words=words, spaces=spaces)
 
 
-__all__ = ["Chinese"]
+__all__ = ["Chinese"]
diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py
@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import POS, PUNCT, SYM, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PART, INTJ, PRON
 
 # The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set.
 # We also map the tags to the simpler Google Universal POS tag set.
@@ -43,5 +43,5 @@
  "JJ": {POS: ADJ},
  "P": {POS: ADP},
  "PN": {POS: PRON},
- "PU": {POS: PUNCT}
-}
+ "PU": {POS: PUNCT},
+}
diff --git a/spacy/scorer.py b/spacy/scorer.py
@@ -160,14 +160,15 @@ def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")):
  cand_deps.add((gold_i, gold_head, token.dep_.lower()))
  if "-" not in [token[-1] for token in gold.orig_annot]:
  # Find all NER labels in gold and doc
- ent_labels = set([x[0] for x in gold_ents]
- + [k.label_ for k in doc.ents])
+ ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
  # Set up all labels for per type scoring and prepare gold per type
  gold_per_ents = {ent_label: set() for ent_label in ent_labels}
  for ent_label in ent_labels:
  if ent_label not in self.ner_per_ents:
  self.ner_per_ents[ent_label] = PRFScore()
- gold_per_ents[ent_label].update([x for x in gold_ents if x[0] == ent_label])
+ gold_per_ents[ent_label].update(
+ [x for x in gold_ents if x[0] == ent_label]
+ )
  # Find all candidate labels, for all and per type
  cand_ents = set()
  cand_per_ents = {ent_label: set() for ent_label in ent_labels}

diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import pytest
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc
 

diff --git a/spacy/tests/regression/test_issue4104.py b/spacy/tests/regression/test_issue4104.py
@@ -3,12 +3,13 @@
 
 from ..util import get_doc
 
+
 def test_issue4104(en_vocab):
  """Test that English lookup lemmatization of spun & dry are correct
  expected mapping = {'dry': 'dry', 'spun': 'spin', 'spun-dry': 'spin-dry'}
- """
- text = 'dry spun spun-dry'
+ """
+ text = "dry spun spun-dry"
  doc = get_doc(en_vocab, [t for t in text.split(" ")])
  # using a simple list to preserve order
- expected = ['dry', 'spin', 'spin-dry']
+ expected = ["dry", "spin", "spin-dry"]
  assert [token.lemma_ for token in doc] == expected
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
@@ -6,6 +6,7 @@
 from spacy.tokens import Doc
 import pytest
 
+
 def test_gold_biluo_U(en_vocab):
  words = ["I", "flew", "to", "London", "."]
  spaces = [True, True, True, False, True]
@@ -32,14 +33,18 @@ def test_gold_biluo_BIL(en_vocab):
  tags = biluo_tags_from_offsets(doc, entities)
  assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
 
+
 def test_gold_biluo_overlap(en_vocab):
  words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
  spaces = [True, True, True, True, True, False, True]
  doc = Doc(en_vocab, words=words, spaces=spaces)
- entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
- (len("I flew to "), len("I flew to San Francisco"), "LOC")]
+ entities = [
+ (len("I flew to "), len("I flew to San Francisco Valley"), "LOC"),
+ (len("I flew to "), len("I flew to San Francisco"), "LOC"),
+ ]
  with pytest.raises(ValueError):
- tags = biluo_tags_from_offsets(doc, entities)
+ biluo_tags_from_offsets(doc, entities)
+
 
 def test_gold_biluo_misalign(en_vocab):
  words = ["I", "flew", "to", "San", "Francisco", "Valley."]