Add JSON schemas (see #2928)

explosion · Nov 15, 2018 · db2f29b · db2f29b
1 parent 6e27900
commit db2f29b
Show file tree

Hide file tree

Showing 7 changed files with 366 additions and 0 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -10,6 +10,7 @@ ujson>=1.35
 dill>=0.2,<0.3
 regex==2017.4.5
 requests>=2.13.0,<3.0.0
+jsonschema>=2.6.0,<3.0.0
 pytest>=3.6.0,<4.0.0
 pytest-timeout>=1.3.0,<2.0.0
 mock>=2.0.0,<3.0.0

diff --git a/setup.py b/setup.py
@@ -198,6 +198,7 @@ def setup_package():
  'regex==2017.4.5',
  'dill>=0.2,<0.3',
  'requests>=2.13.0,<3.0.0',
+ 'jsonschema>=2.6.0,<3.0.0',
  'pathlib==1.0.1; python_version < "3.4"'],
  setup_requires=['wheel'],
  extras_require={

diff --git a/spacy/cli/schemas/__init__.py b/spacy/cli/schemas/__init__.py
@@ -0,0 +1,51 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from pathlib import Path
+from jsonschema import Draft4Validator
+
+from ...errors import Errors
+from ...util import read_json
+
+
+# Cache of schemas that were already loaded and checked, keyed by schema name.
+# Filled lazily by get_schema() so each .json file is only read once.
+SCHEMAS = {}
+
+
+def get_schema(name):
+ """Get the JSON schema for a given name. Looks for a .json file in
+ spacy.cli.schemas, validates the schema and raises ValueError if not found.
+
+ EXAMPLE:
+ >>> schema = get_schema('training')
+
+ name (unicode): The name of the schema.
+ RETURNS (dict): The JSON schema.
+ """
+ if name not in SCHEMAS:
+ schema_path = Path(__file__).parent / '{}.json'.format(name)
+ if not schema_path.exists():
+ raise ValueError(Errors.E102.format(name=name))
+ schema = read_json(schema_path)
+ # TODO: replace with (stable) Draft6Validator, if available
+ validator = Draft4Validator(schema)
+ validator.check_schema(schema)
+ SCHEMAS[name] = schema
+ return SCHEMAS[name]
+
+
+def validate_json(data, schema):
+ """Validate data against a given JSON schema (see https://json-schema.org).
+
+ data: JSON-serializable data to validate.
+ schema (dict): The JSON schema.
+ RETURNS (list): A list of error messages, if available.
+ """
+ validator = Draft4Validator(schema)
+ errors = []
+ for err in sorted(validator.iter_errors(data), key=lambda e: e.path):
+ if err.path:
+ err_path = '[{}]'.format(' -> '.join([str(p) for p in err.path]))
+ else:
+ err_path = ''
+ errors.append(err.message + ' ' + err_path)
+ return errors
diff --git a/spacy/cli/schemas/meta.json b/spacy/cli/schemas/meta.json
@@ -0,0 +1,128 @@
+{
+ "$schema": "http://json-schema.org/draft-06/schema",
+ "type": "object",
+ "properties": {
+ "lang": {
+ "title": "Two-letter language code, e.g. 'en'",
+ "type": "string",
+ "minLength": 2,
+ "maxLength": 2,
+ "pattern": "^[a-z]*$"
+ },
+ "name": {
+ "title": "Model name",
+ "type": "string",
+ "minLength": 1,
+ "pattern": "^[a-z_]*$"
+ },
+ "version": {
+ "title": "Model version",
+ "type": "string",
+ "minLength": 1,
+ "pattern": "^[0-9a-z.-]*$"
+ },
+ "spacy_version": {
+ "title": "Compatible spaCy version identifier",
+ "type": "string",
+ "minLength": 1,
+ "pattern": "^[0-9a-z.><=-]*$"
+ },
+ "parent_package": {
+ "title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
+ "type": "string",
+ "minLength": 1,
+ "default": "spacy"
+ },
+ "pipeline": {
+ "title": "Names of pipeline components",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "minLength": 1
+ }
+ },
+ "description": {
+ "title": "Model description",
+ "type": "string"
+ },
+ "license": {
+ "title": "Model license",
+ "type": "string"
+ },
+ "author": {
+ "title": "Model author name",
+ "type": "string"
+ },
+ "email": {
+ "title": "Model author email",
+ "type": "string",
+ "format": "email"
+ },
+ "url": {
+ "title": "Model author URL",
+ "type": "string",
+ "format": "uri"
+ },
+ "sources": {
+ "title": "Training data sources",
+ "type": "array",
+ "items": {
+ "type": "string"
+ }
+ },
+ "vectors": {
+ "title": "Included word vectors",
+ "type": "object",
+ "properties": {
+ "keys": {
+ "title": "Number of unique keys",
+ "type": "integer",
+ "minimum": 0
+ },
+ "vectors": {
+ "title": "Number of unique vectors",
+ "type": "integer",
+ "minimum": 0
+ },
+ "width": {
+ "title": "Number of dimensions",
+ "type": "integer",
+ "minimum": 0
+ }
+ }
+ },
+ "accuracy": {
+ "title": "Accuracy numbers",
+ "type": "object",
+ "patternProperties": {
+ ".*": {
+ "type": "number",
+ "minimum": 0.0
+ }
+ }
+ },
+ "speed": {
+ "title": "Speed evaluation numbers",
+ "type": "object",
+ "patternProperties": {
+ ".*": {
+ "anyOf": [
+ {
+ "type": "number",
+ "minimum": 0.0
+ },
+ {
+ "type": "integer",
+ "minimum": 0
+ }
+ ]
+ }
+ }
+ }
+ },
+ "required": [
+ "lang",
+ "name",
+ "version"
+ ]
+}
diff --git a/spacy/cli/schemas/training.json b/spacy/cli/schemas/training.json
@@ -0,0 +1,146 @@
+{
+ "$schema": "http://json-schema.org/draft-06/schema",
+ "title": "Training data for spaCy models",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "text": {
+ "title": "The text of the training example",
+ "type": "string",
+ "minLength": 1
+ },
+ "ents": {
+ "title": "Named entity spans in the text",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "start": {
+ "title": "Start character offset of the span",
+ "type": "integer",
+ "minimum": 0
+ },
+ "end": {
+ "title": "End character offset of the span",
+ "type": "integer",
+ "minimum": 0
+ },
+ "label": {
+ "title": "Entity label",
+ "type": "string",
+ "minLength": 1,
+ "pattern": "^[A-Z0-9]*$"
+ }
+ },
+ "required": [
+ "start",
+ "end",
+ "label"
+ ]
+ }
+ },
+ "sents": {
+ "title": "Sentence spans in the text",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "start": {
+ "title": "Start character offset of the span",
+ "type": "integer",
+ "minimum": 0
+ },
+ "end": {
+ "title": "End character offset of the span",
+ "type": "integer",
+ "minimum": 0
+ }
+ },
+ "required": [
+ "start",
+ "end"
+ ]
+ }
+ },
+ "cats": {
+ "title": "Text categories for the text classifier",
+ "type": "object",
+ "patternProperties": {
+ ".*": {
+ "title": "A text category",
+ "oneOf": [
+ {
+ "type": "boolean"
+ },
+ {
+ "type": "number",
+ "minimum": 0
+ }
+ ]
+ }
+ },
+ "propertyNames": {
+ "pattern": "^[A-Z0-9]*$",
+ "minLength": 1
+ }
+ },
+ "tokens": {
+ "title": "The tokens in the text",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "minProperties": 1,
+ "properties": {
+ "id": {
+ "title": "Token ID, usually token index",
+ "type": "integer",
+ "minimum": 0
+ },
+ "start": {
+ "title": "Start character offset of the token",
+ "type": "integer",
+ "minimum": 0
+ },
+ "end": {
+ "title": "End character offset of the token",
+ "type": "integer",
+ "minimum": 0
+ },
+ "pos": {
+ "title": "Coarse-grained part-of-speech tag",
+ "type": "string",
+ "minLength": 1
+ },
+ "tag": {
+ "title": "Fine-grained part-of-speech tag",
+ "type": "string",
+ "minLength": 1
+ },
+ "dep": {
+ "title": "Dependency label",
+ "type": "string",
+ "minLength": 1
+ },
+ "head": {
+ "title": "Index of the token's head",
+ "type": "integer",
+ "minimum": 0
+ }
+ },
+ "required": [
+ "start",
+ "end"
+ ]
+ }
+ },
+ "_": {
+ "title": "Custom user space",
+ "type": "object"
+ }
+ },
+ "required": [
+ "text"
+ ]
+ }
+}
diff --git a/spacy/errors.py b/spacy/errors.py
@@ -270,6 +270,7 @@ class Errors(object):
  "NBOR_RELOP.")
  E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
  "have been declared in previous edges.")
+ E102 = ("Can't find JSON schema for '{name}'.")
 
 
 @add_codes

diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py
@@ -0,0 +1,38 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from spacy.cli.schemas import validate_json, get_schema
+import pytest
+
+
+@pytest.fixture(scope='session')
+def training_schema():
+ return get_schema('training')
+
+
+def test_json_schema_get():
+ schema = get_schema('training')
+ assert schema
+ with pytest.raises(ValueError):
+ schema = get_schema('xxx')
+
+
+@pytest.mark.parametrize('data', [
+ {'text': 'Hello world'},
+ {'text': 'Hello', 'ents': [{'start': 0, 'end': 5, 'label': 'TEST'}]}
+])
+def test_json_schema_training_valid(data, training_schema):
+ errors = validate_json([data], training_schema)
+ assert not errors
+
+
+@pytest.mark.parametrize('data,n_errors', [
+ ({'spans': []}, 1),
+ ({'text': 'Hello', 'ents': [{'start': '0', 'end': '5', 'label': 'TEST'}]}, 2),
+ ({'text': 'Hello', 'ents': [{'start': 0, 'end': 5}]}, 1),
+ ({'text': 'Hello', 'ents': [{'start': 0, 'end': 5, 'label': 'test'}]}, 1),
+ ({'text': 'spaCy', 'tokens': [{'pos': 'PROPN'}]}, 2)
+])
+def test_json_schema_training_invalid(data, n_errors, training_schema):
+ errors = validate_json([data], training_schema)
+ assert len(errors) == n_errors