Skip to content

Commit

Permalink
Add JSON schemas (see #2928)
Browse files Browse the repository at this point in the history
  • Loading branch information
ines committed Nov 15, 2018
1 parent 6e27900 commit db2f29b
Show file tree
Hide file tree
Showing 7 changed files with 366 additions and 0 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ ujson>=1.35
dill>=0.2,<0.3
regex==2017.4.5
requests>=2.13.0,<3.0.0
jsonschema>=2.6.0,<3.0.0
pytest>=3.6.0,<4.0.0
pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ def setup_package():
'regex==2017.4.5',
'dill>=0.2,<0.3',
'requests>=2.13.0,<3.0.0',
'jsonschema>=2.6.0,<3.0.0',
'pathlib==1.0.1; python_version < "3.4"'],
setup_requires=['wheel'],
extras_require={
Expand Down
51 changes: 51 additions & 0 deletions spacy/cli/schemas/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# coding: utf-8
from __future__ import unicode_literals

from pathlib import Path
from jsonschema import Draft4Validator

from ...errors import Errors
from ...util import read_json


# Cache of schemas already loaded by get_schema, keyed by schema name.
SCHEMAS = {}


def get_schema(name):
    """Load, check and cache the JSON schema stored under a given name.

    The schema is read from a file called `<name>.json` that sits in the same
    directory as this module. It is checked with Draft4Validator.check_schema
    and then stored in SCHEMAS, so later calls with the same name return the
    cached schema without reading the file again.

    EXAMPLE:
        >>> schema = get_schema('training')

    name (unicode): The name of the schema.
    RETURNS (dict): The JSON schema.
    RAISES (ValueError): If no matching .json file exists (Errors.E102).
    """
    # Fast path: the schema was already loaded and checked earlier.
    if name in SCHEMAS:
        return SCHEMAS[name]
    schema_dir = Path(__file__).parent
    schema_file = schema_dir / '{}.json'.format(name)
    if not schema_file.exists():
        raise ValueError(Errors.E102.format(name=name))
    loaded = read_json(schema_file)
    # TODO: replace with (stable) Draft6Validator, if available
    Draft4Validator(loaded).check_schema(loaded)
    SCHEMAS[name] = loaded
    return loaded


def validate_json(data, schema):
    """Check data against a JSON schema (see https://json-schema.org) and
    collect the problems as human-readable strings.

    Errors are reported in the order of their path within the data. Each
    entry is the validator's message followed by a space and, if the error
    has a path, that path rendered as '[part -> part -> ...]'.

    data: JSON-serializable data to validate.
    schema (dict): The JSON schema.
    RETURNS (list): A list of error messages, if available.
    """
    problems = list(Draft4Validator(schema).iter_errors(data))
    problems.sort(key=lambda problem: problem.path)
    messages = []
    for problem in problems:
        location = ''
        if problem.path:
            parts = [str(part) for part in problem.path]
            location = '[{}]'.format(' -> '.join(parts))
        messages.append(problem.message + ' ' + location)
    return messages
128 changes: 128 additions & 0 deletions spacy/cli/schemas/meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
{
"$schema": "http://json-schema.org/draft-06/schema",
"type": "object",
"properties": {
"lang": {
"title": "Two-letter language code, e.g. 'en'",
"type": "string",
"minLength": 2,
"maxLength": 2,
"pattern": "^[a-z]*$"
},
"name": {
"title": "Model name",
"type": "string",
"minLength": 1,
"pattern": "^[a-z_]*$"
},
"version": {
"title": "Model version",
"type": "string",
"minLength": 1,
"pattern": "^[0-9a-z.-]*$"
},
"spacy_version": {
"title": "Compatible spaCy version identifier",
"type": "string",
"minLength": 1,
"pattern": "^[0-9a-z.><=-]*$"
},
"parent_package": {
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly",
"type": "string",
"minLength": 1,
"default": "spacy"
},
"pipeline": {
"title": "Names of pipeline components",
"type": "array",
"items": {
"type": "string",
"minLength": 1
}
},
"description": {
"title": "Model description",
"type": "string"
},
"license": {
"title": "Model license",
"type": "string"
},
"author": {
"title": "Model author name",
"type": "string"
},
"email": {
"title": "Model author email",
"type": "string",
"format": "email"
},
"url": {
"title": "Model author URL",
"type": "string",
"format": "uri"
},
"sources": {
"title": "Training data sources",
"type": "array",
"items": {
"type": "string"
}
},
"vectors": {
"title": "Included word vectors",
"type": "object",
"properties": {
"keys": {
"title": "Number of unique keys",
"type": "integer",
"minimum": 0
},
"vectors": {
"title": "Number of unique vectors",
"type": "integer",
"minimum": 0
},
"width": {
"title": "Number of dimensions",
"type": "integer",
"minimum": 0
}
}
},
"accuracy": {
"title": "Accuracy numbers",
"type": "object",
"patternProperties": {
".*": {
"type": "number",
"minimum": 0.0
}
}
},
"speed": {
"title": "Speed evaluation numbers",
"type": "object",
"patternProperties": {
".*": {
"anyOf": [
{
"type": "number",
"minimum": 0.0
},
{
"type": "integer",
"minimum": 0
}
]
}
}
}
},
"required": [
"lang",
"name",
"version"
]
}
146 changes: 146 additions & 0 deletions spacy/cli/schemas/training.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
{
"$schema": "http://json-schema.org/draft-06/schema",
"title": "Training data for spaCy models",
"type": "array",
"items": {
"type": "object",
"properties": {
"text": {
"title": "The text of the training example",
"type": "string",
"minLength": 1
},
"ents": {
"title": "Named entity spans in the text",
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {
"title": "Start character offset of the span",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the span",
"type": "integer",
"minimum": 0
},
"label": {
"title": "Entity label",
"type": "string",
"minLength": 1,
"pattern": "^[A-Z0-9]*$"
}
},
"required": [
"start",
"end",
"label"
]
}
},
"sents": {
"title": "Sentence spans in the text",
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {
"title": "Start character offset of the span",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the span",
"type": "integer",
"minimum": 0
}
},
"required": [
"start",
"end"
]
}
},
"cats": {
"title": "Text categories for the text classifier",
"type": "object",
"patternProperties": {
".*": {
"title": "A text category",
"oneOf": [
{
"type": "boolean"
},
{
"type": "number",
"minimum": 0
}
]
}
},
"propertyNames": {
"pattern": "^[A-Z0-9]*$",
"minLength": 1
}
},
"tokens": {
"title": "The tokens in the text",
"type": "array",
"items": {
"type": "object",
"minProperties": 1,
"properties": {
"id": {
"title": "Token ID, usually token index",
"type": "integer",
"minimum": 0
},
"start": {
"title": "Start character offset of the token",
"type": "integer",
"minimum": 0
},
"end": {
"title": "End character offset of the token",
"type": "integer",
"minimum": 0
},
"pos": {
"title": "Coarse-grained part-of-speech tag",
"type": "string",
"minLength": 1
},
"tag": {
"title": "Fine-grained part-of-speech tag",
"type": "string",
"minLength": 1
},
"dep": {
"title": "Dependency label",
"type": "string",
"minLength": 1
},
"head": {
"title": "Index of the token's head",
"type": "integer",
"minimum": 0
}
},
"required": [
"start",
"end"
]
}
},
"_": {
"title": "Custom user space",
"type": "object"
}
},
"required": [
"text"
]
}
}
1 change: 1 addition & 0 deletions spacy/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ class Errors(object):
"NBOR_RELOP.")
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
"have been declared in previous edges.")
E102 = ("Can't find JSON schema for '{name}'.")


@add_codes
Expand Down
38 changes: 38 additions & 0 deletions spacy/tests/test_json_schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# coding: utf-8
from __future__ import unicode_literals

from spacy.cli.schemas import validate_json, get_schema
import pytest


@pytest.fixture(scope='session')
def training_schema():
    """Provide the 'training' JSON schema, requested once per test session."""
    schema = get_schema('training')
    return schema


def test_json_schema_get():
    """A known schema name yields a non-empty schema; an unknown name raises
    ValueError."""
    assert get_schema('training')
    with pytest.raises(ValueError):
        get_schema('xxx')


@pytest.mark.parametrize('data', [
    {'text': 'Hello world'},
    {
        'text': 'Hello',
        'ents': [{'start': 0, 'end': 5, 'label': 'TEST'}],
    },
])
def test_json_schema_training_valid(data, training_schema):
    """Well-formed training examples produce no validation errors."""
    # The schema describes a list of examples, so wrap the single example.
    assert not validate_json([data], training_schema)


@pytest.mark.parametrize('data,n_errors', [
    # No 'text' key at all.
    ({'spans': []}, 1),
    # Entity offsets given as strings instead of integers.
    (
        {'text': 'Hello',
         'ents': [{'start': '0', 'end': '5', 'label': 'TEST'}]},
        2,
    ),
    # Entity without a label.
    ({'text': 'Hello', 'ents': [{'start': 0, 'end': 5}]}, 1),
    # Lowercase entity label.
    (
        {'text': 'Hello',
         'ents': [{'start': 0, 'end': 5, 'label': 'test'}]},
        1,
    ),
    # Token without 'start' and 'end'.
    ({'text': 'spaCy', 'tokens': [{'pos': 'PROPN'}]}, 2),
])
def test_json_schema_training_invalid(data, n_errors, training_schema):
    """Each malformed training example yields the expected number of error
    messages."""
    # The schema describes a list of examples, so wrap the single example.
    found = validate_json([data], training_schema)
    assert len(found) == n_errors

0 comments on commit db2f29b

Please sign in to comment.