-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
366 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# coding: utf-8 | ||
from __future__ import unicode_literals | ||
|
||
from pathlib import Path | ||
from jsonschema import Draft4Validator | ||
|
||
from ...errors import Errors | ||
from ...util import read_json | ||
|
||
|
||
SCHEMAS = {} | ||
|
||
|
||
def get_schema(name):
    """Load one of the bundled JSON schemas by name, caching the result.

    The schema is read from a file named `<name>.json` that sits next to this
    module. Before it is stored in the module-level SCHEMAS cache, the schema
    itself is checked for well-formedness. Later calls with the same name
    return the cached schema without touching the file system again.

    EXAMPLE:
        >>> schema = get_schema('training')

    name (unicode): The name of the schema.
    RETURNS (dict): The JSON schema.
    RAISES (ValueError): If no schema file exists for the given name.
    """
    # Fast path: this schema was already loaded and checked earlier.
    if name in SCHEMAS:
        return SCHEMAS[name]
    filename = '{}.json'.format(name)
    location = Path(__file__).parent / filename
    if not location.exists():
        raise ValueError(Errors.E102.format(name=name))
    loaded = read_json(location)
    # TODO: replace with (stable) Draft6Validator, if available
    # Check the schema document itself (not any data) before caching it.
    Draft4Validator.check_schema(loaded)
    SCHEMAS[name] = loaded
    return loaded
|
||
|
||
def validate_json(data, schema):
    """Check data against a JSON schema (see https://json-schema.org) and
    collect human-readable descriptions of everything that failed.

    Each entry is the validator's error message followed by a space and, if
    the error is located inside the data, the path to the offending value
    formatted as "[key -> index -> ...]". Entries are ordered by that path.

    data: JSON-serializable data to validate.
    schema (dict): The JSON schema.
    RETURNS (list): A list of error messages, empty if the data is valid.
    """
    def describe(error):
        # Errors without a path keep the separating space and an empty
        # location, so the message format is the same for every entry.
        location = ''
        if error.path:
            steps = [str(step) for step in error.path]
            location = '[{}]'.format(' -> '.join(steps))
        return error.message + ' ' + location

    problems = list(Draft4Validator(schema).iter_errors(data))
    problems.sort(key=lambda problem: problem.path)
    return [describe(problem) for problem in problems]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
{ | ||
"$schema": "http://json-schema.org/draft-06/schema", | ||
"type": "object", | ||
"properties": { | ||
"lang": { | ||
"title": "Two-letter language code, e.g. 'en'", | ||
"type": "string", | ||
"minLength": 2, | ||
"maxLength": 2, | ||
"pattern": "^[a-z]*$" | ||
}, | ||
"name": { | ||
"title": "Model name", | ||
"type": "string", | ||
"minLength": 1, | ||
"pattern": "^[a-z_]*$" | ||
}, | ||
"version": { | ||
"title": "Model version", | ||
"type": "string", | ||
"minLength": 1, | ||
"pattern": "^[0-9a-z.-]*$" | ||
}, | ||
"spacy_version": { | ||
"title": "Compatible spaCy version identifier", | ||
"type": "string", | ||
"minLength": 1, | ||
"pattern": "^[0-9a-z.<>=-]*$" | ||
}, | ||
"parent_package": { | ||
"title": "Name of parent spaCy package, e.g. spacy or spacy-nightly", | ||
"type": "string", | ||
"minLength": 1, | ||
"default": "spacy" | ||
}, | ||
"pipeline": { | ||
"title": "Names of pipeline components", | ||
"type": "array", | ||
"items": { | ||
"type": "string", | ||
"minLength": 1 | ||
} | ||
}, | ||
"description": { | ||
"title": "Model description", | ||
"type": "string" | ||
}, | ||
"license": { | ||
"title": "Model license", | ||
"type": "string" | ||
}, | ||
"author": { | ||
"title": "Model author name", | ||
"type": "string" | ||
}, | ||
"email": { | ||
"title": "Model author email", | ||
"type": "string", | ||
"format": "email" | ||
}, | ||
"url": { | ||
"title": "Model author URL", | ||
"type": "string", | ||
"format": "uri" | ||
}, | ||
"sources": { | ||
"title": "Training data sources", | ||
"type": "array", | ||
"items": { | ||
"type": "string" | ||
} | ||
}, | ||
"vectors": { | ||
"title": "Included word vectors", | ||
"type": "object", | ||
"properties": { | ||
"keys": { | ||
"title": "Number of unique keys", | ||
"type": "integer", | ||
"minimum": 0 | ||
}, | ||
"vectors": { | ||
"title": "Number of unique vectors", | ||
"type": "integer", | ||
"minimum": 0 | ||
}, | ||
"width": { | ||
"title": "Number of dimensions", | ||
"type": "integer", | ||
"minimum": 0 | ||
} | ||
} | ||
}, | ||
"accuracy": { | ||
"title": "Accuracy numbers", | ||
"type": "object", | ||
"patternProperties": { | ||
".*": { | ||
"type": "number", | ||
"minimum": 0.0 | ||
} | ||
} | ||
}, | ||
"speed": { | ||
"title": "Speed evaluation numbers", | ||
"type": "object", | ||
"patternProperties": { | ||
".*": { | ||
"oneOf": [ | ||
{ | ||
"type": "number", | ||
"minimum": 0.0 | ||
}, | ||
{ | ||
"type": "integer", | ||
"minimum": 0 | ||
} | ||
] | ||
} | ||
} | ||
} | ||
}, | ||
"required": [ | ||
"lang", | ||
"name", | ||
"version" | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
{ | ||
"$schema": "http://json-schema.org/draft-06/schema", | ||
"title": "Training data for spaCy models", | ||
"type": "array", | ||
"items": { | ||
"type": "object", | ||
"properties": { | ||
"text": { | ||
"title": "The text of the training example", | ||
"type": "string", | ||
"minLength": 1 | ||
}, | ||
"ents": { | ||
"title": "Named entity spans in the text", | ||
"type": "array", | ||
"items": { | ||
"type": "object", | ||
"properties": { | ||
"start": { | ||
"title": "Start character offset of the span", | ||
"type": "integer", | ||
"minimum": 0 | ||
}, | ||
"end": { | ||
"title": "End character offset of the span", | ||
"type": "integer", | ||
"minimum": 0 | ||
}, | ||
"label": { | ||
"title": "Entity label", | ||
"type": "string", | ||
"minLength": 1, | ||
"pattern": "^[A-Z0-9]*$" | ||
} | ||
}, | ||
"required": [ | ||
"start", | ||
"end", | ||
"label" | ||
] | ||
} | ||
}, | ||
"sents": { | ||
"title": "Sentence spans in the text", | ||
"type": "array", | ||
"items": { | ||
"type": "object", | ||
"properties": { | ||
"start": { | ||
"title": "Start character offset of the span", | ||
"type": "integer", | ||
"minimum": 0 | ||
}, | ||
"end": { | ||
"title": "End character offset of the span", | ||
"type": "integer", | ||
"minimum": 0 | ||
} | ||
}, | ||
"required": [ | ||
"start", | ||
"end" | ||
] | ||
} | ||
}, | ||
"cats": { | ||
"title": "Text categories for the text classifier", | ||
"type": "object", | ||
"patternProperties": { | ||
".*": { | ||
"title": "A text category", | ||
"oneOf": [ | ||
{ | ||
"type": "boolean" | ||
}, | ||
{ | ||
"type": "number", | ||
"minimum": 0 | ||
} | ||
] | ||
} | ||
}, | ||
"propertyNames": { | ||
"pattern": "^[A-Z0-9]*$", | ||
"minLength": 1 | ||
} | ||
}, | ||
"tokens": { | ||
"title": "The tokens in the text", | ||
"type": "array", | ||
"items": { | ||
"type": "object", | ||
"minProperties": 1, | ||
"properties": { | ||
"id": { | ||
"title": "Token ID, usually token index", | ||
"type": "integer", | ||
"minimum": 0 | ||
}, | ||
"start": { | ||
"title": "Start character offset of the token", | ||
"type": "integer", | ||
"minimum": 0 | ||
}, | ||
"end": { | ||
"title": "End character offset of the token", | ||
"type": "integer", | ||
"minimum": 0 | ||
}, | ||
"pos": { | ||
"title": "Coarse-grained part-of-speech tag", | ||
"type": "string", | ||
"minLength": 1 | ||
}, | ||
"tag": { | ||
"title": "Fine-grained part-of-speech tag", | ||
"type": "string", | ||
"minLength": 1 | ||
}, | ||
"dep": { | ||
"title": "Dependency label", | ||
"type": "string", | ||
"minLength": 1 | ||
}, | ||
"head": { | ||
"title": "Index of the token's head", | ||
"type": "integer", | ||
"minimum": 0 | ||
} | ||
}, | ||
"required": [ | ||
"start", | ||
"end" | ||
] | ||
} | ||
}, | ||
"_": { | ||
"title": "Custom user space", | ||
"type": "object" | ||
} | ||
}, | ||
"required": [ | ||
"text" | ||
] | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# coding: utf-8 | ||
from __future__ import unicode_literals | ||
|
||
from spacy.cli.schemas import validate_json, get_schema | ||
import pytest | ||
|
||
|
||
@pytest.fixture(scope='session')
def training_schema():
    """Session-scoped fixture that provides the 'training' JSON schema."""
    schema = get_schema('training')
    return schema
|
||
|
||
def test_json_schema_get():
    """A known schema name yields a non-empty schema, while an unknown name
    raises a ValueError."""
    assert get_schema('training')
    with pytest.raises(ValueError):
        get_schema('xxx')
|
||
|
||
@pytest.mark.parametrize('data', [
    # Only the required "text" key.
    {'text': 'Hello world'},
    # Text plus one fully specified entity span.
    {'text': 'Hello', 'ents': [{'start': 0, 'end': 5, 'label': 'TEST'}]}
])
def test_json_schema_training_valid(data, training_schema):
    """Well-formed training examples produce no validation errors."""
    # The training schema describes an array, so wrap the single example.
    assert not validate_json([data], training_schema)
|
||
|
||
@pytest.mark.parametrize('data,n_errors', [
    # Required "text" key is missing.
    ({'spans': []}, 1),
    # "start" and "end" are strings instead of integers.
    ({'text': 'Hello', 'ents': [{'start': '0', 'end': '5', 'label': 'TEST'}]}, 2),
    # Entity span lacks the required "label".
    ({'text': 'Hello', 'ents': [{'start': 0, 'end': 5}]}, 1),
    # Lowercase label does not match the label pattern.
    ({'text': 'Hello', 'ents': [{'start': 0, 'end': 5, 'label': 'test'}]}, 1),
    # Token lacks the required "start" and "end".
    ({'text': 'spaCy', 'tokens': [{'pos': 'PROPN'}]}, 2)
])
def test_json_schema_training_invalid(data, n_errors, training_schema):
    """Malformed training examples report the expected number of errors."""
    # The training schema describes an array, so wrap the single example.
    reported = validate_json([data], training_schema)
    assert len(reported) == n_errors