Skip to content

Commit

Permalink
facilitate larger training files (#4827)
Browse files Browse the repository at this point in the history
* add a warning for large training files and change the `start` variable from int to long

* declare a C type (long) for file_length
  • Loading branch information
svlandeg authored and ines committed Dec 21, 2019
1 parent cb4145a commit 732142b
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 3 deletions.
4 changes: 4 additions & 0 deletions spacy/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ class Warnings(object):
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
"previous components in the pipeline declare that they assign it.")
W026 = ("Unable to set all sentence boundaries from dependency parses.")
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
"smaller JSON files instead.")



@add_codes
Expand Down
10 changes: 7 additions & 3 deletions spacy/gold.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import srsly

from .syntax import nonproj
from .tokens import Doc, Span
from .errors import Errors, AlignmentError
from .errors import Errors, AlignmentError, user_warning, Warnings
from .compat import path2str
from . import util
from .util import minibatch, itershuffle
Expand Down Expand Up @@ -557,20 +557,24 @@ def _json_iterate(loc):
loc = util.ensure_path(loc)
with loc.open("rb") as file_:
py_raw = file_.read()
cdef long file_length = len(py_raw)
if file_length > 2 ** 30:
user_warning(Warnings.W027.format(size=file_length))

raw = <char*>py_raw
cdef int square_depth = 0
cdef int curly_depth = 0
cdef int inside_string = 0
cdef int escape = 0
cdef int start = -1
cdef long start = -1
cdef char c
cdef char quote = ord('"')
cdef char backslash = ord("\\")
cdef char open_square = ord("[")
cdef char close_square = ord("]")
cdef char open_curly = ord("{")
cdef char close_curly = ord("}")
for i in range(len(py_raw)):
for i in range(file_length):
c = raw[i]
if escape:
escape = False
Expand Down

0 comments on commit 732142b

Please sign in to comment.