Refactored PR #949 and edited the comments/docstrings #952

Merged
merged 9 commits on Jul 27, 2021
4 changes: 3 additions & 1 deletion examples/standalone/json_parser_main.py
@@ -10,7 +10,9 @@

import sys

from json_parser import Lark_StandAlone, Transformer, inline_args
from json_parser import Lark_StandAlone, Transformer, v_args
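# inline_args is no longer exported by the standalone parser; v_args(inline=True) builds the equivalent decorator below.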

inline_args = v_args(inline=True)

class TreeToJson(Transformer):
@inline_args
135 changes: 123 additions & 12 deletions lark/load_grammar.py
@@ -9,7 +9,7 @@
from ast import literal_eval
from numbers import Integral

from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique
from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors
from .lexer import Token, TerminalDef, PatternStr, PatternRE

from .parse_tree_builder import ParseTreeBuilder
@@ -175,27 +175,136 @@
}


# Value 5 keeps the number of states in the lalr parser somewhat minimal
# It isn't optimal, but close to it. See PR #949
SMALL_FACTOR_THRESHOLD = 5
# The threshold above which repetitions via ~ are split up into different rules.
# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low,
# while not being too aggressive and unnecessarily creating rules that might cause shift/reduce conflicts.
# (See PR #949)
REPEAT_BREAK_THRESHOLD = 50
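# For example, "A"~60 exceeds this threshold and is broken into sub-rules,
# while "A"~40 is still expanded naively (see _generate_repeats below).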


@inline_args
class EBNF_to_BNF(Transformer_InPlace):
def __init__(self):
self.new_rules = []
self.rules_by_expr = {}
self.rules_cache = {}
self.prefix = 'anon'
self.i = 0
self.rule_options = None

def _add_recurse_rule(self, type_, expr):
if expr in self.rules_by_expr:
return self.rules_by_expr[expr]

new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
def _name_rule(self, inner):
new_name = '__%s_%s_%d' % (self.prefix, inner, self.i)
self.i += 1
t = NonTerminal(new_name)
tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
self.new_rules.append((new_name, tree, self.rule_options))
self.rules_by_expr[expr] = t
return new_name

def _add_rule(self, key, name, expansions):
t = NonTerminal(name)
self.new_rules.append((name, expansions, self.rule_options))
self.rules_cache[key] = t
return t

def _add_recurse_rule(self, type_, expr):
try:
return self.rules_cache[expr]
except KeyError:
new_name = self._name_rule(type_)
t = NonTerminal(new_name)
tree = ST('expansions', [
ST('expansion', [expr]),
ST('expansion', [t, expr])
])
return self._add_rule(expr, new_name, tree)

def _add_repeat_rule(self, a, b, target, atom):
"""Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times.

When called recursively (into target), it matches atom x(n) times, where:
x(0) = 1
x(n) = a(n) * x(n-1) + b

Example rule when a=3, b=4:

new_rule: target target target atom atom atom atom

"""
key = (a, b, target, atom)
try:
return self.rules_cache[key]
except KeyError:
new_name = self._name_rule('repeat_a%d_b%d' % (a, b))
tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)])
return self._add_rule(key, new_name, tree)

def _add_repeat_opt_rule(self, a, b, target, target_opt, atom):
"""Creates a rule that matches atom 0 to (a*n+b)-1 times.

When target matches n times atom, and target_opt 0 to n-1 times target_opt,

First we generate target * i followed by target_opt, for i from 0 to a-1
These match 0 to n*a - 1 times atom

Then we generate target * a followed by atom * i, for i from 0 to b-1
These match n*a to n*a + b-1 times atom

The created rule will not have any shift/reduce conflicts so that it can be used with lalr

Example rule when a=3, b=4:

new_rule: target_opt
| target target_opt
| target target target_opt

| target target target
| target target target atom
| target target target atom atom
| target target target atom atom atom

"""
key = (a, b, target, atom, "opt")
try:
return self.rules_cache[key]
except KeyError:
new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b))
tree = ST('expansions', [
ST('expansion', [target]*i + [target_opt]) for i in range(a)
] + [
ST('expansion', [target]*a + [atom]*i) for i in range(b)
])
return self._add_rule(key, new_name, tree)

def _generate_repeats(self, rule, mn, mx):
"""Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times.
"""
# For a small number of repeats, we can take the naive approach
if mx < REPEAT_BREAK_THRESHOLD:
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])

# For large repeat values, we break the repetition into sub-rules.
# We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``.
# We then use small_factors to split mn and diff into values [(a, b), ...].
# These values are used with the help of _add_repeat_rule and _add_repeat_opt_rule
# to generate a complete rule/expression that matches the corresponding number of repeats.
mn_target = rule
for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD):
mn_target = self._add_repeat_rule(a, b, mn_target, rule)
if mx == mn:
return mn_target

diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less
diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD)
diff_target = rule # Matches rule exactly once
diff_opt_target = ST('expansion', []) # Matches rule 0 times (i.e. up to 1 - 1 = 0 times)
for a, b in diff_factors[:-1]:
diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
diff_target = self._add_repeat_rule(a, b, diff_target, rule)

a, b = diff_factors[-1]
diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)

return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])])
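# Worked example (illustrative): for ``rule~60``, mn = mx = 60 >= REPEAT_BREAK_THRESHOLD,
# and small_factors(60, 5) == [(3, 0), (4, 0), (5, 0)], so mn_target is built in three steps
# matching rule 3, then 3*4=12, then 12*5=60 times, and is returned directly since mx == mn.
# For ``rule~15..100``, mn_target matches rule 15 times and diff = 100 - 15 + 1 = 86;
# the chain of _add_repeat_opt_rule calls then yields a diff_opt_target matching rule 0..85 times,
# so the final expansion ``mn_target diff_opt_target`` matches rule 15..100 times.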

def expr(self, rule, op, *args):
if op.value == '?':
empty = ST('expansion', [])
@@ -220,7 +329,9 @@ def expr(self, rule, op, *args):
mn, mx = map(int, args)
if mx < mn or mn < 0:
raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])

return self._generate_repeats(rule, mn, mx)

assert False, op

def maybe(self, rule):
2 changes: 1 addition & 1 deletion lark/parsers/lalr_interactive_parser.py
@@ -65,7 +65,7 @@ def pretty(self):
"""Print the output of ``choices()`` in a way that's easier to read."""
out = ["Parser choices:"]
for k, v in self.choices().items():
out.append('\t- %s -> %s' % (k, v))
out.append('\t- %s -> %r' % (k, v))
out.append('stack size: %s' % len(self.parser_state.state_stack))
return '\n'.join(out)

30 changes: 28 additions & 2 deletions lark/utils.py
@@ -187,7 +187,7 @@ def get_regexp_width(expr):
return 1, sre_constants.MAXREPEAT
else:
return 0, sre_constants.MAXREPEAT

###}


@@ -288,7 +288,7 @@ def combine_alternatives(lists):

class FS:
exists = os.path.exists

@staticmethod
def open(name, mode="r", **kwargs):
if atomicwrites and "w" in mode:
@@ -359,3 +359,29 @@ def _serialize(value, memo):
return {key:_serialize(elem, memo) for key, elem in value.items()}
# assert value is None or isinstance(value, (int, float, str, tuple)), value
return value




def small_factors(n, max_factor):
"""
Splits n up into smaller factors and summands <= max_factor.
Returns a list of [(a, b), ...]
so that the following code returns n:

n = 1
for a, b in values:
n = n * a + b

Currently, we also keep a + b <= max_factor, but that might change
"""
assert n >= 0
assert max_factor > 2
if n <= max_factor:
return [(n, 0)]

for a in range(max_factor, 1, -1):
r, b = divmod(n, a)
if a + b <= max_factor:
return small_factors(r, max_factor) + [(a, b)]
assert False, "Failed to factorize %s" % n
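
A minimal sketch of the invariant documented above, assuming ``small_factors`` is importable from ``lark.utils`` once this change is merged (the factor list shown in the comment is illustrative and may differ):

from lark.utils import small_factors

factors = small_factors(1000, 5)   # e.g. [(2, 0), (4, 0), (5, 0), (5, 0), (5, 0)]
n = 1
for a, b in factors:
    n = n * a + b          # rebuild n from the factor/summand pairs
assert n == 1000
assert all(a + b <= 5 for a, b in factors)   # the "a + b <= max_factor" property mentioned in the docstring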
49 changes: 48 additions & 1 deletion tests/test_grammar.py
@@ -3,7 +3,7 @@
import sys
from unittest import TestCase, main

from lark import Lark, Token, Tree
from lark import Lark, Token, Tree, ParseError, UnexpectedInput
from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors
from lark.load_grammar import FromPackageLoader

@@ -198,6 +198,53 @@ def test_find_grammar_errors(self):
x = find_grammar_errors(text)
assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6]

def test_ranged_repeat_terms(self):
g = u"""!start: AAA
AAA: "A"~3
"""
l = Lark(g, parser='lalr')
self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

g = u"""!start: AABB CC
AABB: "A"~0..2 "B"~2
CC: "C"~1..2
"""
l = Lark(g, parser='lalr')
self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

def test_ranged_repeat_large(self):
g = u"""!start: "A"~60
"""
l = Lark(g, parser='lalr')
self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated")
self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60))
self.assertRaises(ParseError, l.parse, u'A' * 59)
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61)

g = u"""!start: "A"~15..100
"""
l = Lark(g, parser='lalr')
for i in range(0, 110):
if 15 <= i <= 100:
self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i))
else:
self.assertRaises(UnexpectedInput, l.parse, u'A' * i)

# 8191 is a Mersenne prime
g = u"""start: "A"~8191
"""
l = Lark(g, parser='lalr')
self.assertEqual(l.parse(u'A' * 8191), Tree('start', []))
self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190)
self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192)


if __name__ == '__main__':
20 changes: 0 additions & 20 deletions tests/test_parser.py
@@ -2204,27 +2204,7 @@ def test_ranged_repeat_rules(self):
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')


def test_ranged_repeat_terms(self):
g = u"""!start: AAA
AAA: "A"~3
"""
l = _Lark(g)
self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"]))
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA')
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA')

g = u"""!start: AABB CC
AABB: "A"~0..2 "B"~2
CC: "C"~1..2
"""
l = _Lark(g)
self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC']))
self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C']))
self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC']))
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB')
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB')
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')

@unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX
def test_priority_vs_embedded(self):