Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge .pyi files #926

Merged
merged 9 commits into from
Aug 17, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions lark/common.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from types import ModuleType

from .utils import Serialize
from .lexer import TerminalDef, Token

###{standalone
from types import ModuleType
from typing import Any, Callable, Collection, Dict, Optional, TYPE_CHECKING

if TYPE_CHECKING:
Expand All @@ -17,13 +18,13 @@ class LexerConf(Serialize):
terminals: Collection[TerminalDef]
re_module: ModuleType
ignore: Collection[str] = ()
postlex: 'PostLex' = None
callbacks: Optional[Dict[str, _Callback]] = None
postlex: 'Optional[PostLex]' = None
callbacks: Dict[str, _Callback] = {}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need to provide values when annotating attributes on classes?

g_regex_flags: int = 0
skip_validation: bool = False
use_bytes: bool = False

def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'PostLex'=None, callbacks: Optional[Dict[str, _Callback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False):
def __init__(self, terminals: Collection[TerminalDef], re_module: ModuleType, ignore: Collection[str]=(), postlex: 'Optional[PostLex]'=None, callbacks: Optional[Dict[str, _Callback]]=None, g_regex_flags: int=0, skip_validation: bool=False, use_bytes: bool=False):
self.terminals = terminals
self.terminals_by_name = {t.name: t for t in self.terminals}
assert len(self.terminals) == len(self.terminals_by_name)
Expand Down
4 changes: 2 additions & 2 deletions lark/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

###{standalone

from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, TYPE_CHECKING
from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set, Optional, TYPE_CHECKING

if TYPE_CHECKING:
from .lexer import Token
Expand Down Expand Up @@ -73,7 +73,7 @@ def get_context(self, text: str, span: int=40) -> str:
after = text[pos:end].split(b'\n', 1)[0]
return (before + after + b'\n' + b' ' * len(before.expandtabs()) + b'^\n').decode("ascii", "backslashreplace")

def match_examples(self, parse_fn: 'Callable[[str], Tree]', examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], token_type_match_fallback: bool=False, use_accepts: bool=False) -> T:
def match_examples(self, parse_fn: 'Callable[[str], Tree]', examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], token_type_match_fallback: bool=False, use_accepts: bool=False) -> Optional[T]:
"""Allows you to detect what's wrong in the input text by matching
against example errors.

Expand Down
8 changes: 4 additions & 4 deletions lark/indenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ class DedentError(LarkError):

class Indenter(PostLex, ABC):

paren_level: Optional[int]
indent_level: Optional[List[int]]
paren_level: int
indent_level: List[int]

def __init__(self) -> None:
self.paren_level = None
self.indent_level = None
self.paren_level = 0
self.indent_level = [0]
assert self.tab_len > 0

def handle_NL(self, token: Token) -> Iterator[Token]:
Expand Down
9 changes: 5 additions & 4 deletions lark/lark.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

import re
try:
import regex
import regex # type: ignore
except ImportError:
regex = None

Expand Down Expand Up @@ -149,7 +149,7 @@ class LarkOptions(Serialize):
# - As an attribute of `LarkOptions` above
# - Potentially in `_LOAD_ALLOWED_OPTIONS` below this class, when the option doesn't change how the grammar is loaded
# - Potentially in `lark.tools.__init__`, if it makes sense, and it can easily be passed as a cmd argument
_defaults = {
_defaults: Dict[str, Any] = {
'debug': False,
'keep_all_tokens': False,
'tree_class': None,
Expand Down Expand Up @@ -414,6 +414,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
if cache_fn:
logger.debug('Saving grammar to cache: %s', cache_fn)
with FS.open(cache_fn, 'wb') as f:
assert cache_md5 is not None
f.write(cache_md5.encode('utf8') + b'\n')
pickle.dump(used_files, f)
self.save(f)
Expand Down Expand Up @@ -574,7 +575,7 @@ def get_terminal(self, name: str) -> TerminalDef:
"""Get information about a terminal"""
return self._terminals_dict[name]

def parse_interactive(self, text: str=None, start: Optional[str]=None) -> 'InteractiveParser':
def parse_interactive(self, text: Optional[str]=None, start: Optional[str]=None) -> 'InteractiveParser':
"""Start an interactive parsing session.

Parameters:
Expand All @@ -588,7 +589,7 @@ def parse_interactive(self, text: str=None, start: Optional[str]=None) -> 'Inter
"""
return self.parser.parse_interactive(text, start=start)

def parse(self, text: str, start: Optional[str]=None, on_error: 'Callable[[UnexpectedInput], bool]'=None) -> Tree:
def parse(self, text: str, start: Optional[str]=None, on_error: 'Optional[Callable[[UnexpectedInput], bool]]'=None) -> Tree:
"""Parse the given text, according to the options provided.

Parameters:
Expand Down
61 changes: 34 additions & 27 deletions lark/lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ class Pattern(Serialize, ABC):

value: str
flags: Collection[str]
raw: str = None
type: str = None
raw: Optional[str] = None
type: Optional[str] = None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not optional. It is also more of a `ClassVar`.


def __init__(self, value: str, flags: Collection[str]=(), raw: str=None) -> None:
def __init__(self, value: str, flags: Collection[str]=(), raw: Optional[str]=None) -> None:
self.value = value
self.flags = frozenset(flags)
self.raw = raw
Expand Down Expand Up @@ -81,7 +81,10 @@ def to_regexp(self) -> str:
@property
def min_width(self) -> int:
return len(self.value)
max_width = min_width

@property
def max_width(self) -> int:
return len(self.value)


class PatternRE(Pattern):
Expand Down Expand Up @@ -320,15 +323,36 @@ def _regexp_has_newline(r):
"""
return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r)


class LexerState(object):
    """Snapshot of the lexer's progress through a single input text.

    Holds the text being lexed, the current line/position counter, and the
    most recently produced token. Instances are copied (via ``__copy__``)
    to support backtracking/interactive parsing without re-lexing.
    """
    __slots__ = 'text', 'line_ctr', 'last_token'

    def __init__(self, text, line_ctr, last_token=None):
        self.text = text
        self.line_ctr = line_ctr
        self.last_token = last_token

    def __eq__(self, other):
        if not isinstance(other, LexerState):
            return NotImplemented
        # NOTE: `text` is compared by identity, not value — states are only
        # considered equal when lexing the very same text object.
        same_text = self.text is other.text
        same_pos = self.line_ctr == other.line_ctr
        same_tok = self.last_token == other.last_token
        return same_text and same_pos and same_tok

    def __copy__(self):
        # Share the text, but duplicate the mutable line counter so the
        # copy can advance independently.
        return type(self)(self.text, copy(self.line_ctr), self.last_token)


_Callback = Callable[[Token], Token]

class Lexer(ABC):
"""Lexer interface

Method Signatures:
lex(self, text) -> Iterator[Token]
lex(self, lexer_state, parser_state) -> Iterator[Token]
"""
lex: Callable[..., Iterator[Token]] = NotImplemented
@abstractmethod
def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
...

def make_lexer_state(self, text):
line_ctr = LineCounter(b'\n' if isinstance(text, bytes) else '\n')
Expand Down Expand Up @@ -394,6 +418,7 @@ def _build(self) -> None:
def mres(self) -> List[Tuple[REPattern, Dict[int, str]]]:
if self._mres is None:
self._build()
assert self._mres is not None
return self._mres

def match(self, text: str, pos: int) -> Optional[Tuple[str, str]]:
Expand All @@ -402,12 +427,12 @@ def match(self, text: str, pos: int) -> Optional[Tuple[str, str]]:
if m:
return m.group(0), type_from_index[m.lastindex]

def lex(self, state: Any, parser_state: Any) -> Iterator[Token]:
def lex(self, state: LexerState, parser_state: Any) -> Iterator[Token]:
with suppress(EOFError):
while True:
yield self.next_token(state, parser_state)

def next_token(self, lex_state: Any, parser_state: Any=None) -> Token:
def next_token(self, lex_state: LexerState, parser_state: Any=None) -> Token:
line_ctr = lex_state.line_ctr
while line_ctr.char_pos < len(lex_state.text):
res = self.match(lex_state.text, line_ctr.char_pos)
Expand Down Expand Up @@ -443,24 +468,6 @@ def next_token(self, lex_state: Any, parser_state: Any=None) -> Token:
raise EOFError(self)


class LexerState(object):
    """Snapshot of the lexer's progress through a single input text.

    Carries the input text, the current line/position counter, and the last
    token emitted, so lexing can be paused, copied, and resumed.
    """
    __slots__ = 'text', 'line_ctr', 'last_token'

    def __init__(self, text, line_ctr, last_token=None):
        self.text = text
        self.line_ctr = line_ctr
        self.last_token = last_token

    def __eq__(self, other):
        if not isinstance(other, LexerState):
            return NotImplemented

        # `text` is compared by identity (`is`), deliberately: two states are
        # equal only when they refer to the same text object.
        return self.text is other.text and self.line_ctr == other.line_ctr and self.last_token == other.last_token

    def __copy__(self):
        # Share the text but copy the mutable line counter so the clone can
        # advance independently of the original.
        return type(self)(self.text, copy(self.line_ctr), self.last_token)


class ContextualLexer(Lexer):

lexers: Dict[str, TraditionalLexer]
Expand Down Expand Up @@ -494,7 +501,7 @@ def __init__(self, conf: 'LexerConf', states: Dict[str, Collection[str]], always
def make_lexer_state(self, text):
return self.root_lexer.make_lexer_state(text)

def lex(self, lexer_state: Any, parser_state: Any) -> Iterator[Token]:
def lex(self, lexer_state: LexerState, parser_state: Any) -> Iterator[Token]:
try:
while True:
lexer = self.lexers[parser_state.position]
Expand Down
2 changes: 1 addition & 1 deletion lark/parser_frontends.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from .tree import Tree
from .common import LexerConf, ParserConf
try:
import regex
import regex # type: ignore
except ImportError:
regex = None
import re
Expand Down
7 changes: 4 additions & 3 deletions lark/tree.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
try:
from future_builtins import filter
from future_builtins import filter # type: ignore
except ImportError:
pass

import sys
from copy import deepcopy


Expand Down Expand Up @@ -49,7 +50,7 @@ class Tree(object):
data: str
children: 'List[Union[str, Tree]]'

def __init__(self, data: str, children: 'List[Union[str, Tree]]', meta: Meta=None) -> None:
def __init__(self, data: str, children: 'List[Union[str, Tree]]', meta: Optional[Meta]=None) -> None:
self.data = data
self.children = children
self._meta = meta
Expand Down Expand Up @@ -196,7 +197,7 @@ def pydot__tree_to_graph(tree, rankdir="LR", **kwargs):
possible attributes, see https://www.graphviz.org/doc/info/attrs.html.
"""

import pydot
import pydot # type: ignore
graph = pydot.Dot(graph_type='digraph', rankdir=rankdir, **kwargs)

i = [0]
Expand Down
2 changes: 1 addition & 1 deletion lark/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def smart_decorator(f, create_decorator):


try:
import regex
import regex # type: ignore
except ImportError:
regex = None

Expand Down