Accelerate block tag iteration (#205)

* Accelerate block tag iteration. * Add changelog entry * Fix formatting.
dbt-labs · Oct 15, 2024 · 1bad6b7 · 1bad6b7
1 parent ed11c6c
commit 1bad6b7
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 2 deletions.
diff --git a/.changes/unreleased/Features-20241015-174841.yaml b/.changes/unreleased/Features-20241015-174841.yaml
@@ -0,0 +1,6 @@
+kind: Features
+body: Accelerate block tag iteration.
+time: 2024-10-15T17:48:41.299686-04:00
+custom:
+ Author: peterallenwebb
+ Issue: "205"
diff --git a/dbt_common/clients/_jinja_blocks.py b/dbt_common/clients/_jinja_blocks.py
@@ -1,6 +1,7 @@
+import dataclasses
 import re
 from collections import namedtuple
-from typing import Iterator, List, Optional, Set, Union
+from typing import Dict, Iterator, List, Optional, Set, Union
 
 from dbt_common.exceptions import (
  BlockDefinitionNotAtTopError,
@@ -104,11 +105,25 @@ def end_pat(self) -> re.Pattern:
 QUOTE_START_PATTERN = regex(r"""(?P<quote>(['"]))""")
 
 
+@dataclasses.dataclass
+class PositionedMatch:
+ """This class is used to cache search information, accelerating TagIterator.
+ It records the position the search started from (start_pos) and the first
+ match found from that position (match), or None if there was no match."""
+
+ start_pos: int
+ match: Optional[re.Match]
+
+
 class TagIterator:
  def __init__(self, text: str) -> None:
  self.text: str = text
  self.pos: int = 0
 
+ # A cache of the most recent search result for each pattern, maintained
+ # in order to avoid slowly re-searching long inputs many times.
+ self._past_matches: Dict[re.Pattern, PositionedMatch] = {}
+
  def linepos(self, end: Optional[int] = None) -> str:
  """Return relative position in line.
 
@@ -130,7 +145,31 @@ def rewind(self, amount: int = 1) -> None:
  self.pos -= amount
 
  def _search(self, pattern: re.Pattern) -> Optional[re.Match]:
- return pattern.search(self.text, self.pos)
+ # Check to see if we have cached a search for this pattern already.
+ positioned_match = self._past_matches.get(pattern)
+
+ if positioned_match is None or positioned_match.start_pos > self.pos:
+ # Either there is no cached search for this pattern, or the cached search began
+ # past the current position (e.g. after a rewind) and can't be reused. Search and cache it.
+ match = pattern.search(self.text, self.pos)
+ self._past_matches[pattern] = PositionedMatch(self.pos, match)
+ else:
+ # We have a cached search and its start position falls before (or at) the
+ # current search position...
+ if positioned_match.match is None:
+ # ...but there is no match in the rest of the text.
+ match = None
+ elif positioned_match.match.start() >= self.pos:
+ # ...and there is a match we can reuse, because we have not yet passed
+ # the start position of the match. It's still the next match.
+ match = positioned_match.match
+ else:
+ # ...but we have passed the start of the cached match, and need to do a
+ # new search from our current position and cache it.
+ match = pattern.search(self.text, self.pos)
+ self._past_matches[pattern] = PositionedMatch(self.pos, match)
+
+ return match
 
  def _match(self, pattern: re.Pattern) -> Optional[re.Match]:
  return pattern.match(self.text, self.pos)