Skip to content

Commit

Permalink
Preload all project files at start of parsing [#3244]
Browse files Browse the repository at this point in the history
  • Loading branch information
gshank committed Apr 12, 2021
1 parent 749f873 commit dc6350f
Show file tree
Hide file tree
Showing 23 changed files with 381 additions and 310 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
- General development environment clean up and improve experience running tests locally ([#3194](https://github.com/fishtown-analytics/dbt/issues/3194), [#3204](https://github.com/fishtown-analytics/dbt/pull/3204))
- Add a new materialization for tests, update data tests to use test materialization when executing. ([#3154](https://github.com/fishtown-analytics/dbt/issues/3154), [#3181](https://github.com/fishtown-analytics/dbt/pull/3181))
- Switch from externally storing parsing state in ParseResult object to using Manifest ([#3163](http://github.com/fishtown-analytics/dbt/issues/3163), [#3219](https://github.com/fishtown-analytics/dbt/pull/3219))
- Switch from loading project files in separate parsers to loading in one place ([#3244](http://github.com/fishtown-analytics/dbt/issues/3244), [#3248](https://github.com/fishtown-analytics/dbt/pull/3248))

Contributors:
- [@yu-iskw](https://github.com/yu-iskw) ([#2928](https://github.com/fishtown-analytics/dbt/pull/2928))
Expand Down
18 changes: 17 additions & 1 deletion core/dbt/contracts/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from dataclasses import dataclass, field
from typing import List, Optional, Union

from dbt.dataclass_schema import dbtClassMixin
from dbt.dataclass_schema import dbtClassMixin, StrEnum

from dbt.exceptions import InternalException

Expand All @@ -14,6 +14,18 @@
MAXIMUM_SEED_SIZE_NAME = '1MB'


# String-valued enumeration of the kinds of project file that parsing
# distinguishes. SourceFile stores one of these in its `parse_file_type`
# field to record which parser will process the file.
class ParseFileType(StrEnum):
Macro = 'macro'
Model = 'model'
Snapshot = 'snapshot'
Analysis = 'analysis'
Test = 'test'
Seed = 'seed'
# NOTE: the string value is 'docs', not 'documentation'.
Documentation = 'docs'
Schema = 'schema'
Hook = 'hook'


@dataclass
class FilePath(dbtClassMixin):
searched_path: str
Expand Down Expand Up @@ -114,6 +126,10 @@ class SourceFile(dbtClassMixin):
"""Define a source file in dbt"""
path: Union[FilePath, RemoteFile] # the path information
checksum: FileHash
# Seems like knowing which project the file came from would be useful
project_name: Optional[str] = None
# Parse file type: i.e. which parser will process this file
parse_file_type: Optional[ParseFileType] = None
# we don't want to serialize this
_contents: Optional[str] = None
# the unique IDs contained in this file
Expand Down
9 changes: 5 additions & 4 deletions core/dbt/contracts/graph/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from dbt.contracts.util import (
BaseArtifactMetadata, MacroKey, SourceKey, ArtifactMixin, schema_version
)
from dbt.dataclass_schema import dbtClassMixin
from dbt.exceptions import (
InternalException, CompilationException,
raise_duplicate_resource_name, raise_compiler_error, warn_or_error,
Expand Down Expand Up @@ -512,7 +513,7 @@ def _find_macros_by_name(


@dataclass
class ManifestStateCheck():
class ManifestStateCheck(dbtClassMixin):
vars_hash: FileHash
profile_hash: FileHash
project_hashes: MutableMapping[str, FileHash]
Expand Down Expand Up @@ -775,10 +776,11 @@ def deepcopy(self):
macros={k: _deepcopy(v) for k, v in self.macros.items()},
docs={k: _deepcopy(v) for k, v in self.docs.items()},
exposures={k: _deepcopy(v) for k, v in self.exposures.items()},
selectors=self.root_project.manifest_selectors,
selectors={k: _deepcopy(v) for k, v in self.selectors.items()},
metadata=self.metadata,
disabled=[_deepcopy(n) for n in self.disabled],
files={k: _deepcopy(v) for k, v in self.files.items()},
state_check=_deepcopy(self.state_check),
)

def writable_manifest(self):
Expand Down Expand Up @@ -1195,9 +1197,8 @@ def __reduce_ex__(self, protocol):


class MacroManifest(MacroMethods):
def __init__(self, macros, files):
def __init__(self, macros):
self.macros = macros
self.files = files
self.metadata = ManifestMetadata()
# This is returned by the 'graph' context property
# in the ProviderContext class.
Expand Down
7 changes: 1 addition & 6 deletions core/dbt/parser/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,10 @@
from dbt.contracts.graph.parsed import ParsedAnalysisNode
from dbt.node_types import NodeType
from dbt.parser.base import SimpleSQLParser
from dbt.parser.search import FilesystemSearcher, FileBlock
from dbt.parser.search import FileBlock


class AnalysisParser(SimpleSQLParser[ParsedAnalysisNode]):
def get_paths(self):
return FilesystemSearcher(
self.project, self.project.analysis_paths, '.sql'
)

def parse_from_dict(self, dct, validate=True) -> ParsedAnalysisNode:
if validate:
ParsedAnalysisNode.validate(dct)
Expand Down
32 changes: 1 addition & 31 deletions core/dbt/parser/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
import itertools
import os
from typing import (
List, Dict, Any, Iterable, Generic, TypeVar
List, Dict, Any, Generic, TypeVar
)

from dbt.dataclass_schema import ValidationError

from dbt import utils
from dbt.clients.jinja import MacroGenerator
from dbt.clients.system import load_file_contents
from dbt.context.providers import (
generate_parser_model,
generate_generate_component_name_macro,
Expand All @@ -20,9 +19,6 @@
from dbt.context.context_config import (
ContextConfig
)
from dbt.contracts.files import (
SourceFile, FilePath, FileHash
)
from dbt.contracts.graph.manifest import Manifest
from dbt.contracts.graph.parsed import HasUniqueID, ManifestNodes
from dbt.contracts.graph.unparsed import UnparsedNode
Expand Down Expand Up @@ -50,17 +46,6 @@ class BaseParser(Generic[FinalValue]):
def __init__(self, project: Project, manifest: Manifest) -> None:
self.project = project
self.manifest = manifest
# this should be a superset of [x.path for x in self.manifest.files]
# because we fill it via search()
self.searched: List[FilePath] = []

@abc.abstractmethod
def get_paths(self) -> Iterable[FilePath]:
pass

def search(self) -> List[FilePath]:
self.searched = list(self.get_paths())
return self.searched

@abc.abstractmethod
def parse_file(self, block: FileBlock) -> None:
Expand All @@ -76,21 +61,6 @@ def generate_unique_id(self, resource_name: str) -> str:
self.project.project_name,
resource_name)

def load_file(
self,
path: FilePath,
*,
set_contents: bool = True,
) -> SourceFile:
file_contents = load_file_contents(path.absolute_path, strip=False)
checksum = FileHash.from_contents(file_contents)
source_file = SourceFile(path=path, checksum=checksum)
if set_contents:
source_file.contents = file_contents.strip()
else:
source_file.contents = ''
return source_file


class Parser(BaseParser[FinalValue], Generic[FinalValue]):
def __init__(
Expand Down
7 changes: 1 addition & 6 deletions core/dbt/parser/data_test.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
from dbt.contracts.graph.parsed import ParsedDataTestNode
from dbt.node_types import NodeType
from dbt.parser.base import SimpleSQLParser
from dbt.parser.search import FilesystemSearcher, FileBlock
from dbt.parser.search import FileBlock
from dbt.utils import get_pseudo_test_path


class DataTestParser(SimpleSQLParser[ParsedDataTestNode]):
def get_paths(self):
return FilesystemSearcher(
self.project, self.project.test_paths, '.sql'
)

def parse_from_dict(self, dct, validate=True) -> ParsedDataTestNode:
if validate:
ParsedDataTestNode.validate(dct)
Expand Down
11 changes: 1 addition & 10 deletions core/dbt/parser/docs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,14 @@
from dbt.node_types import NodeType
from dbt.parser.base import Parser
from dbt.parser.search import (
BlockContents, FileBlock, FilesystemSearcher, BlockSearcher
BlockContents, FileBlock, BlockSearcher
)


SHOULD_PARSE_RE = re.compile(r'{[{%]')


class DocumentationParser(Parser[ParsedDocumentation]):
def get_paths(self):
return FilesystemSearcher(
project=self.project,
relative_dirs=self.project.docs_paths,
extension='.md',
)

@property
def resource_type(self) -> NodeType:
return NodeType.Documentation
Expand Down Expand Up @@ -61,5 +54,3 @@ def parse_file(self, file_block: FileBlock):
for block in searcher:
for parsed in self.parse_block(block):
self.manifest.add_doc(file_block.file, parsed)
# mark the file as seen, even if there are no macros in it
self.manifest.get_file(file_block.file)
5 changes: 3 additions & 2 deletions core/dbt/parser/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,14 @@ class HookParser(SimpleParser[HookBlock, ParsedHookNode]):
def transform(self, node):
return node

def get_paths(self) -> List[FilePath]:
# Hooks are only in the dbt_project.yml file for the project
def get_path(self) -> FilePath:
path = FilePath(
project_root=self.project.project_root,
searched_path='.',
relative_path='dbt_project.yml',
)
return [path]
return path

def parse_from_dict(self, dct, validate=True) -> ParsedHookNode:
if validate:
Expand Down
9 changes: 6 additions & 3 deletions core/dbt/parser/macros.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from dbt.clients import jinja
from dbt.contracts.graph.unparsed import UnparsedMacro
from dbt.contracts.graph.parsed import ParsedMacro
from dbt.contracts.files import FilePath
from dbt.exceptions import CompilationException
from dbt.logger import GLOBAL_LOGGER as logger
from dbt.node_types import NodeType
Expand All @@ -14,12 +15,14 @@


class MacroParser(BaseParser[ParsedMacro]):
def get_paths(self):
return FilesystemSearcher(
# This is only used when creating a MacroManifest separate
# from the normal parsing flow.
def get_paths(self) -> List[FilePath]:
return list(FilesystemSearcher(
project=self.project,
relative_dirs=self.project.macro_paths,
extension='.sql',
)
))

@property
def resource_type(self) -> NodeType:
Expand Down
Loading

0 comments on commit dc6350f

Please sign in to comment.