Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cherry pick] update project load time #3500

Merged
merged 1 commit into from
Jun 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- Dispatch the core SQL statement of the new test materialization, to benefit adapter maintainers ([#3465](https://github.com/fishtown-analytics/dbt/pull/3465), [#3461](https://github.com/fishtown-analytics/dbt/pull/3461))
- Minimal validation of yaml dictionaries prior to partial parsing ([#3246](https://github.com/fishtown-analytics/dbt/issues/3246), [#3460](https://github.com/fishtown-analytics/dbt/pull/3460))
- Add partial parsing tests and improve partial parsing handling of macros ([#3449](https://github.com/fishtown-analytics/dbt/issues/3449), [#3505](https://github.com/fishtown-analytics/dbt/pull/3505))
- Update project loading event data to include experimental parser information. ([#3438](https://github.com/fishtown-analytics/dbt/issues/3438), [#3495](https://github.com/fishtown-analytics/dbt/pull/3495))

Contributors:
- [@swanderz](https://github.com/swanderz) ([#3461](https://github.com/fishtown-analytics/dbt/pull/3461))
Expand Down
10 changes: 10 additions & 0 deletions core/dbt/contracts/graph/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,12 @@ def _find_macros_by_name(
return candidates


@dataclass
class ParsingInfo:
static_analysis_parsed_path_count: int = 0
static_analysis_path_count: int = 0


@dataclass
class ManifestStateCheck(dbtClassMixin):
vars_hash: FileHash = field(default_factory=FileHash.empty)
Expand Down Expand Up @@ -578,6 +584,10 @@ class Manifest(MacroMethods, DataClassMessagePackMixin, dbtClassMixin):
_analysis_lookup: Optional[AnalysisLookup] = field(
default=None, metadata={'serialize': lambda x: None, 'deserialize': lambda x: None}
)
_parsing_info: ParsingInfo = field(
default_factory=ParsingInfo,
metadata={'serialize': lambda x: None, 'deserialize': lambda x: None}
)
_lock: Lock = field(
default_factory=flags.MP_CONTEXT.Lock,
metadata={'serialize': lambda x: None, 'deserialize': lambda x: None}
Expand Down
58 changes: 39 additions & 19 deletions core/dbt/parser/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from dbt.parser.partial import PartialParsing
from dbt.contracts.graph.compiled import ManifestNode
from dbt.contracts.graph.manifest import (
Manifest, Disabled, MacroManifest, ManifestStateCheck
Manifest, Disabled, MacroManifest, ManifestStateCheck, ParsingInfo
)
from dbt.contracts.graph.parsed import (
ParsedSourceDefinition, ParsedNode, ParsedMacro, ColumnInfo, ParsedExposure
Expand Down Expand Up @@ -71,7 +71,7 @@
class ParserInfo(dbtClassMixin):
parser: str
elapsed: float
path_count: int = 0
parsed_path_count: int = 0


# Part of saved performance info
Expand All @@ -80,14 +80,18 @@ class ProjectLoaderInfo(dbtClassMixin):
project_name: str
elapsed: float
parsers: List[ParserInfo] = field(default_factory=list)
path_count: int = 0
parsed_path_count: int = 0


# Part of saved performance info
@dataclass
class ManifestLoaderInfo(dbtClassMixin, Writable):
path_count: int = 0
parsed_path_count: int = 0
static_analysis_path_count: int = 0
static_analysis_parsed_path_count: int = 0
is_partial_parse_enabled: Optional[bool] = None
is_static_analysis_enabled: Optional[bool] = None
read_files_elapsed: Optional[float] = None
load_macros_elapsed: Optional[float] = None
parse_project_elapsed: Optional[float] = None
Expand Down Expand Up @@ -135,8 +139,6 @@ def __init__(
# have been enabled, but not happening because of some issue.
self.partially_parsing = False

self._perf_info = self.build_perf_info()

# This is a saved manifest from a previous run that's used for partial parsing
self.saved_manifest: Optional[Manifest] = self.read_manifest_for_partial_parse()

Expand Down Expand Up @@ -184,7 +186,6 @@ def get_full_manifest(

# This is where the main action happens
def load(self):

# Read files creates a dictionary of projects to a dictionary
# of parsers to lists of file strings. The file strings are
# used to get the SourceFiles from the manifest files.
Expand All @@ -196,6 +197,7 @@ def load(self):
project_parser_files = {}
for project in self.all_projects.values():
read_files(project, self.manifest.files, project_parser_files)
self._perf_info.path_count = len(self.manifest.files)
self._perf_info.read_files_elapsed = (time.perf_counter() - start_read_files)

skip_parsing = False
Expand All @@ -208,13 +210,15 @@ def load(self):
# files are different, we need to create a new set of
# project_parser_files.
project_parser_files = partial_parsing.get_parsing_files()
self.manifest = self.saved_manifest
self.partially_parsing = True

if skip_parsing:
logger.info("Partial parsing enabled, no changes found, skipping parsing")
self.manifest = self.saved_manifest

if self.manifest._parsing_info is None:
self.manifest._parsing_info = ParsingInfo()

if skip_parsing:
logger.info("Partial parsing enabled, no changes found, skipping parsing")
else:
# Load Macros
# We need to parse the macros first, so they're resolvable when
Expand All @@ -230,6 +234,8 @@ def load(self):
for file_id in parser_files['MacroParser']:
block = FileBlock(self.manifest.files[file_id])
parser.parse_file(block)
# increment parsed path count for performance tracking
self._perf_info.parsed_path_count = self._perf_info.parsed_path_count + 1
# Look at changed macros and update the macro.depends_on.macros
self.macro_depends_on()
self._perf_info.load_macros_elapsed = (time.perf_counter() - start_load_macros)
Expand Down Expand Up @@ -301,9 +307,17 @@ def load(self):
self.process_sources(self.root_project.project_name)
self.process_refs(self.root_project.project_name)
self.process_docs(self.root_project)

# update tracking data
self._perf_info.process_manifest_elapsed = (
time.perf_counter() - start_process
)
self._perf_info.static_analysis_parsed_path_count = (
self.manifest._parsing_info.static_analysis_parsed_path_count
)
self._perf_info.static_analysis_path_count = (
self.manifest._parsing_info.static_analysis_path_count
)

# write out the fully parsed manifest
self.write_manifest_for_partial_parse()
Expand All @@ -321,7 +335,7 @@ def parse_project(

project_loader_info = self._perf_info._project_index[project.project_name]
start_timer = time.perf_counter()
total_path_count = 0
total_parsed_path_count = 0

# Loop through parsers with loaded files.
for parser_cls in parser_types:
Expand All @@ -331,7 +345,7 @@ def parse_project(
continue

# Initialize timing info
parser_path_count = 0
project_parsed_path_count = 0
parser_start_timer = time.perf_counter()

# Parse the project files for this parser
Expand All @@ -347,15 +361,15 @@ def parse_project(
parser.parse_file(block, dct=dct)
else:
parser.parse_file(block)
parser_path_count = parser_path_count + 1
project_parsed_path_count = project_parsed_path_count + 1

# Save timing info
project_loader_info.parsers.append(ParserInfo(
parser=parser.resource_type,
path_count=parser_path_count,
parsed_path_count=project_parsed_path_count,
elapsed=time.perf_counter() - parser_start_timer
))
total_path_count = total_path_count + parser_path_count
total_parsed_path_count = total_parsed_path_count + project_parsed_path_count

# HookParser doesn't run from loaded files, just dbt_project.yml,
# so do separately
Expand All @@ -372,10 +386,12 @@ def parse_project(

# Store the performance info
elapsed = time.perf_counter() - start_timer
project_loader_info.path_count = project_loader_info.path_count + total_path_count
project_loader_info.parsed_path_count = (
project_loader_info.parsed_path_count + total_parsed_path_count
)
project_loader_info.elapsed = project_loader_info.elapsed + elapsed
self._perf_info.path_count = (
self._perf_info.path_count + total_path_count
self._perf_info.parsed_path_count = (
self._perf_info.parsed_path_count + total_parsed_path_count
)

# Loop through macros in the manifest and statically parse
Expand Down Expand Up @@ -501,12 +517,12 @@ def read_manifest_for_partial_parse(self) -> Optional[Manifest]:

def build_perf_info(self):
mli = ManifestLoaderInfo(
is_partial_parse_enabled=self._partial_parse_enabled()
is_partial_parse_enabled=self._partial_parse_enabled(),
is_static_analysis_enabled=flags.USE_EXPERIMENTAL_PARSER
)
for project in self.all_projects.values():
project_info = ProjectLoaderInfo(
project_name=project.project_name,
path_count=0,
elapsed=0,
)
mli.projects.append(project_info)
Expand Down Expand Up @@ -603,6 +619,7 @@ def track_project_load(self):
"invocation_id": invocation_id,
"project_id": self.root_project.hashed_name(),
"path_count": self._perf_info.path_count,
"parsed_path_count": self._perf_info.parsed_path_count,
"read_files_elapsed": self._perf_info.read_files_elapsed,
"load_macros_elapsed": self._perf_info.load_macros_elapsed,
"parse_project_elapsed": self._perf_info.parse_project_elapsed,
Expand All @@ -614,6 +631,9 @@ def track_project_load(self):
"is_partial_parse_enabled": (
self._perf_info.is_partial_parse_enabled
),
"is_static_analysis_enabled": self._perf_info.is_static_analysis_enabled,
"static_analysis_path_count": self._perf_info.static_analysis_path_count,
"static_analysis_parsed_path_count": self._perf_info.static_analysis_parsed_path_count,
})

# Takes references in 'refs' array of nodes and exposures, finds the target
Expand Down
7 changes: 5 additions & 2 deletions core/dbt/parser/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from dbt.contracts.graph.parsed import ParsedModelNode
import dbt.flags as flags
from dbt.node_types import NodeType
from dbt.parser.base import IntermediateNode, SimpleSQLParser
from dbt.parser.base import SimpleSQLParser
from dbt.parser.search import FileBlock
from dbt.tree_sitter_jinja.extractor import extract_from_source

Expand All @@ -22,8 +22,9 @@ def get_compiled_path(cls, block: FileBlock):
return block.path.relative_path

def render_update(
self, node: IntermediateNode, config: ContextConfig
self, node: ParsedModelNode, config: ContextConfig
) -> None:
self.manifest._parsing_info.static_analysis_path_count += 1

# normal dbt run
if not flags.USE_EXPERIMENTAL_PARSER:
Expand Down Expand Up @@ -63,5 +64,7 @@ def render_update(
for configv in res['configs']:
node.config[configv[0]] = configv[1]

self.manifest._parsing_info.static_analysis_parsed_path_count += 1

else:
super().render_update(node, config)
5 changes: 2 additions & 3 deletions core/dbt/parser/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ def error_context(

def yaml_from_file(
source_file: SchemaSourceFile
) -> Optional[Dict[str, Any]]:
) -> Dict[str, Any]:
"""If loading the yaml fails, raise an exception.
"""
path: str = source_file.path.relative_path
path = source_file.path.relative_path
try:
return load_yaml_text(source_file.contents)
except ValidationException as e:
Expand All @@ -110,7 +110,6 @@ def yaml_from_file(
'Error reading {}: {} - {}'
.format(source_file.project_name, path, reason)
)
return None


class ParserRef:
Expand Down
2 changes: 1 addition & 1 deletion core/dbt/tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
PACKAGE_INSTALL_SPEC = 'iglu:com.dbt/package_install/jsonschema/1-0-0'
RPC_REQUEST_SPEC = 'iglu:com.dbt/rpc_request/jsonschema/1-0-1'
DEPRECATION_WARN_SPEC = 'iglu:com.dbt/deprecation_warn/jsonschema/1-0-0'
LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-2'
LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3'
RESOURCE_COUNTS = 'iglu:com.dbt/resource_counts/jsonschema/1-0-0'

DBT_INVOCATION_ENV = 'DBT_INVOCATION_ENV'
Expand Down
6 changes: 5 additions & 1 deletion test/integration/033_event_tracking_test/test_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,16 @@ def load_context(self):

def populate(project_id, user_id, invocation_id, version):
return [{
'schema': 'iglu:com.dbt/load_all_timing/jsonschema/1-0-2',
'schema': 'iglu:com.dbt/load_all_timing/jsonschema/1-0-3',
'data': {
'invocation_id': invocation_id,
'project_id': project_id,
'parsed_path_count': ANY,
'path_count': ANY,
'is_partial_parse_enabled': ANY,
'is_static_analysis_enabled': ANY,
'static_analysis_path_count': ANY,
'static_analysis_parsed_path_count': ANY,
'load_all_elapsed': ANY,
'read_files_elapsed': ANY,
'load_macros_elapsed': ANY,
Expand Down