Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add event tracking for project parse/load time #2893

Merged
merged 4 commits into from
Nov 19, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

### Under the hood
- Bump hologram version to 0.0.11. Add scripts/dtr.py ([#2888](https:/fishtown-analytics/dbt/issues/2840),[#2889](https:/fishtown-analytics/dbt/pull/2889))
- Add event tracking for project parser/load times ([#2823](https:/fishtown-analytics/dbt/issues/2823),[#2893](https:/fishtown-analytics/dbt/pull/2893))

Contributors:
- [@feluelle](https:/feluelle) ([#2841](https:/fishtown-analytics/dbt/pull/2841))
Expand Down
111 changes: 88 additions & 23 deletions core/dbt/parser/manifest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from dataclasses import dataclass
from dataclasses import field
import os
import pickle
from typing import (
Expand All @@ -6,6 +8,7 @@
import time

import dbt.exceptions
import dbt.tracking
import dbt.flags as flags

from dbt.adapters.factory import (
Expand All @@ -24,6 +27,7 @@
from dbt.contracts.graph.parsed import (
ParsedSourceDefinition, ParsedNode, ParsedMacro, ColumnInfo, ParsedExposure
)
from dbt.contracts.util import Writable
from dbt.exceptions import (
ref_target_not_found,
get_target_not_found_or_disabled_msg,
Expand All @@ -47,12 +51,39 @@
from dbt.ui import warning_tag
from dbt.version import __version__

from hologram import JsonSchemaMixin

PARTIAL_PARSE_FILE_NAME = 'partial_parse.pickle'
PARSING_STATE = DbtProcessState('parsing')
DEFAULT_PARTIAL_PARSE = False


@dataclass
class ParserInfo(JsonSchemaMixin):
parser: str
elapsed: float
path_count: int = 0


@dataclass
class ProjectLoaderInfo(JsonSchemaMixin):
project_name: str
elapsed: float
parsers: List[ParserInfo]
path_count: int = 0


@dataclass
class ManifestLoaderInfo(JsonSchemaMixin, Writable):
path_count: int = 0
is_partial_parse_enabled: Optional[bool] = None
parse_project_elapsed: Optional[float] = None
patch_sources_elapsed: Optional[float] = None
process_manifest_elapsed: Optional[float] = None
load_all_elapsed: Optional[float] = None
projects: List[ProjectLoaderInfo] = field(default_factory=list)


_parser_types: List[Type[Parser]] = [
ModelParser,
SnapshotParser,
Expand Down Expand Up @@ -120,10 +151,26 @@ def __init__(
root_project, all_projects,
)
self._loaded_file_cache: Dict[str, FileBlock] = {}
partial_parse = self._partial_parse_enabled()
self._perf_info: Dict[str, Any] = {
'path_count': 0, 'projects': [],
'is_partial_parse_enabled': partial_parse}
self._perf_info = ManifestLoaderInfo(
is_partial_parse_enabled=self._partial_parse_enabled()
)

def track_project_load(self):
invocation_id = dbt.tracking.active_user.invocation_id
dbt.tracking.track_project_load({
"invocation_id": invocation_id,
"project_id": self.root_project.hashed_name(),
"path_count": self._perf_info.path_count,
"parse_project_elapsed": self._perf_info.parse_project_elapsed,
"patch_sources_elapsed": self._perf_info.patch_sources_elapsed,
"process_manifest_elapsed": (
self._perf_info.process_manifest_elapsed
),
"load_all_elapsed": self._perf_info.load_all_elapsed,
"is_partial_parse_enabled": (
self._perf_info.is_partial_parse_enabled
),
})

def parse_with_cache(
self,
Expand Down Expand Up @@ -175,7 +222,7 @@ def parse_project(
# per-project cache.
self._loaded_file_cache.clear()

project_info: Dict[str, Any] = {'parsers': []}
project_parser_info: List[ParserInfo] = []
start_timer = time.perf_counter()
total_path_count = 0
for parser in parsers:
Expand All @@ -188,21 +235,26 @@ def parse_project(
print("..", end='', flush=True)

if parser_path_count > 0:
parser_elapsed = time.perf_counter() - parser_start_timer
project_info['parsers'].append({'parser': type(
parser).__name__, 'path_count': parser_path_count,
'elapsed': '{:.2f}'.format(parser_elapsed)})
project_parser_info.append(ParserInfo(
parser=parser.resource_type,
path_count=parser_path_count,
elapsed=time.perf_counter() - parser_start_timer
))
total_path_count = total_path_count + parser_path_count
if total_path_count > 100:
print("..")

elapsed = time.perf_counter() - start_timer
project_info['project_name'] = project.project_name
project_info['path_count'] = total_path_count
project_info['elapsed'] = '{:.2f}'.format(elapsed)
self._perf_info['projects'].append(project_info)
self._perf_info['path_count'] = self._perf_info['path_count'] + \
total_path_count
project_info = ProjectLoaderInfo(
project_name=project.project_name,
path_count=total_path_count,
elapsed=elapsed,
parsers=project_parser_info
)
self._perf_info.projects.append(project_info)
self._perf_info.path_count = (
self._perf_info.path_count + total_path_count
)

def load_only_macros(self) -> Manifest:
old_results = self.read_parse_results()
Expand All @@ -228,11 +280,14 @@ def load(self, macro_manifest: Manifest):
self.results.files.update(macro_manifest.files)

start_timer = time.perf_counter()

for project in self.all_projects.values():
# parse a single project
self.parse_project(project, macro_manifest, old_results)
self._perf_info['parse_project_elapsed'] = '{:.2f}'.format(
time.perf_counter() - start_timer)

self._perf_info.parse_project_elapsed = (
time.perf_counter() - start_timer
)

def write_parse_results(self):
path = os.path.join(self.root_project.target_path,
Expand Down Expand Up @@ -335,8 +390,9 @@ def create_manifest(self) -> Manifest:
# list is created
start_patch = time.perf_counter()
sources = patch_sources(self.results, self.root_project)
self._perf_info['patch_sources_elapsed'] = '{:.2f}'.format(
time.perf_counter() - start_patch)
self._perf_info.patch_sources_elapsed = (
time.perf_counter() - start_patch
)
disabled = []
for value in self.results.disabled.values():
disabled.extend(value)
Expand All @@ -360,8 +416,11 @@ def create_manifest(self) -> Manifest:
manifest.patch_macros(self.results.macro_patches)
start_process = time.perf_counter()
self.process_manifest(manifest)
self._perf_info['process_manifest_elapsed'] = '{:.2f}'.format(
time.perf_counter() - start_process)

self._perf_info.process_manifest_elapsed = (
time.perf_counter() - start_process
)

return manifest

@classmethod
Expand All @@ -373,15 +432,21 @@ def load_all(
) -> Manifest:
with PARSING_STATE:
start_load_all = time.perf_counter()

projects = root_config.load_dependencies()
loader = cls(root_config, projects, macro_hook)
loader.load(macro_manifest=macro_manifest)
loader.write_parse_results()
manifest = loader.create_manifest()
_check_manifest(manifest, root_config)
manifest.build_flat_graph()
loader._perf_info['load_all_elapsed'] = '{:.2f}'.format(
time.perf_counter() - start_load_all)

loader._perf_info.load_all_elapsed = (
time.perf_counter() - start_load_all
)

loader.track_project_load()

return manifest

@classmethod
Expand Down
11 changes: 4 additions & 7 deletions core/dbt/task/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,7 @@
from dbt.adapters.factory import get_adapter
from dbt.parser.manifest import Manifest, ManifestLoader, _check_manifest
from dbt.logger import DbtProcessState, print_timestamped_line
from dbt.clients.system import write_file
from dbt.graph import Graph
import dbt.utils
import json
import time
from typing import Optional
import os
Expand All @@ -36,8 +33,7 @@ def write_manifest(self):

def write_perf_info(self):
path = os.path.join(self.config.target_path, PERF_INFO_FILE_NAME)
write_file(path, json.dumps(self.loader._perf_info,
cls=dbt.utils.JSONEncoder, indent=4))
self.loader._perf_info.write(path)
print_timestamped_line(f"Performance info: {path}")

# This method takes code that normally exists in other files
Expand Down Expand Up @@ -71,8 +67,9 @@ def get_full_manifest(self):
print_timestamped_line("Manifest checked")
manifest.build_flat_graph()
print_timestamped_line("Flat graph built")
loader._perf_info['load_all_elapsed'] = '{:.2f}'.format(
time.perf_counter() - start_load_all)
loader._perf_info.load_all_elapsed = (
time.perf_counter() - start_load_all
)

self.loader = loader
self.manifest = manifest
Expand Down
15 changes: 15 additions & 0 deletions core/dbt/tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
PACKAGE_INSTALL_SPEC = 'iglu:com.dbt/package_install/jsonschema/1-0-0'
RPC_REQUEST_SPEC = 'iglu:com.dbt/rpc_request/jsonschema/1-0-1'
DEPRECATION_WARN_SPEC = 'iglu:com.dbt/deprecation_warn/jsonschema/1-0-0'
LOAD_ALL_TIMING_SPEC = 'iglu:com.dbt/load_all_timing/jsonschema/1-0-0'

DBT_INVOCATION_ENV = 'DBT_INVOCATION_ENV'

Expand Down Expand Up @@ -273,6 +274,20 @@ def track_invocation_start(config=None, args=None):
)


def track_project_load(options):
context = [SelfDescribingJson(LOAD_ALL_TIMING_SPEC, options)]
assert active_user is not None, \
'Cannot track project loading time when active user is None'

track(
active_user,
category='dbt',
action='load_project',
label=active_user.invocation_id,
context=context
)


def track_model_run(options):
context = [SelfDescribingJson(RUN_MODEL_SPEC, options)]
assert active_user is not None, \
Expand Down
Loading