diff --git a/.changes/unreleased/Features-20230206-084749.yaml b/.changes/unreleased/Features-20230206-084749.yaml new file mode 100644 index 00000000000..53d867ed403 --- /dev/null +++ b/.changes/unreleased/Features-20230206-084749.yaml @@ -0,0 +1,6 @@ +kind: Features +body: Enable diff based partial parsing +time: 2023-02-06T08:47:49.688889-05:00 +custom: + Author: gshank + Issue: "6592" diff --git a/core/dbt/clients/system.py b/core/dbt/clients/system.py index dd802afaeca..831f80f2f22 100644 --- a/core/dbt/clients/system.py +++ b/core/dbt/clients/system.py @@ -18,7 +18,6 @@ from dbt.events.functions import fire_event from dbt.events.types import ( SystemCouldNotWrite, - SystemErrorRetrievingModTime, SystemExecutingCmd, SystemStdOut, SystemStdErr, @@ -77,11 +76,7 @@ def find_matching( relative_path = os.path.relpath(absolute_path, absolute_path_to_search) relative_path_to_root = os.path.join(relative_path_to_search, relative_path) - modification_time = 0.0 - try: - modification_time = os.path.getmtime(absolute_path) - except OSError: - fire_event(SystemErrorRetrievingModTime(path=absolute_path)) + modification_time = os.path.getmtime(absolute_path) if reobj.match(local_file) and ( not ignore_spec or not ignore_spec.match_file(relative_path_to_root) ): diff --git a/core/dbt/contracts/files.py b/core/dbt/contracts/files.py index 9e82247da00..ec98ec6b2b7 100644 --- a/core/dbt/contracts/files.py +++ b/core/dbt/contracts/files.py @@ -61,8 +61,6 @@ def absolute_path(self) -> str: @property def original_file_path(self) -> str: - # this is mostly used for reporting errors. It doesn't show the project - # name, should it? return os.path.join(self.searched_path, self.relative_path) def seed_too_large(self) -> bool: diff --git a/core/dbt/events/proto_types.py b/core/dbt/events/proto_types.py index bd6610f0d6d..05d7fca6c17 100644 --- a/core/dbt/events/proto_types.py +++ b/core/dbt/events/proto_types.py @@ -962,6 +962,20 @@ class FinishedRunningStatsMsg(betterproto.Message): data: "FinishedRunningStats" = betterproto.message_field(2) +@dataclass +class InputFileDiffError(betterproto.Message): + """I001""" + + category: str = betterproto.string_field(1) + file_id: str = betterproto.string_field(2) + + +@dataclass +class InputFileDiffErrorMsg(betterproto.Message): + info: "EventInfo" = betterproto.message_field(1) + data: "InputFileDiffError" = betterproto.message_field(2) + + @dataclass class InvalidValueForField(betterproto.Message): """I008""" @@ -2253,19 +2267,6 @@ class MainStackTraceMsg(betterproto.Message): data: "MainStackTrace" = betterproto.message_field(2) -@dataclass -class SystemErrorRetrievingModTime(betterproto.Message): - """Z004""" - - path: str = betterproto.string_field(1) - - -@dataclass -class SystemErrorRetrievingModTimeMsg(betterproto.Message): - info: "EventInfo" = betterproto.message_field(1) - data: "SystemErrorRetrievingModTime" = betterproto.message_field(2) - - @dataclass class SystemCouldNotWrite(betterproto.Message): """Z005""" diff --git a/core/dbt/events/types.proto b/core/dbt/events/types.proto index 374536bfe19..fbfb6015287 100644 --- a/core/dbt/events/types.proto +++ b/core/dbt/events/types.proto @@ -762,7 +762,18 @@ message FinishedRunningStatsMsg { // I - Project parsing -// Skipping I001, I002, I003, I004, I005, I006, I007 +// I001 +message InputFileDiffError { + string category = 1; + string file_id = 2; +} + +message InputFileDiffErrorMsg { + EventInfo info = 1; + InputFileDiffError data = 2; +} + +// Skipping I002, I003, I004, I005, I006, I007 // I008 message 
InvalidValueForField { @@ -1808,15 +1819,7 @@ message MainStackTraceMsg { MainStackTrace data = 2; } -// Z004 -message SystemErrorRetrievingModTime { - string path = 1; -} - -message SystemErrorRetrievingModTimeMsg { - EventInfo info = 1; - SystemErrorRetrievingModTime data = 2; -} +// skipping Z004 // Z005 message SystemCouldNotWrite { diff --git a/core/dbt/events/types.py b/core/dbt/events/types.py index 848a8d4b029..0bb16966f1d 100644 --- a/core/dbt/events/types.py +++ b/core/dbt/events/types.py @@ -768,7 +768,16 @@ def message(self) -> str: # ======================================================= -# Skipping I001, I002, I003, I004, I005, I006, I007 +@dataclass +class InputFileDiffError(DebugLevel, pt.InputFileDiffError): + def code(self): + return "I001" + + def message(self) -> str: + return f"Error processing file diff: {self.category}, {self.file_id}" + + +# Skipping I002, I003, I004, I005, I006, I007 @dataclass @@ -1891,13 +1900,7 @@ def message(self) -> str: return self.stack_trace -@dataclass -class SystemErrorRetrievingModTime(ErrorLevel, pt.SystemErrorRetrievingModTime): - def code(self): - return "Z004" - - def message(self) -> str: - return f"Error retrieving modification time for file {self.path}" +# Skipped Z004 @dataclass diff --git a/core/dbt/parser/manifest.py b/core/dbt/parser/manifest.py index 57c08cc223b..afe078181c4 100644 --- a/core/dbt/parser/manifest.py +++ b/core/dbt/parser/manifest.py @@ -40,14 +40,19 @@ from dbt.node_types import NodeType, AccessType from dbt.clients.jinja import get_rendered, MacroStack from dbt.clients.jinja_static import statically_extract_macro_calls -from dbt.clients.system import make_directory, write_file +from dbt.clients.system import make_directory, path_exists, read_json, write_file from dbt.config import Project, RuntimeConfig from dbt.context.docs import generate_runtime_docs_context from dbt.context.macro_resolver import MacroResolver, TestMacroNamespace from dbt.context.configured import generate_macro_context from dbt.context.providers import ParseProvider from dbt.contracts.files import FileHash, ParseFileType, SchemaSourceFile -from dbt.parser.read_files import read_files, load_source_file +from dbt.parser.read_files import ( + ReadFilesFromFileSystem, + load_source_file, + FileDiff, + ReadFilesFromDiff, +) from dbt.parser.partial import PartialParsing, special_override_macros from dbt.contracts.graph.manifest import ( Manifest, @@ -153,9 +158,11 @@ def __init__( root_project: RuntimeConfig, all_projects: Mapping[str, Project], macro_hook: Optional[Callable[[Manifest], Any]] = None, + file_diff: Optional[FileDiff] = None, ) -> None: self.root_project: RuntimeConfig = root_project self.all_projects: Mapping[str, Project] = all_projects + self.file_diff = file_diff self.manifest: Manifest = Manifest() self.new_manifest = self.manifest self.manifest.metadata = root_project.get_metadata() @@ -190,6 +197,7 @@ def get_full_manifest( cls, config: RuntimeConfig, *, + file_diff: Optional[FileDiff] = None, reset: bool = False, write_perf_info=False, ) -> Manifest: @@ -202,12 +210,19 @@ def get_full_manifest( adapter.clear_macro_manifest() macro_hook = adapter.connections.set_query_header + # Hack to test file_diffs + if os.environ.get("DBT_PP_FILE_DIFF_TEST"): + file_diff_path = "file_diff.json" + if path_exists(file_diff_path): + file_diff_dct = read_json(file_diff_path) + file_diff = FileDiff.from_dict(file_diff_dct) + with PARSING_STATE: # set up logbook.Processor for parsing # Start performance counting start_load_all = 
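# --- Illustration, not part of the diff: the file_diff.json consumed by the
# DBT_PP_FILE_DIFF_TEST hack above is a serialized FileDiff, so it is expected to look
# roughly like this (file names and contents are made up):
# {
#   "deleted": ["models/old_model.sql"],
#   "changed": [
#     {"path": "models/my_model.sql", "content": "select 2 as id", "modification_time": 1675700000.0}
#   ],
#   "added": [
#     {"path": "models/new_model.sql", "content": "select 1 as id"}
#   ]
# }
# read_json() loads it into a dict and FileDiff.from_dict() builds the FileDiff
# dataclass defined in read_files.py later in this diff.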
time.perf_counter() projects = config.load_dependencies() - loader = cls(config, projects, macro_hook) + loader = cls(config, projects, macro_hook=macro_hook, file_diff=file_diff) manifest = loader.load() @@ -229,17 +244,35 @@ def get_full_manifest( # This is where the main action happens def load(self): - # Read files creates a dictionary of projects to a dictionary + start_read_files = time.perf_counter() + + # This updates the "files" dictionary in self.manifest, and creates + # the partial_parser_files dictionary (see read_files.py), + # which is a dictionary of projects to a dictionary # of parsers to lists of file strings. The file strings are # used to get the SourceFiles from the manifest files. - start_read_files = time.perf_counter() - project_parser_files = {} - saved_files = {} - if self.saved_manifest: - saved_files = self.saved_manifest.files - for project in self.all_projects.values(): - read_files(project, self.manifest.files, project_parser_files, saved_files) - orig_project_parser_files = project_parser_files + saved_files = self.saved_manifest.files if self.saved_manifest else {} + if self.file_diff: + # We're getting files from a file diff + file_reader = ReadFilesFromDiff( + all_projects=self.all_projects, + files=self.manifest.files, + saved_files=saved_files, + root_project_name=self.root_project.project_name, + file_diff=self.file_diff, + ) + else: + # We're getting files from the file system + file_reader = ReadFilesFromFileSystem( + all_projects=self.all_projects, + files=self.manifest.files, + saved_files=saved_files, + ) + + # Set the files in the manifest and save the project_parser_files + file_reader.read_files() + self.manifest.files = file_reader.files + project_parser_files = orig_project_parser_files = file_reader.project_parser_files self._perf_info.path_count = len(self.manifest.files) self._perf_info.read_files_elapsed = time.perf_counter() - start_read_files diff --git a/core/dbt/parser/partial.py b/core/dbt/parser/partial.py index e44b78145d1..83280d3a780 100644 --- a/core/dbt/parser/partial.py +++ b/core/dbt/parser/partial.py @@ -73,7 +73,6 @@ def __init__(self, saved_manifest: Manifest, new_files: MutableMapping[str, AnyS self.project_parser_files: Dict = {} self.saved_files = self.saved_manifest.files self.project_parser_files = {} - self.deleted_manifest = Manifest() self.macro_child_map: Dict[str, List[str]] = {} ( self.env_vars_changed_source_files, @@ -268,7 +267,7 @@ def delete_from_saved(self, file_id): # macros/tests if saved_source_file.parse_file_type in mssat_files: self.remove_mssat_file(saved_source_file) - self.deleted_manifest.files[file_id] = self.saved_manifest.files.pop(file_id) + self.saved_manifest.files.pop(file_id) # macros if saved_source_file.parse_file_type in mg_files: @@ -311,7 +310,6 @@ def update_mssat_in_saved(self, new_source_file, old_source_file): # replace source_file in saved and add to parsing list file_id = new_source_file.file_id - self.deleted_manifest.files[file_id] = old_source_file self.saved_files[file_id] = deepcopy(new_source_file) self.add_to_pp_files(new_source_file) for unique_id in unique_ids: @@ -321,7 +319,6 @@ def remove_node_in_saved(self, source_file, unique_id): if unique_id in self.saved_manifest.nodes: # delete node in saved node = self.saved_manifest.nodes.pop(unique_id) - self.deleted_manifest.nodes[unique_id] = node elif ( source_file.file_id in self.disabled_by_file_id and unique_id in self.saved_manifest.disabled @@ -456,7 +453,7 @@ def delete_macro_file(self, source_file, 
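# --- Illustration, not part of the diff: a minimal, hypothetical sketch of calling the
# new entry point added to ManifestLoader.get_full_manifest() earlier in this diff,
# assuming `config` is an already-built RuntimeConfig; load() then uses
# ReadFilesFromDiff instead of ReadFilesFromFileSystem.
from dbt.parser.manifest import ManifestLoader
from dbt.parser.read_files import FileDiff, InputFile

file_diff = FileDiff(
    deleted=[],
    changed=[],
    added=[InputFile(path="models/new_model.sql", content="select 1 as id")],
)
manifest = ManifestLoader.get_full_manifest(config, file_diff=file_diff)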
follow_references=False): file_id = source_file.file_id # It's not clear when this file_id would not exist in saved_files if file_id in self.saved_files: - self.deleted_manifest.files[file_id] = self.saved_files.pop(file_id) + self.saved_files.pop(file_id) def check_for_special_deleted_macros(self, source_file): for unique_id in source_file.macros: @@ -487,7 +484,6 @@ def handle_macro_file_links(self, source_file, follow_references=False): continue base_macro = self.saved_manifest.macros.pop(unique_id) - self.deleted_manifest.macros[unique_id] = base_macro # Recursively check children of this macro # The macro_child_map might not exist if a macro is removed by @@ -565,16 +561,14 @@ def delete_doc_node(self, source_file): # remove the nodes in the 'docs' dictionary docs = source_file.docs.copy() for unique_id in docs: - self.deleted_manifest.docs[unique_id] = self.saved_manifest.docs.pop(unique_id) + self.saved_manifest.docs.pop(unique_id) source_file.docs.remove(unique_id) # The unique_id of objects that contain a doc call are stored in the # doc source_file.nodes self.schedule_nodes_for_parsing(source_file.nodes) source_file.nodes = [] # Remove the file object - self.deleted_manifest.files[source_file.file_id] = self.saved_manifest.files.pop( - source_file.file_id - ) + self.saved_manifest.files.pop(source_file.file_id) # Schema files ----------------------- # Changed schema files @@ -608,7 +602,7 @@ def delete_schema_file(self, file_id): saved_yaml_dict = saved_schema_file.dict_from_yaml new_yaml_dict = {} self.handle_schema_file_changes(saved_schema_file, saved_yaml_dict, new_yaml_dict) - self.deleted_manifest.files[file_id] = self.saved_manifest.files.pop(file_id) + self.saved_manifest.files.pop(file_id) # For each key in a schema file dictionary, process the changed, deleted, and added # elemnts for the key lists @@ -876,8 +870,7 @@ def remove_tests(self, schema_file, dict_key, name): tests = schema_file.get_tests(dict_key, name) for test_unique_id in tests: if test_unique_id in self.saved_manifest.nodes: - node = self.saved_manifest.nodes.pop(test_unique_id) - self.deleted_manifest.nodes[test_unique_id] = node + self.saved_manifest.nodes.pop(test_unique_id) schema_file.remove_tests(dict_key, name) def delete_schema_source(self, schema_file, source_dict): @@ -892,7 +885,6 @@ def delete_schema_source(self, schema_file, source_dict): source = self.saved_manifest.sources[unique_id] if source.source_name == source_name: source = self.saved_manifest.sources.pop(unique_id) - self.deleted_manifest.sources[unique_id] = source schema_file.sources.remove(unique_id) self.schedule_referencing_nodes_for_parsing(unique_id) @@ -904,7 +896,6 @@ def delete_schema_macro_patch(self, schema_file, macro): del schema_file.macro_patches[macro["name"]] if macro_unique_id and macro_unique_id in self.saved_manifest.macros: macro = self.saved_manifest.macros.pop(macro_unique_id) - self.deleted_manifest.macros[macro_unique_id] = macro macro_file_id = macro.file_id if macro_file_id in self.new_files: self.saved_files[macro_file_id] = deepcopy(self.new_files[macro_file_id]) @@ -919,9 +910,7 @@ def delete_schema_exposure(self, schema_file, exposure_dict): if unique_id in self.saved_manifest.exposures: exposure = self.saved_manifest.exposures[unique_id] if exposure.name == exposure_name: - self.deleted_manifest.exposures[unique_id] = self.saved_manifest.exposures.pop( - unique_id - ) + self.saved_manifest.exposures.pop(unique_id) schema_file.exposures.remove(unique_id) elif unique_id in self.saved_manifest.disabled: 
self.delete_disabled(unique_id, schema_file.file_id) @@ -935,9 +924,7 @@ def delete_schema_group(self, schema_file, group_dict): group = self.saved_manifest.groups[unique_id] if group.name == group_name: self.schedule_nodes_for_parsing(self.saved_manifest.group_map[group.name]) - self.deleted_manifest.groups[unique_id] = self.saved_manifest.groups.pop( - unique_id - ) + self.saved_manifest.groups.pop(unique_id) schema_file.groups.remove(unique_id) # metrics are created only from schema files, but also can be referred to by other nodes @@ -951,9 +938,7 @@ def delete_schema_metric(self, schema_file, metric_dict): # Need to find everything that referenced this metric and schedule for parsing if unique_id in self.saved_manifest.child_map: self.schedule_nodes_for_parsing(self.saved_manifest.child_map[unique_id]) - self.deleted_manifest.metrics[unique_id] = self.saved_manifest.metrics.pop( - unique_id - ) + self.saved_manifest.metrics.pop(unique_id) schema_file.metrics.remove(unique_id) elif unique_id in self.saved_manifest.disabled: self.delete_disabled(unique_id, schema_file.file_id) diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index 531e5f39560..92a91070594 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -1,6 +1,7 @@ import os import pathspec # type: ignore import pathlib +from dataclasses import dataclass, field from dbt.clients.system import load_file_contents from dbt.contracts.files import ( FilePath, @@ -10,11 +11,30 @@ AnySourceFile, SchemaSourceFile, ) - +from dbt.config import Project +from dbt.dataclass_schema import dbtClassMixin from dbt.parser.schemas import yaml_from_file, schema_file_keys, check_format_version from dbt.exceptions import ParsingError from dbt.parser.search import filesystem_search -from typing import Optional +from typing import Optional, Dict, List, Mapping +from dbt.events.types import InputFileDiffError +from dbt.events.functions import fire_event + + +@dataclass +class InputFile(dbtClassMixin): + path: str + content: str + modification_time: float = 0.0 + + +@dataclass +class FileDiff(dbtClassMixin): + deleted: List[str] + # Note: it would be possible to not distinguish between + # added and changed files, but we would lose some error handling. + changed: List[InputFile] + added: List[InputFile] # This loads the files contents and creates the SourceFile object @@ -49,17 +69,17 @@ def load_source_file( skip_loading_schema_file = True if not skip_loading_schema_file: - file_contents = load_file_contents(path.absolute_path, strip=False) - source_file.checksum = FileHash.from_contents(file_contents) - source_file.contents = file_contents.strip() + # We strip the file_contents before generating the checksum because we want + # the checksum to match the stored file contents + file_contents = load_file_contents(path.absolute_path, strip=True) + source_file.contents = file_contents + source_file.checksum = FileHash.from_contents(source_file.contents) if parse_file_type == ParseFileType.Schema and source_file.contents: dfy = yaml_from_file(source_file) if dfy: validate_yaml(source_file.path.original_file_path, dfy) source_file.dfy = dfy - else: - source_file = None return source_file @@ -98,7 +118,7 @@ def load_seed_source_file(match: FilePath, project_name) -> SourceFile: # We don't want to calculate a hash of this file. Use the path. 
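# --- Illustration, not part of the diff: why contents are now stripped before hashing.
# Previously load_source_file hashed the raw file contents but stored them stripped, so a
# checksum recomputed from stored contents (as ReadFilesFromDiff does) could not match.
# With strip=True both sides hash the same string, i.e. roughly:
#   FileHash.from_contents(load_file_contents(path.absolute_path, strip=True)) ==
#   FileHash.from_contents(source_file.contents)
# (load_file_contents and FileHash.from_contents as used elsewhere in this diff.)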
source_file = SourceFile.big_seed(match) else: - file_contents = load_file_contents(match.absolute_path, strip=False) + file_contents = load_file_contents(match.absolute_path, strip=True) checksum = FileHash.from_contents(file_contents) source_file = SourceFile(path=match, checksum=checksum) source_file.contents = "" @@ -131,9 +151,10 @@ def get_source_files(project, paths, extension, parse_file_type, saved_files, ig return fb_list -def read_files_for_parser(project, files, dirs, extensions, parse_ft, saved_files, ignore_spec): +def read_files_for_parser(project, files, parse_ft, file_type_info, saved_files, ignore_spec): + dirs = file_type_info["paths"] parser_files = [] - for extension in extensions: + for extension in file_type_info["extensions"]: source_files = get_source_files( project, dirs, extension, parse_ft, saved_files, ignore_spec ) @@ -153,104 +174,244 @@ def generate_dbt_ignore_spec(project_root): return ignore_spec -# This needs to read files for multiple projects, so the 'files' -# dictionary needs to be passed in. What determines the order of -# the various projects? Is the root project always last? Do the -# non-root projects need to be done separately in order? -def read_files(project, files, parser_files, saved_files): - dbt_ignore_spec = generate_dbt_ignore_spec(project.project_root) - project_files = {} - - project_files["MacroParser"] = read_files_for_parser( - project, - files, - project.macro_paths, - [".sql"], - ParseFileType.Macro, - saved_files, - dbt_ignore_spec, - ) +@dataclass +class ReadFilesFromFileSystem: + all_projects: Mapping[str, Project] + files: Dict[str, AnySourceFile] = field(default_factory=dict) + # saved_files is only used to compare schema files + saved_files: Dict[str, AnySourceFile] = field(default_factory=dict) + # project_parser_files = { + # "my_project": { + # "ModelParser": ["my_project://models/my_model.sql"] + # } + # } + # + project_parser_files: Dict = field(default_factory=dict) - project_files["ModelParser"] = read_files_for_parser( - project, - files, - project.model_paths, - [".sql", ".py"], - ParseFileType.Model, - saved_files, - dbt_ignore_spec, - ) + def read_files(self): + for project in self.all_projects.values(): + file_types = get_file_types_for_project(project) + self.read_files_for_project(project, file_types) - project_files["SnapshotParser"] = read_files_for_parser( - project, - files, - project.snapshot_paths, - [".sql"], - ParseFileType.Snapshot, - saved_files, - dbt_ignore_spec, - ) + def read_files_for_project(self, project, file_types): + dbt_ignore_spec = generate_dbt_ignore_spec(project.project_root) + project_files = self.project_parser_files[project.project_name] = {} - project_files["AnalysisParser"] = read_files_for_parser( - project, - files, - project.analysis_paths, - [".sql"], - ParseFileType.Analysis, - saved_files, - dbt_ignore_spec, - ) + for parse_ft, file_type_info in file_types.items(): + project_files[file_type_info["parser"]] = read_files_for_parser( + project, + self.files, + parse_ft, + file_type_info, + self.saved_files, + dbt_ignore_spec, + ) - project_files["SingularTestParser"] = read_files_for_parser( - project, - files, - project.test_paths, - [".sql"], - ParseFileType.SingularTest, - saved_files, - dbt_ignore_spec, - ) - # all generic tests within /tests must be nested under a /generic subfolder - project_files["GenericTestParser"] = read_files_for_parser( - project, - files, - project.generic_test_paths, - [".sql"], - ParseFileType.GenericTest, - saved_files, - dbt_ignore_spec, - ) 
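# --- Illustration, not part of the diff (names from the diff, values made up): how the
# refactored file reading fits together. get_file_types_for_project(), defined at the end
# of read_files.py, maps each ParseFileType to its search paths, extensions, and parser
# name, and read_files_for_parser() consumes one of those entries per file type:
dbt_ignore_spec = generate_dbt_ignore_spec(project.project_root)
file_types = get_file_types_for_project(project)
model_info = file_types[ParseFileType.Model]
# model_info == {"paths": project.model_paths, "extensions": [".sql", ".py"], "parser": "ModelParser"}
project_files = {}
project_files[model_info["parser"]] = read_files_for_parser(
    project, files, ParseFileType.Model, model_info, saved_files, dbt_ignore_spec
)
# After ReadFilesFromFileSystem.read_files(), project_parser_files is shaped roughly like:
# {"my_project": {"ModelParser": ["my_project://models/my_model.sql"], "SchemaParser": [...]}}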
+@dataclass +class ReadFilesFromDiff: + root_project_name: str + all_projects: Mapping[str, Project] + file_diff: FileDiff + files: Dict[str, AnySourceFile] = field(default_factory=dict) + # saved_files is used to construct a fresh copy of files, without + # additional information from parsing + saved_files: Dict[str, AnySourceFile] = field(default_factory=dict) + project_parser_files: Dict = field(default_factory=dict) + project_file_types: Dict = field(default_factory=dict) + local_package_dirs: Optional[List[str]] = None - project_files["SeedParser"] = read_files_for_parser( - project, - files, - project.seed_paths, - [".csv"], - ParseFileType.Seed, - saved_files, - dbt_ignore_spec, - ) + def read_files(self): + # Copy the base file information from the existing manifest. + # We will do deletions, adds, changes from the file_diff to emulate + # a complete read of the project file system. + for file_id, source_file in self.saved_files.items(): + if isinstance(source_file, SchemaSourceFile): + file_cls = SchemaSourceFile + else: + file_cls = SourceFile + new_source_file = file_cls( + path=source_file.path, + checksum=source_file.checksum, + project_name=source_file.project_name, + parse_file_type=source_file.parse_file_type, + contents=source_file.contents, + ) + self.files[file_id] = new_source_file - project_files["DocumentationParser"] = read_files_for_parser( - project, - files, - project.docs_paths, - [".md"], - ParseFileType.Documentation, - saved_files, - dbt_ignore_spec, - ) + # Now that we have a copy of the files, remove deleted files + # For now, we assume that all files are in the root_project, until + # we've determined whether project name will be provided or deduced + # from the directory. + for input_file_path in self.file_diff.deleted: + project_name = self.get_project_name(input_file_path) + file_id = f"{project_name}://{input_file_path}" + if file_id in self.files: + self.files.pop(file_id) + else: + fire_event(InputFileDiffError(category="deleted file not found", file_id=file_id)) + + # Now we do the changes + for input_file in self.file_diff.changed: + project_name = self.get_project_name(input_file.path) + file_id = f"{project_name}://{input_file.path}" + if file_id in self.files: + # Get the existing source_file object and update the contents and mod time + source_file = self.files[file_id] + source_file.contents = input_file.content + source_file.checksum = FileHash.from_contents(input_file.content) + source_file.path.modification_time = input_file.modification_time + # Handle creation of dictionary version of schema file content + if isinstance(source_file, SchemaSourceFile) and source_file.contents: + dfy = yaml_from_file(source_file) + if dfy: + validate_yaml(source_file.path.original_file_path, dfy) + source_file.dfy = dfy + # TODO: ensure we have a file object even for empty files, such as schema files + + # Now the new files + for input_file in self.file_diff.added: + project_name = self.get_project_name(input_file.path) + # FilePath + # searched_path i.e. "models" + # relative_path i.e. the part after searched_path, or "model.sql" + # modification_time float, default 0.0... + # project_root + # We use PurePath because there's no actual filesystem to look at + input_file_path = pathlib.PurePath(input_file.path) + extension = input_file_path.suffix + searched_path = input_file_path.parts[0] + # check what happens with generic tests... 
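# --- Illustration, not part of the diff (hypothetical file name): for an added input file
# "models/staging/stg_orders.sql" in project "my_project", the PurePath handling below gives
#   searched_path = "models"
#   relative_path = "staging/stg_orders.sql"
#   extension     = ".sql"
# and the resulting SourceFile's file_id is "my_project://models/staging/stg_orders.sql".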
searched_path/relative_path + + relative_path_parts = input_file_path.parts[1:] + relative_path = pathlib.PurePath("").joinpath(*relative_path_parts) + # Create FilePath object + input_file_path = FilePath( + searched_path=searched_path, + relative_path=str(relative_path), + modification_time=input_file.modification_time, + project_root=self.all_projects[project_name].project_root, + ) + + # Now use the extension and "searched_path" to determine which file_type + (file_types, file_type_lookup) = self.get_project_file_types(project_name) + parse_ft_for_extension = set() + parse_ft_for_path = set() + if extension in file_type_lookup["extensions"]: + parse_ft_for_extension = file_type_lookup["extensions"][extension] + if searched_path in file_type_lookup["paths"]: + parse_ft_for_path = file_type_lookup["paths"][searched_path] + if len(parse_ft_for_extension) == 0 or len(parse_ft_for_path) == 0: + fire_event(InputFileDiffError(category="not a project file", file_id=file_id)) + continue + parse_ft_set = parse_ft_for_extension.intersection(parse_ft_for_path) + if ( + len(parse_ft_set) != 1 + ): # There should only be one result for a path/extension combination + fire_event( + InputFileDiffError( + category="unable to resolve diff file location", file_id=file_id + ) + ) + continue + parse_ft = parse_ft_set.pop() + source_file_cls = SourceFile + if parse_ft == ParseFileType.Schema: + source_file_cls = SchemaSourceFile + source_file = source_file_cls( + path=input_file_path, + contents=input_file.content, + checksum=FileHash.from_contents(input_file.content), + project_name=project_name, + parse_file_type=parse_ft, + ) + if source_file_cls == SchemaSourceFile: + dfy = yaml_from_file(source_file) + if dfy: + validate_yaml(source_file.path.original_file_path, dfy) + source_file.dfy = dfy + else: + # don't include in files because no content + continue + self.files[source_file.file_id] = source_file + + def get_project_name(self, path): + # It's not currently possible to recognize any other project files, + # and it's an open issue how to handle deps. 
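# --- Illustration, not part of the diff: how the extension/path lookup and intersection
# above resolve a parse file type, assuming typical default project paths:
#   file_type_lookup["extensions"][".yml"] -> {ParseFileType.Schema}
#   file_type_lookup["paths"]["models"]    -> {ParseFileType.Model, ParseFileType.Schema, ...}
#   intersection                           -> {ParseFileType.Schema}   (read as SchemaSourceFile)
# while "models/my_model.sql" intersects the ".sql" parsers with the "models" parsers to
# give {ParseFileType.Model} and is read as a plain SourceFile.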
+ return self.root_project_name + + def get_project_file_types(self, project_name): + if project_name not in self.project_file_types: + file_types = get_file_types_for_project(self.all_projects[project_name]) + file_type_lookup = self.get_file_type_lookup(file_types) + self.project_file_types[project_name] = { + "file_types": file_types, + "file_type_lookup": file_type_lookup, + } + file_types = self.project_file_types[project_name]["file_types"] + file_type_lookup = self.project_file_types[project_name]["file_type_lookup"] + return (file_types, file_type_lookup) + + def get_file_type_lookup(self, file_types): + file_type_lookup = {"paths": {}, "extensions": {}} + for parse_ft, file_type in file_types.items(): + for path in file_type["paths"]: + if path not in file_type_lookup["paths"]: + file_type_lookup["paths"][path] = set() + file_type_lookup["paths"][path].add(parse_ft) + for extension in file_type["extensions"]: + if extension not in file_type_lookup["extensions"]: + file_type_lookup["extensions"][extension] = set() + file_type_lookup["extensions"][extension].add(parse_ft) + return file_type_lookup - project_files["SchemaParser"] = read_files_for_parser( - project, - files, - project.all_source_paths, - [".yml", ".yaml"], - ParseFileType.Schema, - saved_files, - dbt_ignore_spec, - ) - # Store the parser files for this particular project - parser_files[project.project_name] = project_files +def get_file_types_for_project(project): + file_types = { + ParseFileType.Macro: { + "paths": project.macro_paths, + "extensions": [".sql"], + "parser": "MacroParser", + }, + ParseFileType.Model: { + "paths": project.model_paths, + "extensions": [".sql", ".py"], + "parser": "ModelParser", + }, + ParseFileType.Snapshot: { + "paths": project.snapshot_paths, + "extensions": [".sql"], + "parser": "SnapshotParser", + }, + ParseFileType.Analysis: { + "paths": project.analysis_paths, + "extensions": [".sql"], + "parser": "AnalysisParser", + }, + ParseFileType.SingularTest: { + "paths": project.test_paths, + "extensions": [".sql"], + "parser": "SingularTestParser", + }, + ParseFileType.GenericTest: { + "paths": project.generic_test_paths, + "extensions": [".sql"], + "parser": "GenericTestParser", + }, + ParseFileType.Seed: { + "paths": project.seed_paths, + "extensions": [".csv"], + "parser": "SeedParser", + }, + ParseFileType.Documentation: { + "paths": project.docs_paths, + "extensions": [".md"], + "parser": "DocumentationParser", + }, + ParseFileType.Schema: { + "paths": project.all_source_paths, + "extensions": [".yml", ".yaml"], + "parser": "SchemaParser", + }, + } + return file_types diff --git a/core/dbt/tests/util.py b/core/dbt/tests/util.py index f777522f2cd..d26bd441b4f 100644 --- a/core/dbt/tests/util.py +++ b/core/dbt/tests/util.py @@ -192,6 +192,11 @@ def get_artifact(*paths): return dct +def write_artifact(dct, *paths): + json_output = json.dumps(dct) + write_file(json_output, *paths) + + # For updating yaml config files def update_config_file(updates, *paths): current_yaml = read_file(*paths) diff --git a/tests/functional/artifacts/expected_manifest.py b/tests/functional/artifacts/expected_manifest.py index a7ebf270ae6..1d36c538632 100644 --- a/tests/functional/artifacts/expected_manifest.py +++ b/tests/functional/artifacts/expected_manifest.py @@ -167,7 +167,8 @@ def checksum_file(path): silly things if we just open(..., 'r').encode('utf-8'). 
""" with open(path, "rb") as fp: - hashed = hashlib.sha256(fp.read()).hexdigest() + # We strip the file contents because we want the checksum to match the stored contents + hashed = hashlib.sha256(fp.read().strip()).hexdigest() return { "name": "sha256", "checksum": hashed, diff --git a/tests/functional/partial_parsing/test_file_diff.py b/tests/functional/partial_parsing/test_file_diff.py new file mode 100644 index 00000000000..a9f4c8fdd09 --- /dev/null +++ b/tests/functional/partial_parsing/test_file_diff.py @@ -0,0 +1,37 @@ +import os + +from dbt.tests.util import run_dbt, write_artifact + + +first_file_diff = { + "deleted": [], + "changed": [], + "added": [{"path": "models/model_one.sql", "content": "select 1 as fun"}], +} + + +second_file_diff = { + "deleted": [], + "changed": [], + "added": [{"path": "models/model_two.sql", "content": "select 123 as notfun"}], +} + + +class TestFileDiffs: + def test_file_diffs(self, project): + + os.environ["DBT_PP_FILE_DIFF_TEST"] = "true" + + run_dbt(["deps"]) + run_dbt(["seed"]) + + # We start with an empty project + results = run_dbt() + + write_artifact(first_file_diff, "file_diff.json") + results = run_dbt() + assert len(results) == 1 + + write_artifact(second_file_diff, "file_diff.json") + results = run_dbt() + assert len(results) == 2 diff --git a/tests/unit/test_events.py b/tests/unit/test_events.py index e70a095fa2b..3487ce737cb 100644 --- a/tests/unit/test_events.py +++ b/tests/unit/test_events.py @@ -172,6 +172,7 @@ def test_event_codes(self): types.HooksRunning(num_hooks=0, hook_type=""), types.FinishedRunningStats(stat_line="", execution="", execution_time=0), # I - Project parsing ====================== + types.InputFileDiffError(category="testing", file_id="my_file"), types.InvalidValueForField(field_name="test", field_value="test"), types.ValidationWarning(resource_type="model", field_name="access", node_name="my_macro"), types.ParsePerfInfoPath(path=""), @@ -333,7 +334,6 @@ def test_event_codes(self): types.MainKeyboardInterrupt(), types.MainEncounteredError(exc=""), types.MainStackTrace(stack_trace=""), - types.SystemErrorRetrievingModTime(path=""), types.SystemCouldNotWrite(path="", reason="", exc=""), types.SystemExecutingCmd(cmd=[""]), types.SystemStdOut(bmsg=b""),