Skip to content

Commit

Permalink
[Tooling] Add --exclude flag to Generator to support field removal te…
Browse files Browse the repository at this point in the history
…sting (#1411) (#1431)

* add --exclude flag to Generator
  • Loading branch information
djptek authored May 26, 2021
1 parent 97ce65c commit a00eb89
Show file tree
Hide file tree
Showing 17 changed files with 308 additions and 79 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.next.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Thanks, you're awesome :-) -->
#### Improvements

* Fix ecs GitHub repo link source branch #1393
* Add --exclude flag to Generator to support field removal testing #1411

#### Deprecated

Expand Down
36 changes: 36 additions & 0 deletions USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ relevant artifacts for their unique set of data sources.
* [Generator Options](#generator-options)
+ [Out](#out)
+ [Include](#include)
+ [Exclude](#exclude)
+ [Subset](#subset)
+ [Ref](#ref)
+ [Mapping & Template Settings](#mapping--template-settings)
Expand Down Expand Up @@ -192,6 +193,41 @@ Include can be used together with the `--ref` flag to merge custom fields into a

> NOTE: The `--include` mechanism will not validate custom YAML files prior to merging. This allows for modifying existing ECS fields in a custom schema without having to redefine all the mandatory field attributes.
#### Exclude

Use the `--exclude` flag to generate ephemeral ECS artifacts based on the current ECS schema field definitions minus fields considered for removal, e.g. to assess the impact of removing these. Warning! This is not the recommended route to remove a field permanently as it is not intended to be invoked during the build process. Definitive field removal should be implemented using a custom [Subset](#subset) or via the [RFC process](https://github.com/elastic/ecs/tree/master/rfcs/README.md). Example:

```
$ python scripts/generator.py --exclude=../my-project/my-exclude-file.yml
$ python scripts/generator.py --exclude="../my-project/schemas/a*.yml"
```

The `--exclude` flag expects a path to one or more YAML files using the same [file format](https://github.com/elastic/ecs/tree/master/schemas#fields-supported-in-schemasyml) as the ECS schema files. You can also use a subset, provided that relevant `name` and `fields` fields are preserved.

```
---
- name: log
fields:
- name: original
```

The root Field Set `name` must always be present and specified with no dots `.`. Subfields may be specified using dot notation, for example:

```
---
- name: log
fields:
- name: syslog.severity.name
```

Generate artifacts using `--exclude` to omit the specified field definitions, together with `--out` to place the generated files in the desired output directory:

```
$ python scripts/generator.py --exclude ../myproject/exclude-set.yml --out ../myproject/out/
Loading schemas from local files
Running generator. ECS version 1.11.0
```

#### Subset

If your indices will never populate particular ECS fields, there's no need to include those field definitions in your index mappings. The `--subset` argument allows for passing a subset definition YAML file which indicates which field sets or specific fields to include in the generated artifacts.
Expand Down
6 changes: 5 additions & 1 deletion scripts/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from schema import cleaner
from schema import finalizer
from schema import subset_filter
from schema import exclude_filter


def main():
Expand Down Expand Up @@ -52,6 +53,7 @@ def main():
cleaner.clean(fields, strict=args.strict)
finalizer.finalize(fields)
fields = subset_filter.filter(fields, args.subset, out_dir)
fields = exclude_filter.exclude(fields, args.exclude)
nested, flat = intermediate_files.generate(fields, os.path.join(out_dir, 'ecs'), default_dirs)

if args.intermediate_only:
Expand All @@ -61,7 +63,7 @@ def main():
es_template.generate(nested, ecs_generated_version, out_dir, args.mapping_settings)
es_template.generate_legacy(flat, ecs_generated_version, out_dir, args.template_settings, args.mapping_settings)
beats.generate(nested, ecs_generated_version, out_dir)
if args.include or args.subset:
if args.include or args.subset or args.exclude:
exit()

ecs_helpers.make_dirs(docs_dir)
Expand All @@ -74,6 +76,8 @@ def argument_parser():
Note that "--include experimental/schemas" will also respect this git ref.')
parser.add_argument('--include', nargs='+',
help='include user specified directory of custom field definitions')
parser.add_argument('--exclude', nargs='+',
help='exclude user specified subset of the schema')
parser.add_argument('--subset', nargs='+',
help='render a subset of the schema')
parser.add_argument('--out', action='store', help='directory to output the generated files')
Expand Down
4 changes: 2 additions & 2 deletions scripts/generators/beats.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,9 @@ def write_beats_yaml(beats_file, ecs_version, out_dir):


def file_header():
return '''
return """
# WARNING! Do not edit this file directly, it was generated by the ECS project,
# based on ECS version {version}.
# Please visit https:/elastic/ecs to suggest changes to ECS fields.
'''.lstrip()
""".lstrip()
4 changes: 2 additions & 2 deletions scripts/generators/ecs_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def yaml_load(filename):


def list_subtract(original, subtracted):
'''Subtract two lists. original = subtracted'''
"""Subtract two lists. original = subtracted"""
return [item for item in original if item not in subtracted]


Expand All @@ -175,7 +175,7 @@ def list_extract_keys(lst, key_name):


def is_intermediate(field):
'''Encapsulates the check to see if a field is an intermediate field or a "real" field.'''
"""Encapsulates the check to see if a field is an intermediate field or a "real" field."""
return ('intermediate' in field['field_details'] and field['field_details']['intermediate'])


Expand Down
4 changes: 2 additions & 2 deletions scripts/generators/es_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,13 +263,13 @@ def default_mapping_settings():


def es6_type_fallback(mappings):
'''
"""
Visits each leaf in mappings object and fallback to an
Elasticsearch 6.x supported type.
Since a field like `wildcard` won't have the same defaults as
a `keyword` field, we must add any missing defaults.
'''
"""

for (name, details) in mappings.items():
if 'type' in details:
Expand Down
12 changes: 6 additions & 6 deletions scripts/generators/intermediate_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@ def generate(fields, out_dir, default_dirs):


def generate_flat_fields(fields):
'''Generate ecs_flat.yml'''
"""Generate ecs_flat.yml"""
filtered = remove_non_root_reusables(fields)
flattened = {}
visitor.visit_fields_with_memo(filtered, accumulate_field, flattened)
return flattened


def accumulate_field(details, memo):
'''Visitor function that accumulates all field details in the memo dict'''
"""Visitor function that accumulates all field details in the memo dict"""
if 'schema_details' in details or ecs_helpers.is_intermediate(details):
return
field_details = copy.deepcopy(details['field_details'])
Expand All @@ -39,7 +39,7 @@ def accumulate_field(details, memo):


def generate_nested_fields(fields):
'''Generate ecs_nested.yml'''
"""Generate ecs_nested.yml"""
nested = {}
# Flatten each field set, but keep all resulting fields nested under their
# parent/host field set.
Expand Down Expand Up @@ -71,13 +71,13 @@ def generate_nested_fields(fields):


def remove_internal_attributes(field_details):
'''Remove attributes only relevant to the deeply nested structure, but not to ecs_flat/nested.yml.'''
"""Remove attributes only relevant to the deeply nested structure, but not to ecs_flat/nested.yml."""
field_details.pop('node_name', None)
field_details.pop('intermediate', None)


def remove_non_root_reusables(fields_nested):
'''
"""
Remove field sets that have top_level=false from the root of the field definitions.
This attribute means they're only meant to be in the "reusable/expected" locations
Expand All @@ -87,7 +87,7 @@ def remove_non_root_reusables(fields_nested):
still needs to keep all field sets at the root of the YAML file, as it
the official information about each field set. It's the responsibility of
users consuming ecs_nested.yml to skip the field sets with top_level=false.
'''
"""
fields = {}
for (name, field) in fields_nested.items():
if 'reusable' not in field['schema_details'] or field['schema_details']['reusable']['top_level']:
Expand Down
10 changes: 5 additions & 5 deletions scripts/schema/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def schema_cleanup(schema):


def schema_mandatory_attributes(schema):
'''Ensures for the presence of the mandatory schema attributes and raises if any are missing'''
"""Ensures for the presence of the mandatory schema attributes and raises if any are missing"""
current_schema_attributes = sorted(list(schema['field_details'].keys()) +
list(schema['schema_details'].keys()))
missing_attributes = ecs_helpers.list_subtract(SCHEMA_MANDATORY_ATTRIBUTES, current_schema_attributes)
Expand All @@ -74,7 +74,7 @@ def schema_mandatory_attributes(schema):


def schema_assertions_and_warnings(schema):
'''Additional checks on a fleshed out schema'''
"""Additional checks on a fleshed out schema"""
single_line_short_description(schema, strict=strict_mode)
if 'beta' in schema['field_details']:
single_line_beta_description(schema, strict=strict_mode)
Expand Down Expand Up @@ -143,7 +143,7 @@ def field_defaults(field):


def field_or_multi_field_datatype_defaults(field_details):
'''Sets datatype-related defaults on a canonical field or multi-field entries.'''
"""Sets datatype-related defaults on a canonical field or multi-field entries."""
if field_details['type'] == 'keyword':
field_details.setdefault('ignore_above', 1024)
if field_details['type'] == 'text':
Expand All @@ -160,7 +160,7 @@ def field_or_multi_field_datatype_defaults(field_details):


def field_mandatory_attributes(field):
'''Ensures for the presence of the mandatory field attributes and raises if any are missing'''
"""Ensures for the presence of the mandatory field attributes and raises if any are missing"""
if ecs_helpers.is_intermediate(field):
return
current_field_attributes = sorted(field['field_details'].keys())
Expand All @@ -180,7 +180,7 @@ def field_mandatory_attributes(field):


def field_assertions_and_warnings(field):
'''Additional checks on a fleshed out field'''
"""Additional checks on a fleshed out field"""
if not ecs_helpers.is_intermediate(field):
# check short description length if in strict mode
single_line_short_description(field, strict=strict_mode)
Expand Down
78 changes: 78 additions & 0 deletions scripts/schema/exclude_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from schema import loader

# This script should be run downstream of the subset filters - it takes
# all ECS and custom fields already loaded by the latter and explicitly
# removes a subset, for example, to simulate impact of future removals


def exclude(fields, exclude_file_globs):
    """Entry point of the exclude filter.

    Loads the exclude definitions matching `exclude_file_globs` and removes
    the listed fields from `fields`. Returns `fields` unchanged when no
    exclude files are given.
    """
    definitions = load_exclude_definitions(exclude_file_globs)
    if not definitions:
        return fields
    return exclude_fields(fields, definitions)


def long_path(path_as_list):
    """Join an ordered list of node names into a dotted path string.

    e.g. ['log', 'syslog', 'severity'] -> 'log.syslog.severity'
    """
    # The original identity comprehension ([e for e in ...]) was redundant;
    # str.join consumes the list directly.
    return '.'.join(path_as_list)


def pop_field(fields, node_path, path, removed):
    """Pop a field from a yaml-derived dict, walking `node_path` (an ordered
    list of node names, consumed in place).

    :param fields: nested dict of field definitions; inner levels live under
                   each entry's 'fields' key
    :param node_path: remaining path segments to traverse (mutated via pop(0))
    :param path: the complete original path, used for messages/return value
    :param removed: flat names removed so far in this run (entries may be None)
    :returns: the removed field's flat dotted name, or None when the field is
              already gone because an ancestor was removed earlier
    :raises ValueError: when the requested field cannot be found
    """
    if node_path[0] in fields:
        if len(node_path) == 1:
            flat_name = long_path(path)
            fields.pop(node_path[0])
            return flat_name
        inner_field = node_path.pop(0)
        if 'fields' not in fields[inner_field]:
            raise ValueError(
                '--exclude specified, but no path to field {} found'.format(long_path(path)))
        popped = pop_field(fields[inner_field]['fields'], node_path, path, removed)
        # if object field with no remaining fields and not 'base', pop it
        if fields[inner_field]['fields'] == {} and inner_field != 'base':
            fields.pop(inner_field)
        return popped
    this_long_path = long_path(path)
    # Tolerate a missing field when one of its ancestors was already removed.
    # NOTE: the original comprehension variable was named `long_path`, which
    # shadowed the helper function above; renamed to avoid the shadowing, and
    # `!= None` replaced with the idiomatic `is not None`.
    if not any(this_long_path.startswith(prefix)
               for prefix in removed if prefix is not None):
        raise ValueError('--exclude specified, but no field {} found'.format(this_long_path))


def exclude_trace_path(fields, item, path, removed):
    """Traverse paths to one or more nodes in a yaml-derived dict and remove
    each matched field.

    :param fields: nested dict of field definitions to remove entries from
    :param item: list of exclude entries; each must carry a 'name' (dots allowed)
                 and must NOT carry a nested 'fields' key
    :param path: path segments accumulated so far (the field set name)
    :param removed: accumulator of flat names removed so far
    :raises ValueError: when an entry carries a nested 'fields' key
    """
    for list_item in item:
        # Guard first: nested 'fields' under a leaf entry is unsupported.
        # (idiom fix: `'fields' not in` instead of `not 'fields' in`)
        if 'fields' in list_item:
            raise ValueError('--exclude specified, can\'t parse fields in file {}'.format(item))
        node_path = path.copy()
        # cater for name.with.dots
        for name in list_item['name'].split('.'):
            node_path.append(name)
        parent = node_path[0]
        removed.append(pop_field(fields, node_path, node_path.copy(), removed))
        # if parent field set has no remaining fields and is not 'base', pop it
        if parent != 'base' and parent in fields and len(fields[parent]['fields']) == 0:
            fields.pop(parent)


def exclude_fields(fields, excludes):
    """Traverse `fields` and eliminate every field matched by `excludes`."""
    for exclude_list in excludes or []:
        for entry in exclude_list:
            exclude_trace_path(fields, entry['fields'], [entry['name']], [])
    return fields


def load_exclude_definitions(file_globs):
    """Load exclude definitions from the given file globs.

    Returns an empty list when no globs were provided; raises ValueError when
    globs were provided but matched no exclusion definitions.
    """
    if not file_globs:
        return []
    definitions = loader.load_definitions(file_globs)
    if not definitions:
        raise ValueError('--exclude specified, but no exclusions found in {}'.format(file_globs))
    return definitions
20 changes: 10 additions & 10 deletions scripts/schema/finalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


def finalize(fields):
'''Intended entrypoint of the finalizer.'''
"""Intended entrypoint of the finalizer."""
perform_reuse(fields)
calculate_final_values(fields)

Expand All @@ -46,7 +46,7 @@ def order_reuses(fields):


def perform_reuse(fields):
'''Performs field reuse in two phases'''
"""Performs field reuse in two phases"""
foreign_reuses, self_nestings = order_reuses(fields)

# Phase 1: foreign reuse
Expand Down Expand Up @@ -99,11 +99,11 @@ def perform_reuse(fields):


def ensure_valid_reuse(reused_schema, destination_schema=None):
'''
"""
Raise if either the reused schema or destination schema have root=true.
Second param is optional, if testing for a self-nesting (where source=destination).
'''
"""
if reused_schema['schema_details']['root']:
msg = "Schema {} has attribute root=true and therefore cannot be reused.".format(
reused_schema['field_details']['name'])
Expand All @@ -115,7 +115,7 @@ def ensure_valid_reuse(reused_schema, destination_schema=None):


def append_reused_here(reused_schema, reuse_entry, destination_schema):
'''Captures two ways of denoting what field sets are reused under a given field set'''
"""Captures two ways of denoting what field sets are reused under a given field set"""
# Legacy, too limited
destination_schema['schema_details'].setdefault('nestings', [])
destination_schema['schema_details']['nestings'] = sorted(
Expand All @@ -136,15 +136,15 @@ def append_reused_here(reused_schema, reuse_entry, destination_schema):


def set_original_fieldset(fields, original_fieldset):
'''Recursively set the 'original_fieldset' attribute for all fields in a group of fields'''
"""Recursively set the 'original_fieldset' attribute for all fields in a group of fields"""
def func(details):
# Don't override if already set (e.g. 'group' for user.group.* fields)
details['field_details'].setdefault('original_fieldset', original_fieldset)
visitor.visit_fields(fields, field_func=func)


def field_group_at_path(dotted_path, fields):
'''Returns the ['fields'] hash at the dotted_path.'''
"""Returns the ['fields'] hash at the dotted_path."""
path = dotted_path.split('.')
nesting = fields
for next_field in path:
Expand All @@ -163,17 +163,17 @@ def field_group_at_path(dotted_path, fields):


def calculate_final_values(fields):
'''
"""
This function navigates all fields recursively.
It populates a few more values for the fields, especially path-based values
like flat_name.
'''
"""
visitor.visit_fields_with_path(fields, field_finalizer)


def field_finalizer(details, path):
'''This is the function called by the visitor to perform the work of calculate_final_values'''
"""This is the function called by the visitor to perform the work of calculate_final_values"""
name_array = path + [details['field_details']['node_name']]
flat_name = '.'.join(name_array)
details['field_details']['flat_name'] = flat_name
Expand Down
Loading

0 comments on commit a00eb89

Please sign in to comment.