[ML] Add support for date_nanos fields in find_file_structure (#62048)

Now that #61324 is merged, the find_file_structure endpoint can suggest date_nanos fields for timestamps whose format provides finer than millisecond precision.
elastic · Sep 8, 2020 · b263667 · b263667
1 parent 2bb5716
commit b263667
Show file tree

Hide file tree

Showing 9 changed files with 151 additions and 22 deletions.
diff --git a/...ain/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/...ain/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java
@@ -149,14 +149,15 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String
  .setJavaTimestampFormats(timeField.v2().getJavaTimestampFormats())
  .setNeedClientTimezone(needClientTimeZone)
  .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings,
- mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone))
+ mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone,
+ timeField.v2().needNanosecondPrecision()))
  .setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage, delimiterPattern,
  quotePattern, mappings, timeField.v1(), timeField.v2()));
 
- mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
+ mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
  } else {
  structureBuilder.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(),
- csvProcessorSettings, mappings, null, null, false));
+ csvProcessorSettings, mappings, null, null, false, false));
  structureBuilder.setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage,
  delimiterPattern, quotePattern, mappings, null, null));
  }

diff --git a/...n/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java b/...n/ml/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureUtils.java
@@ -36,6 +36,7 @@ public final class FileStructureUtils {
  public static final String MAPPING_PROPERTIES_SETTING = "properties";
  public static final Map<String, String> DATE_MAPPING_WITHOUT_FORMAT =
  Collections.singletonMap(MAPPING_TYPE_SETTING, "date");
+ public static final String NANOSECOND_DATE_OUTPUT_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSSXXX";
  public static final Set<String> CONVERTIBLE_TYPES =
  Collections.unmodifiableSet(Sets.newHashSet("integer", "long", "float", "double", "boolean"));
 
@@ -397,13 +398,15 @@ static boolean isMoreLikelyTextThanKeyword(String str) {
  * @param timestampFormats Timestamp formats to be used for parsing {@code timestampField}.
  * May be <code>null</code> if {@code timestampField} is also <code>null</code>.
  * @param needClientTimezone Is the timezone of the client supplying data to ingest required to uniquely parse the timestamp?
+ * @param needNanosecondPrecision Does the timestamp have more than millisecond accuracy?
  * @return The ingest pipeline definition, or <code>null</code> if none is required.
  */
  public static Map<String, Object> makeIngestPipelineDefinition(String grokPattern, Map<String, String> customGrokPatternDefinitions,
  Map<String, Object> csvProcessorSettings,
  Map<String, Object> mappingsForConversions,
  String timestampField, List<String> timestampFormats,
- boolean needClientTimezone) {
+ boolean needClientTimezone,
+ boolean needNanosecondPrecision) {
 
  if (grokPattern == null && csvProcessorSettings == null && timestampField == null) {
  return null;
@@ -437,6 +440,9 @@ public static Map<String, Object> makeIngestPipelineDefinition(String grokPatter
  dateProcessorSettings.put("timezone", "{{ " + BEAT_TIMEZONE_FIELD + " }}");
  }
  dateProcessorSettings.put("formats", timestampFormats);
+ if (needNanosecondPrecision) {
+ dateProcessorSettings.put("output_format", NANOSECOND_DATE_OUTPUT_FORMAT);
+ }
  processors.add(Collections.singletonMap("date", dateProcessorSettings));
  }
 

diff --git a/...c/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java b/...c/main/java/org/elasticsearch/xpack/ml/filestructurefinder/NdJsonFileStructureFinder.java
@@ -64,15 +64,16 @@ static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> expl
  .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null,
  // Note: no convert processors are added based on mappings for NDJSON input
  // because it's reasonable that _source matches the supplied JSON precisely
- Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone));
+ Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone,
+ timeField.v2().needNanosecondPrecision()));
  }
 
  Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
  FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker);
 
- SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
+ Map<String, Object> mappings = mappingsAndFieldStats.v1();
  if (timeField != null) {
- mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
+ mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
  }
 
  if (mappingsAndFieldStats.v2() != null) {

diff --git a/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java b/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/TextLogFileStructureFinder.java
@@ -111,7 +111,7 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex
  Map<String, String> messageMapping = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text");
  SortedMap<String, Object> mappings = new TreeMap<>();
  mappings.put("message", messageMapping);
- mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
+ mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timestampFormatFinder.getEsDateMappingTypeWithoutFormat());
 
  SortedMap<String, FieldStats> fieldStats = new TreeMap<>();
  fieldStats.put("message", FileStructureUtils.calculateFieldStats(messageMapping, sampleMessages, timeoutChecker));
@@ -151,7 +151,8 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex
  .setNeedClientTimezone(needClientTimeZone)
  .setGrokPattern(grokPattern)
  .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(grokPattern, customGrokPatternDefinitions, null, mappings,
- interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone))
+ interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone,
+ timestampFormatFinder.needNanosecondPrecision()))
  .setMappings(mappings)
  .setFieldStats(fieldStats)
  .setExplanation(explanation)

diff --git a/...l/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java b/...l/src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/TimestampFormatFinder.java
@@ -53,6 +53,8 @@ public final class TimestampFormatFinder {
  private static final Logger logger = LogManager.getLogger(TimestampFormatFinder.class);
  private static final String PUNCTUATION_THAT_NEEDS_ESCAPING_IN_REGEX = "\\|()[]{}^$.*?";
  private static final String FRACTIONAL_SECOND_SEPARATORS = ":.,";
+ private static final Pattern FRACTIONAL_SECOND_INTERPRETER =
+ Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})($|[Z+-])");
  private static final char INDETERMINATE_FIELD_PLACEHOLDER = '?';
  // The ? characters in this must match INDETERMINATE_FIELD_PLACEHOLDER
  // above, but they're literals in this regex to aid readability
@@ -702,6 +704,20 @@ public List<String> getJavaTimestampFormats() {
  (matchedFormats.size() > 1) ? matchedFormats.get(0) : null);
  }
 
+ /**
+ * Reports whether any considered timestamp match has finer than millisecond precision; used to decide between "date" and "date_nanos" as the index mapping type.
+ * @return <code>true</code> if at least one considered match has nanosecond precision; <code>false</code> otherwise, including when no formats have been matched.
+ */
+ public boolean needNanosecondPrecision() {
+ if (matchedFormats.isEmpty()) {
+ // If errorOnNoTimestamp is set and we get here it means no samples have been added, which is likely a programmer mistake
+ assert errorOnNoTimestamp == false;
+ return false; // nothing has been matched, so nothing can require nanoseconds
+ }
+ return matches.stream().filter(match -> matchedFormats.size() < 2 || matchedFormats.get(0).canMergeWith(match.timestampFormat))
+ .anyMatch(match -> match.hasNanosecondPrecision); // with 2+ matched formats, only matches mergeable with the first format are considered
+ }
+
  /**
  * Given a list of timestamp formats that might contain indeterminate day/month parts,
  * return the corresponding pattern with the placeholders replaced with concrete
@@ -947,6 +963,14 @@ public boolean hasTimezoneDependentParsing() {
  .anyMatch(match -> match.hasTimezoneDependentParsing);
  }
 
+ /**
+ * Mapping for a date field that needs no "format" setting: the @timestamp field will always have been parsed into epoch format, so only its resolution matters.
+ * @return A single-entry map from the mapping type setting to "date_nanos" when {@link #needNanosecondPrecision()} is true, otherwise to "date".
+ */
+ public Map<String, String> getEsDateMappingTypeWithoutFormat() {
+ return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, needNanosecondPrecision() ? "date_nanos" : "date");
+ }
+
  /**
  * Sometimes Elasticsearch mappings for dates need to include the format.
  * This method returns appropriate mappings settings: at minimum "type" : "date",
@@ -959,7 +983,7 @@ public Map<String, String> getEsDateMappingTypeWithFormat() {
  return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword");
  }
  Map<String, String> mapping = new LinkedHashMap<>();
- mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date");
+ mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, needNanosecondPrecision() ? "date_nanos" : "date");
  String formats = javaTimestampFormats.stream().map(format -> {
  switch (format) {
  case "ISO8601":
@@ -1233,6 +1257,7 @@ static final class TimestampMatch {
  final int secondIndeterminateDateNumber;
 
  final boolean hasTimezoneDependentParsing;
+ final boolean hasNanosecondPrecision;
 
  /**
  * Text that came after the timestamp in the matched field/message.
@@ -1250,6 +1275,8 @@ static final class TimestampMatch {
  this.secondIndeterminateDateNumber = indeterminateDateNumbers[1];
  this.hasTimezoneDependentParsing = requiresTimezoneDependentParsing(timestampFormat.rawJavaTimestampFormats.get(0),
  matchedDate);
+ this.hasNanosecondPrecision = matchHasNanosecondPrecision(timestampFormat.rawJavaTimestampFormats.get(0),
+ matchedDate);
  this.epilogue = Objects.requireNonNull(epilogue);
  }
 
@@ -1259,6 +1286,7 @@ static final class TimestampMatch {
  this.firstIndeterminateDateNumber = toCopyExceptFormat.firstIndeterminateDateNumber;
  this.secondIndeterminateDateNumber = toCopyExceptFormat.secondIndeterminateDateNumber;
  this.hasTimezoneDependentParsing = toCopyExceptFormat.hasTimezoneDependentParsing;
+ this.hasNanosecondPrecision = toCopyExceptFormat.hasNanosecondPrecision;
  this.epilogue = toCopyExceptFormat.epilogue;
  }
 
@@ -1285,6 +1313,43 @@ static boolean requiresTimezoneDependentParsing(String format, String matchedDat
  }
  }
 
+ static boolean matchHasNanosecondPrecision(String format, String matchedDate) { // true when the match has more than 3 fractional-second digits
+ switch (format) {
+ case "ISO8601":
+ Matcher matcher = FRACTIONAL_SECOND_INTERPRETER.matcher(matchedDate); // here the matched text, not the format string, is inspected
+ return matcher.find() && matcher.group(2).length() > 3; // group 2 is the run of fractional digits; more than 3 means finer than milliseconds
+ case "UNIX_MS":
+ case "UNIX":
+ return false; // these two formats are never reported as needing nanosecond precision
+ case "TAI64N":
+ return true; // this format is always reported as needing nanosecond precision
+ default:
+ boolean notQuoted = true; // true while scanning outside a single-quoted literal section of the format
+ int consecutiveSs = 0; // length of the current run of unquoted 'S' characters
+ for (int pos = 0; pos < format.length(); ++pos) {
+ char curChar = format.charAt(pos);
+ if (curChar == '\'') {
+ // Literal single quotes are escaped by using two consecutive single quotes.
+ // Technically this code does the wrong thing in this case, as it flips quoting
+ // from off to on or on to off and then back. However, since by definition there
+ // is nothing in between the consecutive single quotes in this case, the net
+ // effect is correct and good enough for what this method is doing.
+ notQuoted = !notQuoted;
+ consecutiveSs = 0; // a quote character always ends the current run of S's
+ } else if (notQuoted) {
+ if (curChar == 'S') {
+ if (++consecutiveSs > 3) { // a fourth consecutive unquoted S asks for more than millisecond precision
+ return true;
+ }
+ } else {
+ consecutiveSs = 0; // any other unquoted character breaks the run
+ }
+ }
+ }
+ return false; // no unquoted run of more than three S's was found
+ }
+ }
+
  static int[] parseIndeterminateDateNumbers(String matchedDate, List<String> rawJavaTimestampFormats) {
  int[] indeterminateDateNumbers = { -1, -1 };
 
@@ -1368,7 +1433,6 @@ public String toString() {
  */
  static final class CandidateTimestampFormat {
 
- private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})$");
  // This means that in the case of a literal Z, XXX is preferred
  private static final Pattern TRAILING_OFFSET_WITHOUT_COLON_FINDER = Pattern.compile("[+-]\\d{4}$");
 

diff --git a/.../src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java b/.../src/main/java/org/elasticsearch/xpack/ml/filestructurefinder/XmlFileStructureFinder.java
@@ -104,7 +104,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanatio
  .setNeedClientTimezone(needClientTimeZone)
  .setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null,
  Collections.emptyMap(), topLevelTag + "." + timeField.v1(), timeField.v2().getJavaTimestampFormats(),
- needClientTimeZone));
+ needClientTimeZone, timeField.v2().needNanosecondPrecision()));
  }
 
  Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
@@ -114,14 +114,14 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanatio
  structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
  }
 
- SortedMap<String, Object> innerMappings = mappingsAndFieldStats.v1();
+ Map<String, Object> innerMappings = mappingsAndFieldStats.v1();
  Map<String, Object> secondLevelProperties = new LinkedHashMap<>();
  secondLevelProperties.put(FileStructureUtils.MAPPING_TYPE_SETTING, "object");
  secondLevelProperties.put(FileStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
  SortedMap<String, Object> outerMappings = new TreeMap<>();
  outerMappings.put(topLevelTag, secondLevelProperties);
  if (timeField != null) {
- outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
+ outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
  }
 
  FileStructure structure = structureBuilder