Skip to content

Commit

Permalink
[ML] Add support for date_nanos fields in find_file_structure (#62048)
Browse files Browse the repository at this point in the history
Now that #61324 is merged it is possible for the find_file_structure
endpoint to suggest using date_nanos fields for timestamps where
the timestamp format provides greater than millisecond accuracy.
  • Loading branch information
droberts195 committed Sep 8, 2020
1 parent 2bb5716 commit b263667
Show file tree
Hide file tree
Showing 9 changed files with 151 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -149,14 +149,15 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String
.setJavaTimestampFormats(timeField.v2().getJavaTimestampFormats())
.setNeedClientTimezone(needClientTimeZone)
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), csvProcessorSettings,
mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone))
mappings, timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone,
timeField.v2().needNanosecondPrecision()))
.setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage, delimiterPattern,
quotePattern, mappings, timeField.v1(), timeField.v2()));

mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
} else {
structureBuilder.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(),
csvProcessorSettings, mappings, null, null, false));
csvProcessorSettings, mappings, null, null, false, false));
structureBuilder.setMultilineStartPattern(makeMultilineStartPattern(explanation, columnNamesList, maxLinesPerMessage,
delimiterPattern, quotePattern, mappings, null, null));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ public final class FileStructureUtils {
public static final String MAPPING_PROPERTIES_SETTING = "properties";
public static final Map<String, String> DATE_MAPPING_WITHOUT_FORMAT =
Collections.singletonMap(MAPPING_TYPE_SETTING, "date");
public static final String NANOSECOND_DATE_OUTPUT_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSSSSSSSSXXX";
public static final Set<String> CONVERTIBLE_TYPES =
Collections.unmodifiableSet(Sets.newHashSet("integer", "long", "float", "double", "boolean"));

Expand Down Expand Up @@ -397,13 +398,15 @@ static boolean isMoreLikelyTextThanKeyword(String str) {
* @param timestampFormats Timestamp formats to be used for parsing {@code timestampField}.
* May be <code>null</code> if {@code timestampField} is also <code>null</code>.
* @param needClientTimezone Is the timezone of the client supplying data to ingest required to uniquely parse the timestamp?
* @param needNanosecondPrecision Does the timestamp have more than millisecond accuracy?
* @return The ingest pipeline definition, or <code>null</code> if none is required.
*/
public static Map<String, Object> makeIngestPipelineDefinition(String grokPattern, Map<String, String> customGrokPatternDefinitions,
Map<String, Object> csvProcessorSettings,
Map<String, Object> mappingsForConversions,
String timestampField, List<String> timestampFormats,
boolean needClientTimezone) {
boolean needClientTimezone,
boolean needNanosecondPrecision) {

if (grokPattern == null && csvProcessorSettings == null && timestampField == null) {
return null;
Expand Down Expand Up @@ -437,6 +440,9 @@ public static Map<String, Object> makeIngestPipelineDefinition(String grokPatter
dateProcessorSettings.put("timezone", "{{ " + BEAT_TIMEZONE_FIELD + " }}");
}
dateProcessorSettings.put("formats", timestampFormats);
if (needNanosecondPrecision) {
dateProcessorSettings.put("output_format", NANOSECOND_DATE_OUTPUT_FORMAT);
}
processors.add(Collections.singletonMap("date", dateProcessorSettings));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,16 @@ static NdJsonFileStructureFinder makeNdJsonFileStructureFinder(List<String> expl
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null,
// Note: no convert processors are added based on mappings for NDJSON input
// because it's reasonable that _source matches the supplied JSON precisely
Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone));
Collections.emptyMap(), timeField.v1(), timeField.v2().getJavaTimestampFormats(), needClientTimeZone,
timeField.v2().needNanosecondPrecision()));
}

Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker);

SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
Map<String, Object> mappings = mappingsAndFieldStats.v1();
if (timeField != null) {
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
}

if (mappingsAndFieldStats.v2() != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex
Map<String, String> messageMapping = Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "text");
SortedMap<String, Object> mappings = new TreeMap<>();
mappings.put("message", messageMapping);
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
mappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timestampFormatFinder.getEsDateMappingTypeWithoutFormat());

SortedMap<String, FieldStats> fieldStats = new TreeMap<>();
fieldStats.put("message", FileStructureUtils.calculateFieldStats(messageMapping, sampleMessages, timeoutChecker));
Expand Down Expand Up @@ -151,7 +151,8 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex
.setNeedClientTimezone(needClientTimeZone)
.setGrokPattern(grokPattern)
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(grokPattern, customGrokPatternDefinitions, null, mappings,
interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone))
interimTimestampField, timestampFormatFinder.getJavaTimestampFormats(), needClientTimeZone,
timestampFormatFinder.needNanosecondPrecision()))
.setMappings(mappings)
.setFieldStats(fieldStats)
.setExplanation(explanation)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ public final class TimestampFormatFinder {
private static final Logger logger = LogManager.getLogger(TimestampFormatFinder.class);
private static final String PUNCTUATION_THAT_NEEDS_ESCAPING_IN_REGEX = "\\|()[]{}^$.*?";
private static final String FRACTIONAL_SECOND_SEPARATORS = ":.,";
private static final Pattern FRACTIONAL_SECOND_INTERPRETER =
Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})($|[Z+-])");
private static final char INDETERMINATE_FIELD_PLACEHOLDER = '?';
// The ? characters in this must match INDETERMINATE_FIELD_PLACEHOLDER
// above, but they're literals in this regex to aid readability
Expand Down Expand Up @@ -702,6 +704,20 @@ public List<String> getJavaTimestampFormats() {
(matchedFormats.size() > 1) ? matchedFormats.get(0) : null);
}

/**
 * Reports whether any relevant observed timestamp was flagged as carrying nanosecond precision.
 * The answer is used to choose between the "date" and "date_nanos" index mapping types.
 * <p>
 * When two or more formats have been matched, only matches whose format can be merged with the
 * first matched format are taken into account; with fewer than two matched formats every match counts.
 * @return <code>true</code> if at least one relevant match has its nanosecond precision flag set,
 *         <code>false</code> otherwise (including when no formats have been matched at all).
 */
public boolean needNanosecondPrecision() {
    if (matchedFormats.isEmpty()) {
        // Reaching this point with errorOnNoTimestamp set means no samples were ever added,
        // which is most likely a programming error in the caller
        assert errorOnNoTimestamp == false;
        return false;
    }
    // With fewer than two matched formats there is nothing to merge against, so every match is relevant
    final boolean everyMatchIsRelevant = matchedFormats.size() < 2;
    return matches.stream().anyMatch(candidate ->
        (everyMatchIsRelevant || matchedFormats.get(0).canMergeWith(candidate.timestampFormat))
            && candidate.hasNanosecondPrecision);
}

/**
* Given a list of timestamp formats that might contain indeterminate day/month parts,
* return the corresponding pattern with the placeholders replaced with concrete
Expand Down Expand Up @@ -947,6 +963,14 @@ public boolean hasTimezoneDependentParsing() {
.anyMatch(match -> match.hasTimezoneDependentParsing);
}

/**
 * Builds the mapping for a timestamp field that does not need an explicit format.
 * The only decision to make is whether the type is "date_nanos" (nanosecond precision
 * is needed) or plain "date".
 * @return A single-entry map from the mapping type setting to "date_nanos" or "date".
 */
public Map<String, String> getEsDateMappingTypeWithoutFormat() {
    final String mappingType = needNanosecondPrecision() ? "date_nanos" : "date";
    return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, mappingType);
}

/**
* Sometimes Elasticsearch mappings for dates need to include the format.
* This method returns appropriate mappings settings: at minimum "type" : "date",
Expand All @@ -959,7 +983,7 @@ public Map<String, String> getEsDateMappingTypeWithFormat() {
return Collections.singletonMap(FileStructureUtils.MAPPING_TYPE_SETTING, "keyword");
}
Map<String, String> mapping = new LinkedHashMap<>();
mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, "date");
mapping.put(FileStructureUtils.MAPPING_TYPE_SETTING, needNanosecondPrecision() ? "date_nanos" : "date");
String formats = javaTimestampFormats.stream().map(format -> {
switch (format) {
case "ISO8601":
Expand Down Expand Up @@ -1233,6 +1257,7 @@ static final class TimestampMatch {
final int secondIndeterminateDateNumber;

final boolean hasTimezoneDependentParsing;
final boolean hasNanosecondPrecision;

/**
* Text that came after the timestamp in the matched field/message.
Expand All @@ -1250,6 +1275,8 @@ static final class TimestampMatch {
this.secondIndeterminateDateNumber = indeterminateDateNumbers[1];
this.hasTimezoneDependentParsing = requiresTimezoneDependentParsing(timestampFormat.rawJavaTimestampFormats.get(0),
matchedDate);
this.hasNanosecondPrecision = matchHasNanosecondPrecision(timestampFormat.rawJavaTimestampFormats.get(0),
matchedDate);
this.epilogue = Objects.requireNonNull(epilogue);
}

Expand All @@ -1259,6 +1286,7 @@ static final class TimestampMatch {
this.firstIndeterminateDateNumber = toCopyExceptFormat.firstIndeterminateDateNumber;
this.secondIndeterminateDateNumber = toCopyExceptFormat.secondIndeterminateDateNumber;
this.hasTimezoneDependentParsing = toCopyExceptFormat.hasTimezoneDependentParsing;
this.hasNanosecondPrecision = toCopyExceptFormat.hasNanosecondPrecision;
this.epilogue = toCopyExceptFormat.epilogue;
}

Expand All @@ -1285,6 +1313,43 @@ static boolean requiresTimezoneDependentParsing(String format, String matchedDat
}
}

/**
 * Decides whether a matched timestamp carries more than millisecond precision.
 * <ul>
 * <li>"TAI64N" always does.</li>
 * <li>"UNIX" and "UNIX_MS" never do.</li>
 * <li>"ISO8601" does if the fractional seconds found in {@code matchedDate} have more than 3 digits.</li>
 * <li>Any other value is treated as a Java timestamp pattern, which does if it contains a run of
 *     more than 3 unquoted <code>S</code> letters.</li>
 * </ul>
 * @param format      The format name or Java timestamp pattern that matched.
 * @param matchedDate The text that was matched; only inspected for the "ISO8601" format.
 * @return <code>true</code> if the timestamp has greater than millisecond precision.
 */
static boolean matchHasNanosecondPrecision(String format, String matchedDate) {
    switch (format) {
        case "TAI64N":
            return true;
        case "UNIX":
        case "UNIX_MS":
            return false;
        case "ISO8601": {
            Matcher fractionMatcher = FRACTIONAL_SECOND_INTERPRETER.matcher(matchedDate);
            if (fractionMatcher.find() == false) {
                return false;
            }
            // Group 2 holds the fractional second digits; more than 3 means sub-millisecond
            return fractionMatcher.group(2).length() > 3;
        }
        default:
            break;
    }

    // Anything else is a Java pattern: look for more than 3 consecutive unquoted 'S' letters
    boolean insideQuotes = false;
    int fractionLetterRun = 0;
    for (char letter : format.toCharArray()) {
        if (letter == '\'') {
            // An escaped literal quote is written as two consecutive single quotes. Toggling
            // twice is not strictly the right interpretation, but because nothing sits between
            // the two quotes the net effect is the same, which is good enough here.
            insideQuotes = !insideQuotes;
            fractionLetterRun = 0;
            continue;
        }
        if (insideQuotes) {
            continue;
        }
        if (letter != 'S') {
            fractionLetterRun = 0;
            continue;
        }
        fractionLetterRun++;
        if (fractionLetterRun > 3) {
            return true;
        }
    }
    return false;
}

static int[] parseIndeterminateDateNumbers(String matchedDate, List<String> rawJavaTimestampFormats) {
int[] indeterminateDateNumbers = { -1, -1 };

Expand Down Expand Up @@ -1368,7 +1433,6 @@ public String toString() {
*/
static final class CandidateTimestampFormat {

private static final Pattern FRACTIONAL_SECOND_INTERPRETER = Pattern.compile("([" + FRACTIONAL_SECOND_SEPARATORS + "])(\\d{3,9})$");
// This means that in the case of a literal Z, XXX is preferred
private static final Pattern TRAILING_OFFSET_WITHOUT_COLON_FINDER = Pattern.compile("[+-]\\d{4}$");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanatio
.setNeedClientTimezone(needClientTimeZone)
.setIngestPipeline(FileStructureUtils.makeIngestPipelineDefinition(null, Collections.emptyMap(), null,
Collections.emptyMap(), topLevelTag + "." + timeField.v1(), timeField.v2().getJavaTimestampFormats(),
needClientTimeZone));
needClientTimeZone, timeField.v2().needNanosecondPrecision()));
}

Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
Expand All @@ -114,14 +114,14 @@ static XmlFileStructureFinder makeXmlFileStructureFinder(List<String> explanatio
structureBuilder.setFieldStats(mappingsAndFieldStats.v2());
}

SortedMap<String, Object> innerMappings = mappingsAndFieldStats.v1();
Map<String, Object> innerMappings = mappingsAndFieldStats.v1();
Map<String, Object> secondLevelProperties = new LinkedHashMap<>();
secondLevelProperties.put(FileStructureUtils.MAPPING_TYPE_SETTING, "object");
secondLevelProperties.put(FileStructureUtils.MAPPING_PROPERTIES_SETTING, innerMappings);
SortedMap<String, Object> outerMappings = new TreeMap<>();
outerMappings.put(topLevelTag, secondLevelProperties);
if (timeField != null) {
outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, FileStructureUtils.DATE_MAPPING_WITHOUT_FORMAT);
outerMappings.put(FileStructureUtils.DEFAULT_TIMESTAMP_FIELD, timeField.v2().getEsDateMappingTypeWithoutFormat());
}

FileStructure structure = structureBuilder
Expand Down
Loading

0 comments on commit b263667

Please sign in to comment.