diff --git a/src/Nest/Analysis/Analyzers/NoriAnalyzer.cs b/src/Nest/Analysis/Analyzers/NoriAnalyzer.cs
index 78a7043dbb2..723abe6fe2e 100644
--- a/src/Nest/Analysis/Analyzers/NoriAnalyzer.cs
+++ b/src/Nest/Analysis/Analyzers/NoriAnalyzer.cs
@@ -8,6 +8,7 @@ namespace Nest
/// - nori_tokenizer
/// - nori_part_of_speech token filter
/// - nori_readingform token filter
+ /// - nori_number token filter
/// - lowercase token filter
/// </summary>
public interface INoriAnalyzer : IAnalyzer
diff --git a/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs b/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs
index 287d7ed2a68..fdf45c26d67 100644
--- a/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs
+++ b/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs
@@ -30,6 +30,12 @@ public interface INoriTokenizer : ITokenizer
[DataMember(Name = "decompound_mode")]
NoriDecompoundMode? DecompoundMode { get; set; }
+ /// <summary>
+ /// Whether punctuation should be discarded from the output. Defaults to `true`.
+ /// </summary>
+ [DataMember(Name = "discard_punctuation")]
+ bool? DiscardPunctuation { get; set; }
+
/// <summary>
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG) may be
/// appended to
@@ -57,6 +63,9 @@ public class NoriTokenizer : TokenizerBase, INoriTokenizer
/// <inheritdoc />
public NoriDecompoundMode? DecompoundMode { get; set; }
+ /// <inheritdoc />
+ public bool? DiscardPunctuation { get; set; }
+
/// <inheritdoc />
public string UserDictionary { get; set; }
@@ -73,6 +82,7 @@ public class NoriTokenizerDescriptor
NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }
string INoriTokenizer.UserDictionary { get; set; }
IEnumerable<string> INoriTokenizer.UserDictionaryRules { get; set; }
+ bool? INoriTokenizer.DiscardPunctuation { get; set; }
/// <inheritdoc />
public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode) => Assign(mode, (a, v) => a.DecompoundMode = v);
@@ -85,5 +95,8 @@ public class NoriTokenizerDescriptor
/// <inheritdoc />
public NoriTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules) => Assign(rules, (a, v) => a.UserDictionaryRules = v);
+
+ /// <inheritdoc />
+ public NoriTokenizerDescriptor DiscardPunctuation(bool? discard = true) => Assign(discard, (a, v) => a.DiscardPunctuation = v);
}
}
diff --git a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
index 7e9711a8c75..63fb2f4cc03 100644
--- a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
+++ b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
@@ -332,5 +332,21 @@ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
public override string Name => "char_group";
}
+
+ [SkipVersion("<7.7.0", "discard_punctuation introduced in 7.7.0")]
+ public class DiscardPunctuationTests : TokenizerAssertionBase<DiscardPunctuationTests>
+ {
+ public override FuncTokenizer Fluent => (n, t) => t.Nori(n, e => e
+ .DiscardPunctuation()
+ );
+
+ public override ITokenizer Initializer => new NoriTokenizer
+ {
+ DiscardPunctuation = true
+ };
+
+ public override object Json => new { type = "nori_tokenizer", discard_punctuation = true };
+ public override string Name => "nori";
+ }
}
}