From d729fcf14974268d0f22d626d2862ec731bdca1c Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Fri, 1 Sep 2023 04:44:07 +1000 Subject: [PATCH] Expose DelimitedTermFrequencyTokenFilter (#9479) * Expose DelimitedTermFrequencyTokenFilter Relates: #9413 This commit exposes Lucene's delimited term frequency token filter to be able to provide term frequencies along with terms. Signed-off-by: Russ Cam * fix format violations Signed-off-by: Russ Cam * fix test and add to changelog Signed-off-by: Russ Cam * Address PR feedback - Add unit tests for DelimitedTermFrequencyTokenFilterFactory - Remove IllegalArgumentException as caught exception - Add skip to yaml rest tests to skip for version < 2.10 Signed-off-by: Russ Cam * formatting Signed-off-by: Russ Cam * Rename filter Signed-off-by: Russ Cam * update naming in REST tests Signed-off-by: Russ Cam --------- Signed-off-by: Russ Cam Signed-off-by: Kaushal Kumar --- CHANGELOG.md | 1 + .../common/CommonAnalysisModulePlugin.java | 9 ++ ...imitedTermFrequencyTokenFilterFactory.java | 45 ++++++++++ .../common/CommonAnalysisFactoryTests.java | 2 + ...dTermFrequencyTokenFilterFactoryTests.java | 89 +++++++++++++++++++ .../test/analysis-common/40_token_filters.yml | 40 +++++++++ .../analysis/AnalysisFactoryTestCase.java | 4 +- 7 files changed, 187 insertions(+), 3 deletions(-) create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java create mode 100644 modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java diff --git a/CHANGELOG.md b/CHANGELOG.md index a558dbdaf839f..93075b571f0a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -97,6 +97,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [BWC and API enforcement] Decorate the existing APIs with proper annotations (part 1) ([#9520](https://github.com/opensearch-project/OpenSearch/pull/9520)) - Add concurrent segment search related metrics to node and index stats ([#9622](https://github.com/opensearch-project/OpenSearch/issues/9622)) - Decouple replication lag from logic to fail stale replicas ([#9507](https://github.com/opensearch-project/OpenSearch/pull/9507)) +- Expose DelimitedTermFrequencyTokenFilter to allow providing term frequencies along with terms ([#9479](https://github.com/opensearch-project/OpenSearch/pull/9479)) ### Dependencies - Bump `org.apache.logging.log4j:log4j-core` from 2.17.1 to 2.20.0 ([#8307](https://github.com/opensearch-project/OpenSearch/pull/8307)) diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index 46220f5369d16..b0d9c1765190a 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -89,6 +89,7 @@ import org.apache.lucene.analysis.lt.LithuanianAnalyzer; import org.apache.lucene.analysis.lv.LatvianAnalyzer; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter; import org.apache.lucene.analysis.miscellaneous.LengthFilter; @@ -265,6 +266,7 @@ public Map> getTokenFilters() { ); filters.put("decimal_digit", DecimalDigitFilterFactory::new); filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new); + filters.put("delimited_term_freq", DelimitedTermFrequencyTokenFilterFactory::new); filters.put("dictionary_decompounder", requiresAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new)); filters.put("dutch_stem", DutchStemTokenFilterFactory::new); filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new); @@ -500,6 +502,13 @@ public List getPreConfiguredTokenFilters() { ) ) ); + filters.add( + PreConfiguredTokenFilter.singleton( + "delimited_term_freq", + false, + input -> new DelimitedTermFrequencyTokenFilter(input, DelimitedTermFrequencyTokenFilterFactory.DEFAULT_DELIMITER) + ) + ); filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer()))); filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input -> new EdgeNGramTokenFilter(input, 1))); filters.add(PreConfiguredTokenFilter.openSearchVersion("edgeNGram", false, false, (reader, version) -> { diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java new file mode 100644 index 0000000000000..8929a7c54ef4c --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java @@ -0,0 +1,45 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractTokenFilterFactory; + +public class DelimitedTermFrequencyTokenFilterFactory extends AbstractTokenFilterFactory { + public static final char DEFAULT_DELIMITER = '|'; + private static final String DELIMITER = "delimiter"; + private final char delimiter; + + DelimitedTermFrequencyTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + delimiter = parseDelimiter(settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new DelimitedTermFrequencyTokenFilter(tokenStream, delimiter); + } + + private static char parseDelimiter(Settings settings) { + String delimiter = settings.get(DELIMITER); + if (delimiter == null) { + return DEFAULT_DELIMITER; + } else if (delimiter.length() == 1) { + return delimiter.charAt(0); + } + + throw new IllegalArgumentException( + "Setting [" + DELIMITER + "] must be a single, non-null character. [" + delimiter + "] was provided." + ); + } +} diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java index 1c4db089565ff..11713f52f5b18 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java @@ -145,6 +145,7 @@ protected Map> getTokenFilters() { filters.put("cjkwidth", CJKWidthFilterFactory.class); filters.put("cjkbigram", CJKBigramFilterFactory.class); filters.put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class); + filters.put("delimitedtermfrequency", DelimitedTermFrequencyTokenFilterFactory.class); filters.put("keepword", KeepWordFilterFactory.class); filters.put("type", KeepTypesFilterFactory.class); filters.put("classic", ClassicFilterFactory.class); @@ -202,6 +203,7 @@ protected Map> getPreConfiguredTokenFilters() { filters.put("decimal_digit", null); filters.put("delimited_payload_filter", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class); filters.put("delimited_payload", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class); + filters.put("delimited_term_freq", org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilterFactory.class); filters.put("dutch_stem", SnowballPorterFilterFactory.class); filters.put("edge_ngram", null); filters.put("edgeNGram", null); diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java new file mode 100644 index 0000000000000..fab83a75387de --- /dev/null +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactoryTests.java @@ -0,0 +1,89 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.analysis.AnalysisTestsHelper; +import org.opensearch.index.analysis.TokenFilterFactory; +import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.test.OpenSearchTokenStreamTestCase; + +import java.io.StringReader; + +public class DelimitedTermFrequencyTokenFilterFactoryTests extends OpenSearchTokenStreamTestCase { + + public void testDefault() throws Exception { + OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_delimited_term_freq.type", "delimited_term_freq") + .build(), + new CommonAnalysisModulePlugin() + ); + doTest(analysis, "cat|4 dog|5"); + } + + public void testDelimiter() throws Exception { + OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_delimited_term_freq.type", "delimited_term_freq") + .put("index.analysis.filter.my_delimited_term_freq.delimiter", ":") + .build(), + new CommonAnalysisModulePlugin() + ); + doTest(analysis, "cat:4 dog:5"); + } + + public void testDelimiterLongerThanOneCharThrows() { + IllegalArgumentException ex = expectThrows( + IllegalArgumentException.class, + () -> AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_delimited_term_freq.type", "delimited_term_freq") + .put("index.analysis.filter.my_delimited_term_freq.delimiter", "^^") + .build(), + new CommonAnalysisModulePlugin() + ) + ); + + assertEquals("Setting [delimiter] must be a single, non-null character. [^^] was provided.", ex.getMessage()); + } + + private void doTest(OpenSearchTestCase.TestAnalysis analysis, String source) throws Exception { + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_delimited_term_freq"); + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader(source)); + + TokenStream stream = tokenFilter.create(tokenizer); + + CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class); + TermFrequencyAttribute tfAtt = stream.getAttribute(TermFrequencyAttribute.class); + stream.reset(); + assertTermEquals("cat", stream, termAtt, tfAtt, 4); + assertTermEquals("dog", stream, termAtt, tfAtt, 5); + assertFalse(stream.incrementToken()); + stream.end(); + stream.close(); + } + + void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, TermFrequencyAttribute tfAtt, int expectedTf) + throws Exception { + assertTrue(stream.incrementToken()); + assertEquals(expected, termAtt.toString()); + assertEquals(expectedTf, tfAtt.getTermFrequency()); + } +} diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index 40c82ff185661..e92cc0c4838c7 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -1198,6 +1198,46 @@ - match: { tokens.0.token: foo } --- +"delimited_term_freq": + - skip: + version: " - 2.9.99" + reason: "delimited_term_freq token filter was added in v2.10.0" + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_delimited_term_freq: + type: delimited_term_freq + delimiter: ^ + - do: + indices.analyze: + index: test + body: + text: foo^3 + tokenizer: keyword + filter: [my_delimited_term_freq] + attributes: termFrequency + explain: true + - length: { detail.tokenfilters: 1 } + - match: { detail.tokenfilters.0.tokens.0.token: foo } + - match: { detail.tokenfilters.0.tokens.0.termFrequency: 3 } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: foo|100 + tokenizer: keyword + filter: [delimited_term_freq] + attributes: termFrequency + explain: true + - length: { detail.tokenfilters: 1 } + - match: { detail.tokenfilters.0.tokens.0.token: foo } + - match: { detail.tokenfilters.0.tokens.0.termFrequency: 100 } +--- "keep_filter": - do: indices.create: diff --git a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java index b93cb64e32cfe..c412ae8317f24 100644 --- a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java @@ -98,6 +98,7 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase { .put("czechstem", MovedToAnalysisCommon.class) .put("decimaldigit", MovedToAnalysisCommon.class) .put("delimitedpayload", MovedToAnalysisCommon.class) + .put("delimitedtermfrequency", MovedToAnalysisCommon.class) .put("dictionarycompoundword", MovedToAnalysisCommon.class) .put("edgengram", MovedToAnalysisCommon.class) .put("elision", MovedToAnalysisCommon.class) @@ -201,9 +202,6 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase { .put("daterecognizer", Void.class) // for token filters that generate bad offsets, which are now rejected since Lucene 7 .put("fixbrokenoffsets", Void.class) - // should we expose it, or maybe think about higher level integration of the - // fake term frequency feature (LUCENE-7854) - .put("delimitedtermfrequency", Void.class) // LUCENE-8273: ProtectedTermFilterFactory allows analysis chains to skip // particular token filters based on the attributes of the current token. .put("protectedterm", Void.class)