From 5e276cb4531a9588faa6ff82b7026c14223fc310 Mon Sep 17 00:00:00 2001
From: Rishabh Maurya
Date: Fri, 24 May 2024 12:27:41 -0700
Subject: [PATCH] Address PR comments

Signed-off-by: Rishabh Maurya
---
 .../index/mapper/FieldTypeInference.java      | 72 +++++++++---------
 .../index/mapper/FieldTypeInferenceTests.java | 73 ++++++++++++++++++-
 2 files changed, 110 insertions(+), 35 deletions(-)

diff --git a/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java b/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java
index 933b53fc8e34e..713bdc4e691cd 100644
--- a/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java
+++ b/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java
@@ -9,7 +9,7 @@
 package org.opensearch.index.mapper;
 
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.ReaderUtil;
 import org.opensearch.common.Randomness;
 import org.opensearch.common.xcontent.XContentFactory;
 import org.opensearch.common.xcontent.json.JsonXContent;
@@ -18,12 +18,13 @@
 import org.opensearch.search.lookup.SourceLookup;
 
 import java.io.IOException;
-import java.util.Arrays;
-import java.util.HashSet;
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Random;
 import java.util.Set;
+import java.util.TreeSet;
 
 /**
  * This class performs type inference by analyzing the _source documents. It uses a random sample of documents to infer the field type, similar to dynamic mapping type guessing logic.
@@ -57,7 +58,10 @@ public FieldTypeInference(String indexName, MapperService mapperService, IndexRe
     }
 
     public void setSampleSize(int sampleSize) {
-        this.sampleSize = Math.min(sampleSize, MAX_SAMPLE_SIZE_ALLOWED);
+        if (sampleSize > MAX_SAMPLE_SIZE_ALLOWED) {
+            throw new IllegalArgumentException("sample_size should not exceed " + MAX_SAMPLE_SIZE_ALLOWED);
+        }
+        this.sampleSize = sampleSize;
     }
 
     public int getSampleSize() {
@@ -95,11 +99,8 @@ private static class RandomSourceValuesGenerator implements Iterator<List<Objec
         public List<Object> next() {
-            int docID = docs[iter] - offset;
-            if (docID >= leafReaderContext.reader().numDocs()) {
+            int docID = docs[iter] - indexReader.leaves().get(leaf).docBase;
+            if (docID >= indexReader.leaves().get(leaf).reader().numDocs()) {
                 setNextLeaf();
-                return next();
             }
             // deleted docs are getting used to infer type, which should be okay?
-            sourceLookup.setSegmentAndDocument(leafReaderContext, docID);
+            sourceLookup.setSegmentAndDocument(indexReader.leaves().get(leaf), docs[iter] - indexReader.leaves().get(leaf).docBase);
             try {
                 iter++;
                 return valueFetcher.fetchValues(sourceLookup);
@@ -147,29 +146,36 @@ public List<Object> next() {
         }
 
         private void setNextLeaf() {
-            offset += leafReaderContext.reader().numDocs();
-            leaf++;
-            if (leaf < numLeaves) {
-                leafReaderContext = indexReader.leaves().get(leaf);
-                valueFetcher.setNextReader(leafReaderContext);
+            int readerIndex = ReaderUtil.subIndex(docs[iter], indexReader.leaves());
+            if (readerIndex != leaf) {
+                leaf = readerIndex;
+            } else {
+                // this will only happen when all leaves are exhausted, in which case readerIndex will be indexReader.leaves().size() - 1.
+                leaf++;
+            }
+            if (leaf < indexReader.leaves().size()) {
+                valueFetcher.setNextReader(indexReader.leaves().get(leaf));
             }
         }
 
         private static int[] getSortedRandomNum(int sampleSize, int upperBound, int attempts) {
-            Set<Integer> generatedNumbers = new HashSet<>();
+            Set<Integer> generatedNumbers = new TreeSet<>();
             Random random = Randomness.get();
             int itr = 0;
-            while (generatedNumbers.size() < sampleSize && itr++ < attempts) {
-                int randomNumber = random.nextInt(upperBound);
-                generatedNumbers.add(randomNumber);
-            }
-            int[] result = new int[generatedNumbers.size()];
-            int i = 0;
-            for (int number : generatedNumbers) {
-                result[i++] = number;
+            if (upperBound <= 10 * sampleSize) {
+                List<Integer> numberList = new ArrayList<>();
+                for (int i = 0; i < upperBound; i++) {
+                    numberList.add(i);
+                }
+                Collections.shuffle(numberList, random);
+                generatedNumbers.addAll(numberList.subList(0, sampleSize));
+            } else {
+                while (generatedNumbers.size() < sampleSize && itr++ < attempts) {
+                    int randomNumber = random.nextInt(upperBound);
+                    generatedNumbers.add(randomNumber);
+                }
             }
-            Arrays.sort(result);
-            return result;
+            return generatedNumbers.stream().mapToInt(Integer::valueOf).toArray();
         }
     }
 }
diff --git a/server/src/test/java/org/opensearch/index/mapper/FieldTypeInferenceTests.java b/server/src/test/java/org/opensearch/index/mapper/FieldTypeInferenceTests.java
index 3762663c1fb01..807d0e96ce5e3 100644
--- a/server/src/test/java/org/opensearch/index/mapper/FieldTypeInferenceTests.java
+++ b/server/src/test/java/org/opensearch/index/mapper/FieldTypeInferenceTests.java
@@ -53,7 +53,6 @@ public void testJsonSupportedTypes() throws IOException {
         when(queryShardContext.index()).thenReturn(new Index("test_index", "uuid"));
         int totalDocs = 10000;
         int docsPerLeafCount = 1000;
-        int leaves = 0;
         try (Directory dir = newDirectory()) {
             IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER));
             Document d = new Document();
@@ -61,7 +60,6 @@ public void testJsonSupportedTypes() throws IOException {
             iw.addDocument(d);
             if ((i + 1) % docsPerLeafCount == 0) {
                 iw.commit();
-                leaves++;
             }
         }
         try (IndexReader reader = DirectoryReader.open(iw)) {
@@ -138,4 +136,75 @@ public void setNextReader(LeafReaderContext leafReaderContext) {
             }
         }
     }
+
+    public void testDeleteAllDocs() throws IOException {
+        MapperService mapperService = createMapperService(topMapping(b -> {}));
+        QueryShardContext queryShardContext = createQueryShardContext(mapperService);
+        when(queryShardContext.index()).thenReturn(new Index("test_index", "uuid"));
+        int totalDocs = 10000;
+        int docsPerLeafCount = 1000;
+        try (Directory dir = newDirectory()) {
+            IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER));
+            Document d = new Document();
+            for (int i = 0; i < totalDocs; i++) {
+                iw.addDocument(d);
+                if ((i + 1) % docsPerLeafCount == 0) {
+                    iw.commit();
+                }
+            }
+            iw.deleteAll();
+            iw.commit();
+
+            try (IndexReader reader = DirectoryReader.open(iw)) {
+                iw.close();
+                FieldTypeInference typeInference = new FieldTypeInference("test_index", queryShardContext.getMapperService(), reader);
+                String[] fieldName = { "text_field" };
+                Mapper mapper = typeInference.infer(lookup -> documentMap.get(fieldName[0]));
+                assertNull(mapper);
+            }
+        }
+    }
+
+    public void testZeroDoc() throws IOException {
+        MapperService mapperService = createMapperService(topMapping(b -> {}));
+        QueryShardContext queryShardContext = createQueryShardContext(mapperService);
+        when(queryShardContext.index()).thenReturn(new Index("test_index", "uuid"));
+        try (Directory dir = newDirectory()) {
+            IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER));
+            try (IndexReader reader = DirectoryReader.open(iw)) {
+                iw.close();
+                FieldTypeInference typeInference = new FieldTypeInference("test_index", queryShardContext.getMapperService(), reader);
+                String[] fieldName = { "text_field" };
+                Mapper mapper = typeInference.infer(lookup -> documentMap.get(fieldName[0]));
+                assertNull(mapper);
+            }
+        }
+    }
+
+    public void testSampleGeneration() throws IOException {
+        MapperService mapperService = createMapperService(topMapping(b -> {}));
+        QueryShardContext queryShardContext = createQueryShardContext(mapperService);
+        when(queryShardContext.index()).thenReturn(new Index("test_index", "uuid"));
+        int totalDocs = 10000;
+        int docsPerLeafCount = 1000;
+        try (Directory dir = newDirectory()) {
+            IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER));
+            Document d = new Document();
+            for (int i = 0; i < totalDocs; i++) {
+                iw.addDocument(d);
+                if ((i + 1) % docsPerLeafCount == 0) {
+                    iw.commit();
+                }
+            }
+            try (IndexReader reader = DirectoryReader.open(iw)) {
+                iw.close();
+                FieldTypeInference typeInference = new FieldTypeInference("test_index", queryShardContext.getMapperService(), reader);
+                typeInference.setSampleSize(1000 - 1);
+                typeInference.infer(lookup -> documentMap.get("unknown_field"));
+                assertThrows(IllegalArgumentException.class, () -> typeInference.setSampleSize(1000 + 1));
+                typeInference.setSampleSize(1000);
+                typeInference.infer(lookup -> documentMap.get("unknown_field"));
+            }
+        }
+    }
 }
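
Note on the getSortedRandomNum() change above (explanatory, not part of the patch): the method now picks between two strategies. When upperBound is within 10x of the requested sample size, it shuffles the full 0..upperBound-1 range and takes a prefix, so it can never come up short on unique IDs; otherwise it keeps drawing random IDs until enough unique ones are collected or the attempt budget runs out. A TreeSet keeps the result sorted in both branches, which is what replaces the old explicit Arrays.sort(). A minimal standalone sketch of that strategy follows; the class and method names are illustrative, and the Math.min guard is a defensive addition in the sketch, not something the patch itself does.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;

public final class SortedSampleSketch {
    // Returns up to sampleSize distinct ints in [0, upperBound), sorted ascending.
    static int[] sortedRandomSample(int sampleSize, int upperBound, int attempts, Random random) {
        Set<Integer> generated = new TreeSet<>();   // TreeSet keeps the draws sorted for free
        if (upperBound <= 10 * sampleSize) {
            // Dense case: shuffle the whole range and take a prefix; never under-samples.
            List<Integer> all = new ArrayList<>();
            for (int i = 0; i < upperBound; i++) {
                all.add(i);
            }
            Collections.shuffle(all, random);
            generated.addAll(all.subList(0, Math.min(sampleSize, upperBound)));
        } else {
            // Sparse case: rejection-sample distinct values, bounded by 'attempts'.
            int itr = 0;
            while (generated.size() < sampleSize && itr++ < attempts) {
                generated.add(random.nextInt(upperBound));
            }
        }
        return generated.stream().mapToInt(Integer::intValue).toArray();
    }

    public static void main(String[] args) {
        // Example: 5 sorted distinct doc IDs out of 20.
        System.out.println(java.util.Arrays.toString(sortedRandomSample(5, 20, 50, new Random(42))));
    }
}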
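
Note on the setNextLeaf() rework (explanatory, not part of the patch): ReaderUtil.subIndex() maps a top-level doc ID to the index of the leaf (segment) that contains it, and subtracting that leaf's docBase gives the segment-local doc ID that SourceLookup and the leaf reader's numDocs() operate on. A hedged sketch of that mapping, assuming an already-open IndexReader; the class and method names here are illustrative.

import java.util.List;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;

public final class LeafResolutionSketch {
    // Resolves a top-level doc ID to { leaf index, segment-local doc ID }.
    static int[] resolve(IndexReader reader, int globalDocId) {
        List<LeafReaderContext> leaves = reader.leaves();
        int leafIndex = ReaderUtil.subIndex(globalDocId, leaves); // binary search over the leaves' docBase values
        LeafReaderContext leaf = leaves.get(leafIndex);
        int localDocId = globalDocId - leaf.docBase;              // doc ID within that segment
        return new int[] { leafIndex, localDocId };
    }
}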
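
Note on how the new tests drive the API (explanatory, not part of the patch): a rough usage sketch; reader, mapperService and documentMap (a map from field name to sample values, as in the tests) are assumed to exist in the surrounding test code, and the field name and sample size are illustrative.

// Build the inference helper over an open IndexReader for the target index.
FieldTypeInference typeInference = new FieldTypeInference("test_index", mapperService, reader);

// Anything above MAX_SAMPLE_SIZE_ALLOWED (1000 in the tests) now throws IllegalArgumentException
// instead of being silently clamped as before.
typeInference.setSampleSize(150);

// The value fetcher lambda supplies the candidate values for each sampled document.
Mapper inferred = typeInference.infer(lookup -> documentMap.get("text_field"));
if (inferred == null) {
    // No non-null sample values were found, e.g. an empty index or all docs deleted.
}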