From 2310c2f295236e1092add55191f1364c6bc3e3c8 Mon Sep 17 00:00:00 2001 From: jimczi Date: Thu, 29 Oct 2020 13:41:03 +0100 Subject: [PATCH 1/6] Do not skip not available shard exception in search response Today search responses do not report failures for shards that were not available for the search. So if one shard is not assigned on a search over 5 shards, the search response will report: ``` "_shards": { "total": 5, "successful": 4, "skipped": 0, "failed": 0 } ``` If all shards are unassigned, we report a generic search phase exception with no cause. It's easy to spot that `successful` is less than `total` in the response, but not reporting the failure is misleading for users. This change removes the special handling of shard-not-available exceptions in search responses and treats them as any other failure that could occur on a shard. These exceptions will count in the `failed` section and will be reported in detail in the `shard_failures` section. If all shards are unavailable, the search API will now return 404 NOT_FOUND as an indication that the search failed because it couldn't find any of the resources. 
Closes #47700 --- .../search/basic/SearchRedStateIndexIT.java | 6 ++++-- .../action/search/AbstractSearchAsyncAction.java | 16 ++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchRedStateIndexIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchRedStateIndexIT.java index cc2f94e3af9a3..ecfcff15f7056 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchRedStateIndexIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchRedStateIndexIT.java @@ -36,10 +36,12 @@ import java.util.List; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.hamcrest.Matchers.allOf; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.lessThan; +import static org.hamcrest.Matchers.lessThanOrEqualTo; @ESIntegTestCase.ClusterScope(minNumDataNodes = 2) public class SearchRedStateIndexIT extends ESIntegTestCase { @@ -52,7 +54,7 @@ public void testAllowPartialsWithRedState() throws Exception { SearchResponse searchResponse = client().prepareSearch().setSize(0).setAllowPartialSearchResults(true) .get(); assertThat(RestStatus.OK, equalTo(searchResponse.status())); - assertThat("Expect no shards failed", searchResponse.getFailedShards(), equalTo(0)); + assertThat("Expect some shards failed", searchResponse.getFailedShards(), allOf(greaterThan(0), lessThanOrEqualTo(numShards))); assertThat("Expect no shards skipped", searchResponse.getSkippedShards(), equalTo(0)); assertThat("Expect subset of shards successful", searchResponse.getSuccessfulShards(), lessThan(numShards)); assertThat("Expected total shards", searchResponse.getTotalShards(), equalTo(numShards)); @@ -66,7 +68,7 @@ public void testClusterAllowPartialsWithRedState() 
throws Exception { SearchResponse searchResponse = client().prepareSearch().setSize(0).get(); assertThat(RestStatus.OK, equalTo(searchResponse.status())); - assertThat("Expect no shards failed", searchResponse.getFailedShards(), equalTo(0)); + assertThat("Expect some shards failed", searchResponse.getFailedShards(), allOf(greaterThan(0), lessThanOrEqualTo(numShards))); assertThat("Expect no shards skipped", searchResponse.getSkippedShards(), equalTo(0)); assertThat("Expect subset of shards successful", searchResponse.getSuccessfulShards(), lessThan(numShards)); assertThat("Expected total shards", searchResponse.getTotalShards(), equalTo(numShards)); diff --git a/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java b/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java index 0653d79f31e84..493419e0005a5 100644 --- a/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java +++ b/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java @@ -32,7 +32,6 @@ import org.elasticsearch.action.support.TransportActions; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.routing.GroupShardsIterator; -import org.elasticsearch.common.Nullable; import org.elasticsearch.common.lease.Releasable; import org.elasticsearch.common.lease.Releasables; import org.elasticsearch.common.util.concurrent.AbstractRunnable; @@ -235,7 +234,9 @@ private void performPhaseOnShard(final int shardIndex, final SearchShardIterator * we can continue (cf. InitialSearchPhase#maybeFork). 
*/ if (shard == null) { - fork(() -> onShardFailure(shardIndex, null, shardIt, new NoShardAvailableActionException(shardIt.shardId()))); + SearchShardTarget unassignedShard = new SearchShardTarget(null, shardIt.shardId(), + shardIt.getClusterAlias(), shardIt.getOriginalIndices()); + fork(() -> onShardFailure(shardIndex, unassignedShard, shardIt, new NoShardAvailableActionException(shardIt.shardId()))); } else { final PendingExecutions pendingExecutions = throttleConcurrentRequests ? pendingExecutionsPerNode.computeIfAbsent(shard.getNodeId(), n -> new PendingExecutions(maxConcurrentRequestsPerNode)) @@ -386,14 +387,13 @@ ShardSearchFailure[] buildShardFailures() { return failures; } - private void onShardFailure(final int shardIndex, @Nullable SearchShardTarget shard, final SearchShardIterator shardIt, Exception e) { + private void onShardFailure(final int shardIndex, SearchShardTarget shard, final SearchShardIterator shardIt, Exception e) { // we always add the shard failure for a specific shard instance // we do make sure to clean it on a successful response from a shard onShardFailure(shardIndex, shard, e); final SearchShardTarget nextShard = shardIt.nextOrNull(); final boolean lastShard = nextShard == null; - logger.debug(() -> new ParameterizedMessage("{}: Failed to execute [{}] lastShard [{}]", - shard != null ? 
shard : shardIt.shardId(), request, lastShard), e); + logger.debug(() -> new ParameterizedMessage("{}: Failed to execute [{}] lastShard [{}]", shard, request, lastShard), e); if (lastShard) { if (request.allowPartialSearchResults() == false) { if (requestCancelled.compareAndSet(false, true)) { @@ -437,10 +437,10 @@ protected void onShardGroupFailure(int shardIndex, SearchShardTarget shardTarget * @param e the failure reason */ @Override - public final void onShardFailure(final int shardIndex, @Nullable SearchShardTarget shardTarget, Exception e) { - // we don't aggregate shard failures on non active shards and failures due to the internal cancellation, + public final void onShardFailure(final int shardIndex, SearchShardTarget shardTarget, Exception e) { + // we don't aggregate shard on failures due to the internal cancellation, // but do keep the header counts right - if (TransportActions.isShardNotAvailableException(e) == false && (requestCancelled.get() && isTaskCancelledException(e)) == false) { + if ((requestCancelled.get() && isTaskCancelledException(e)) == false) { AtomicArray shardFailures = this.shardFailures.get(); // lazily create shard failures, so we can early build the empty shard failure list in most cases (no failures) if (shardFailures == null) { // this is double checked locking but it's fine since SetOnce uses a volatile read internally From ae283b3bf95764b5ecd4f72206ea74dec7f4bf8d Mon Sep 17 00:00:00 2001 From: jimczi Date: Thu, 29 Oct 2020 14:11:39 +0100 Subject: [PATCH 2/6] fix error status code in test --- .../org/elasticsearch/xpack/search/AsyncSearchActionIT.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/x-pack/plugin/async-search/src/internalClusterTest/java/org/elasticsearch/xpack/search/AsyncSearchActionIT.java b/x-pack/plugin/async-search/src/internalClusterTest/java/org/elasticsearch/xpack/search/AsyncSearchActionIT.java index b9e2232e0f61b..8a571ceaa24f7 100644 --- 
a/x-pack/plugin/async-search/src/internalClusterTest/java/org/elasticsearch/xpack/search/AsyncSearchActionIT.java +++ b/x-pack/plugin/async-search/src/internalClusterTest/java/org/elasticsearch/xpack/search/AsyncSearchActionIT.java @@ -406,18 +406,17 @@ public void testRemoveAsyncIndex() throws Exception { ensureTaskRemoval(newResp.getId()); } - public void testSearchPhaseFailureNoCause() throws Exception { + public void testSearchPhaseFailure() throws Exception { SubmitAsyncSearchRequest request = new SubmitAsyncSearchRequest(indexName); request.setKeepOnCompletion(true); request.setWaitForCompletionTimeout(TimeValue.timeValueMinutes(10)); request.getSearchRequest().allowPartialSearchResults(false); request.getSearchRequest() - // AlreadyClosedException are ignored by the coordinating node .source(new SearchSourceBuilder().query(new ThrowingQueryBuilder(randomLong(), new AlreadyClosedException("boom"), 0))); AsyncSearchResponse response = submitAsyncSearch(request); assertFalse(response.isRunning()); assertTrue(response.isPartial()); - assertThat(response.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE)); + assertThat(response.status(), equalTo(RestStatus.INTERNAL_SERVER_ERROR)); assertNotNull(response.getFailure()); ensureTaskNotRunning(response.getId()); } From 5ae20cc1b2dbc16b0632a2f003f4501445ecf93a Mon Sep 17 00:00:00 2001 From: jimczi Date: Thu, 29 Oct 2020 21:24:46 +0100 Subject: [PATCH 3/6] fix transform test case --- .../xpack/transform/integration/TransformRestTestCase.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/transform/qa/single-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/transform/integration/TransformRestTestCase.java b/x-pack/plugin/transform/qa/single-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/transform/integration/TransformRestTestCase.java index 97f0b0a556f41..44159181d9cde 100644 --- 
a/x-pack/plugin/transform/qa/single-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/transform/integration/TransformRestTestCase.java +++ b/x-pack/plugin/transform/qa/single-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/transform/integration/TransformRestTestCase.java @@ -611,7 +611,7 @@ private void logAudits() throws Exception { } } catch (ResponseException e) { // see gh#54810, wrap temporary 503's as assertion error for retry - if (e.getResponse().getStatusLine().getStatusCode() != 503) { + if (e.getResponse().getStatusLine().getStatusCode() != 404) { throw e; } throw new AssertionError("Failed to retrieve audit logs", e); From 1427b5c1dde9e716c161f296b46a2e50aea3342a Mon Sep 17 00:00:00 2001 From: jimczi Date: Fri, 30 Oct 2020 10:44:35 +0100 Subject: [PATCH 4/6] apply review comment --- .../search/basic/SearchRedStateIndexIT.java | 9 +++++++++ .../action/search/AbstractSearchAsyncAction.java | 5 +++++ .../org/elasticsearch/action/search/SearchResponse.java | 2 -- .../elasticsearch/xpack/search/AsyncSearchActionIT.java | 2 +- .../transform/integration/TransformRestTestCase.java | 2 +- 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchRedStateIndexIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchRedStateIndexIT.java index ecfcff15f7056..e0d8f7316d1d2 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchRedStateIndexIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/basic/SearchRedStateIndexIT.java @@ -20,9 +20,11 @@ package org.elasticsearch.search.basic; +import org.elasticsearch.action.NoShardAvailableActionException; import org.elasticsearch.action.admin.cluster.settings.ClusterUpdateSettingsResponse; import org.elasticsearch.action.search.SearchPhaseExecutionException; import org.elasticsearch.action.search.SearchResponse; +import 
org.elasticsearch.action.search.ShardSearchFailure; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.health.ClusterHealthStatus; import org.elasticsearch.cluster.routing.ShardRouting; @@ -40,6 +42,7 @@ import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.lessThan; import static org.hamcrest.Matchers.lessThanOrEqualTo; @@ -58,6 +61,9 @@ public void testAllowPartialsWithRedState() throws Exception { assertThat("Expect no shards skipped", searchResponse.getSkippedShards(), equalTo(0)); assertThat("Expect subset of shards successful", searchResponse.getSuccessfulShards(), lessThan(numShards)); assertThat("Expected total shards", searchResponse.getTotalShards(), equalTo(numShards)); + for (ShardSearchFailure failure : searchResponse.getShardFailures()) { + assertThat(failure.getCause(), instanceOf(NoShardAvailableActionException.class)); + } } public void testClusterAllowPartialsWithRedState() throws Exception { @@ -72,6 +78,9 @@ public void testClusterAllowPartialsWithRedState() throws Exception { assertThat("Expect no shards skipped", searchResponse.getSkippedShards(), equalTo(0)); assertThat("Expect subset of shards successful", searchResponse.getSuccessfulShards(), lessThan(numShards)); assertThat("Expected total shards", searchResponse.getTotalShards(), equalTo(numShards)); + for (ShardSearchFailure failure : searchResponse.getShardFailures()) { + assertThat(failure.getCause(), instanceOf(NoShardAvailableActionException.class)); + } } diff --git a/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java b/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java index 493419e0005a5..3ad63821efc90 100644 --- a/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java +++ 
b/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java @@ -438,6 +438,11 @@ protected void onShardGroupFailure(int shardIndex, SearchShardTarget shardTarget */ @Override public final void onShardFailure(final int shardIndex, SearchShardTarget shardTarget, Exception e) { + if (TransportActions.isShardNotAvailableException(e)) { + // Groups shard not available exceptions under a generic exception that returns a SERVICE_UNAVAILABLE(503) + // temporary error. + e = new NoShardAvailableActionException(shardTarget.getShardId(), e.getMessage()); + } // we don't aggregate shard on failures due to the internal cancellation, // but do keep the header counts right if ((requestCancelled.get() && isTaskCancelledException(e)) == false) { diff --git a/server/src/main/java/org/elasticsearch/action/search/SearchResponse.java b/server/src/main/java/org/elasticsearch/action/search/SearchResponse.java index 6cf10742dc303..9a110138a458d 100644 --- a/server/src/main/java/org/elasticsearch/action/search/SearchResponse.java +++ b/server/src/main/java/org/elasticsearch/action/search/SearchResponse.java @@ -204,8 +204,6 @@ public int getSkippedShards() { * The failed number of shards the search was executed on. 
*/ public int getFailedShards() { - // we don't return totalShards - successfulShards, we don't count "no shards available" as a failed shard, just don't - // count it in the successful counter return shardFailures.length; } diff --git a/x-pack/plugin/async-search/src/internalClusterTest/java/org/elasticsearch/xpack/search/AsyncSearchActionIT.java b/x-pack/plugin/async-search/src/internalClusterTest/java/org/elasticsearch/xpack/search/AsyncSearchActionIT.java index 8a571ceaa24f7..9a17858c5386a 100644 --- a/x-pack/plugin/async-search/src/internalClusterTest/java/org/elasticsearch/xpack/search/AsyncSearchActionIT.java +++ b/x-pack/plugin/async-search/src/internalClusterTest/java/org/elasticsearch/xpack/search/AsyncSearchActionIT.java @@ -416,7 +416,7 @@ public void testSearchPhaseFailure() throws Exception { AsyncSearchResponse response = submitAsyncSearch(request); assertFalse(response.isRunning()); assertTrue(response.isPartial()); - assertThat(response.status(), equalTo(RestStatus.INTERNAL_SERVER_ERROR)); + assertThat(response.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE)); assertNotNull(response.getFailure()); ensureTaskNotRunning(response.getId()); } diff --git a/x-pack/plugin/transform/qa/single-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/transform/integration/TransformRestTestCase.java b/x-pack/plugin/transform/qa/single-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/transform/integration/TransformRestTestCase.java index 44159181d9cde..97f0b0a556f41 100644 --- a/x-pack/plugin/transform/qa/single-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/transform/integration/TransformRestTestCase.java +++ b/x-pack/plugin/transform/qa/single-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/transform/integration/TransformRestTestCase.java @@ -611,7 +611,7 @@ private void logAudits() throws Exception { } } catch (ResponseException e) { // see gh#54810, wrap temporary 503's as assertion error for retry - if 
(e.getResponse().getStatusLine().getStatusCode() != 404) { + if (e.getResponse().getStatusLine().getStatusCode() != 503) { throw e; } throw new AssertionError("Failed to retrieve audit logs", e); From 235416573feb3cae1b85b52417a6f1eaa6ce2edf Mon Sep 17 00:00:00 2001 From: jimczi Date: Fri, 20 Nov 2020 12:26:05 +0100 Subject: [PATCH 5/6] add assert in AbstractSearchAsyncAction#buildSearchResponse --- .../action/search/AbstractSearchAsyncAction.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java b/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java index 3ad63821efc90..1adc37ac6c1b5 100644 --- a/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java +++ b/server/src/main/java/org/elasticsearch/action/search/AbstractSearchAsyncAction.java @@ -550,7 +550,11 @@ public final SearchRequest getRequest() { protected final SearchResponse buildSearchResponse(InternalSearchResponse internalSearchResponse, ShardSearchFailure[] failures, String scrollId, String searchContextId) { - return new SearchResponse(internalSearchResponse, scrollId, getNumShards(), successfulOps.get(), + int numSuccess = successfulOps.get(); + int numFailures = failures.length; + assert numSuccess + numFailures == getNumShards() + : "numSuccess(" + numSuccess + ") + numFailures(" + numFailures + ") != totalShards(" + getNumShards() + ")"; + return new SearchResponse(internalSearchResponse, scrollId, getNumShards(), numSuccess, skippedOps.get(), buildTookInMillis(), failures, clusters, searchContextId); } From f093ed5da5d62a1d8321d0f162b949f0b04011a3 Mon Sep 17 00:00:00 2001 From: jimczi Date: Fri, 20 Nov 2020 18:35:02 +0100 Subject: [PATCH 6/6] remove noop tests --- .../AbstractSearchAsyncActionTests.java | 27 ------------------- 1 file changed, 27 deletions(-) diff --git 
a/server/src/test/java/org/elasticsearch/action/search/AbstractSearchAsyncActionTests.java b/server/src/test/java/org/elasticsearch/action/search/AbstractSearchAsyncActionTests.java index 9f1199f774a63..46b1e057c41a2 100644 --- a/server/src/test/java/org/elasticsearch/action/search/AbstractSearchAsyncActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/search/AbstractSearchAsyncActionTests.java @@ -160,33 +160,6 @@ public void testBuildShardSearchTransportRequest() { assertEquals(clusterAlias, shardSearchTransportRequest.getClusterAlias()); } - public void testBuildSearchResponse() { - SearchRequest searchRequest = new SearchRequest().allowPartialSearchResults(randomBoolean()); - ArraySearchPhaseResults phaseResults = new ArraySearchPhaseResults<>(10); - AbstractSearchAsyncAction action = createAction(searchRequest, - phaseResults, null, false, new AtomicLong()); - InternalSearchResponse internalSearchResponse = InternalSearchResponse.empty(); - SearchResponse searchResponse = action.buildSearchResponse(internalSearchResponse, action.buildShardFailures(), null, null); - assertSame(searchResponse.getAggregations(), internalSearchResponse.aggregations()); - assertSame(searchResponse.getSuggest(), internalSearchResponse.suggest()); - assertSame(searchResponse.getProfileResults(), internalSearchResponse.profile()); - assertSame(searchResponse.getHits(), internalSearchResponse.hits()); - } - - public void testBuildSearchResponseAllowPartialFailures() { - SearchRequest searchRequest = new SearchRequest().allowPartialSearchResults(true); - final ArraySearchPhaseResults queryResult = new ArraySearchPhaseResults<>(10); - AbstractSearchAsyncAction action = createAction(searchRequest, queryResult, null, false, new AtomicLong()); - action.onShardFailure(0, new SearchShardTarget("node", new ShardId("index", "index-uuid", 0), null, OriginalIndices.NONE), - new IllegalArgumentException()); - InternalSearchResponse internalSearchResponse = 
InternalSearchResponse.empty(); - SearchResponse searchResponse = action.buildSearchResponse(internalSearchResponse, action.buildShardFailures(), null, null); - assertSame(searchResponse.getAggregations(), internalSearchResponse.aggregations()); - assertSame(searchResponse.getSuggest(), internalSearchResponse.suggest()); - assertSame(searchResponse.getProfileResults(), internalSearchResponse.profile()); - assertSame(searchResponse.getHits(), internalSearchResponse.hits()); - } - public void testSendSearchResponseDisallowPartialFailures() { SearchRequest searchRequest = new SearchRequest().allowPartialSearchResults(false); AtomicReference exception = new AtomicReference<>();