From 3ab7846ee89ff6d170b66ac0595807599067ede8 Mon Sep 17 00:00:00 2001 From: joe-mongodb Date: Thu, 7 Nov 2024 12:04:37 -0600 Subject: [PATCH] SERVER-96064: Optimize away metadata $sort directly after $vectorSearch for single node environments (#28699) GitOrigin-RevId: 0ff41f52dc290e20210257ecfd43fa711610a414 --- jstests/libs/query/analyze_plan.js | 17 +++ .../sort_after_vector_search_optimization.js | 120 ++++++++++++++++++ .../search/document_source_vector_search.cpp | 37 ++++++ .../search/document_source_vector_search.h | 17 +++ src/mongo/db/query/sort_pattern.cpp | 27 ++++ src/mongo/db/query/sort_pattern.h | 13 ++ src/mongo/db/query/sort_pattern_test.cpp | 60 +++++++++ 7 files changed, 291 insertions(+) create mode 100644 jstests/with_mongot/e2e/sort_after_vector_search_optimization.js diff --git a/jstests/libs/query/analyze_plan.js b/jstests/libs/query/analyze_plan.js index 3fa49b8ee78..7ac603d835e 100644 --- a/jstests/libs/query/analyze_plan.js +++ b/jstests/libs/query/analyze_plan.js @@ -1214,3 +1214,20 @@ export function canonicalizePlan(p) { }); } } + +/** + * Returns index of stage in an aggregation pipeline stage plan running on a single node + * (will not work for sharded clusters). + * 'root' is root of explain JSON. + * Returns -1 if stage does not exist. 
+ */ +export function getIndexOfStageOnSingleNode(root, stageName) { + if (root.hasOwnProperty("stages")) { + for (let i = 0; i < root.stages.length; i++) { + if (root.stages[i].hasOwnProperty(stageName)) { + return i; + } + } + } + return -1; +} diff --git a/jstests/with_mongot/e2e/sort_after_vector_search_optimization.js b/jstests/with_mongot/e2e/sort_after_vector_search_optimization.js new file mode 100644 index 00000000000..ec2785a652a --- /dev/null +++ b/jstests/with_mongot/e2e/sort_after_vector_search_optimization.js @@ -0,0 +1,120 @@ +/** + * Test an aggregation pipeline optimization where a $sort stage can be removed after + * a $vectorSearch stage, given that the $sort is on the same criteria that the $vectorSearch + * results are sorted by (the 'vectorSearchScore'). + * + * Also, this test should only run in single-node environments because a $sort after a $vectorSearch + * in a sharded cluster will end up with the $vectorSearch on mongod and $sort on mongos. + * + * @tags: [featureFlagSearchHybridScoringPrerequisites, assumes_against_mongod_not_mongos] + */ + +import {getIndexOfStageOnSingleNode} from "jstests/libs/query/analyze_plan.js"; +import {createSearchIndex, dropSearchIndex} from "jstests/libs/search.js"; + +// Helper functions to check if optimization being tested for exists: + +function assertSortExistsAfterVectorSearch(aggPipeline) { + let explain = coll.explain().aggregate(aggPipeline); + // $vectorSearch must be the first step of the pipeline + assert( + getIndexOfStageOnSingleNode(explain, "$vectorSearch") == 0, + "'$vectorSearch' is not first step of the pipeline. explain for query: " + tojson(explain)); + // A $sort stage must exist somewhere in the pipeline after $vectorSearch. + assert(getIndexOfStageOnSingleNode(explain, "$sort") > 0, + "'$sort' does not exist in the pipeline after $search. 
explain for query: " + + tojson(explain)); +} + +function assertNoSortExistsAfterVectorSearch(aggPipeline) { + let explain = coll.explain().aggregate(aggPipeline); + // $vectorSearch must be the first step of the pipeline + assert( + getIndexOfStageOnSingleNode(explain, "$vectorSearch") == 0, + "'$vectorSearch' is not first step of the pipeline. explain for query: " + tojson(explain)); + // A $sort stage must not exist anywhere in the pipeline after $vectorSearch. + assert( + getIndexOfStageOnSingleNode(explain, "$sort") < 0, + "'$sort' does exist in the pipeline after $search. explain for query: " + tojson(explain)); +} + +const coll = db.foo; +coll.drop(); + +assert.commandWorked(coll.insertMany( + [{a: -1, v: [1, 0, 8, 1, 8]}, {a: 100, v: [2, -2, 1, 4, 4]}, {a: 10, v: [4, 10, -8, 22, 0]}])); + +const indexName = "sort-after-vector-search-test-index"; +// Create a vector search index on the 5-dimensional 'v' field. +const vectorIndex = { + name: indexName, + type: "vectorSearch", + definition: + {"fields": [{"type": "vector", "numDimensions": 5, "path": "v", "similarity": "euclidean"}]} +}; + +createSearchIndex(coll, vectorIndex); + +const vectorSearchQuery = { + queryVector: [2, 4, -8, 2, 10], + path: "v", + numCandidates: 3, + index: indexName, + limit: 3, +}; + +// Run test cases: +// +// Cases where optimization applies and $sort should be removed: + +// Standard case where a single sort on 'vectorSearchScore' should be removed. +assertNoSortExistsAfterVectorSearch([ + {$vectorSearch: vectorSearchQuery}, + {$sort: {score: {$meta: "vectorSearchScore"}}}, +]); + +// Multiple sorts in a row should all be removed. +assertNoSortExistsAfterVectorSearch([ + {$vectorSearch: vectorSearchQuery}, + {$sort: {score: {$meta: "vectorSearchScore"}}}, + {$sort: {score: {$meta: "vectorSearchScore"}}}, + {$sort: {score: {$meta: "vectorSearchScore"}}}, + {$limit: 10}, +]); + +// Implicit $sort after $vectorSearch from desugared $setWindowFields should get removed. 
+assertNoSortExistsAfterVectorSearch([ + {$vectorSearch: vectorSearchQuery}, + {$setWindowFields: {sortBy: {score: {$meta: "vectorSearchScore"}}, output: {rank: {$rank: {}}}}} +]); + +// Mixed explicit and implicit $sort after $vectorSearch should both get removed. +assertNoSortExistsAfterVectorSearch([ + {$vectorSearch: vectorSearchQuery}, + {$sort: {score: {$meta: "vectorSearchScore"}}}, + {$setWindowFields: {sortBy: {score: {$meta: "vectorSearchScore"}}, output: {rank: {$rank: {}}}}} +]); + +// Cases where optimization should not apply and $sort should remain: + +// Explicit $sort that does not sort on 'vectorSearchScore' should not be removed. +assertSortExistsAfterVectorSearch([ + {$vectorSearch: vectorSearchQuery}, + {$sort: {a: 1}}, +]); + +// $sort with multi-field criteria on 'vectorSearchScore' and another field should not be removed. +assertSortExistsAfterVectorSearch([ + {$vectorSearch: vectorSearchQuery}, + {$sort: {score: {$meta: "vectorSearchScore"}, a: 1}}, +]); + +// Currently cannot optimize $sort that is not directly after $vectorSearch. +// TODO SERVER-96068: check that $sort is removed for these types of pipelines. 
+assertSortExistsAfterVectorSearch([ + {$vectorSearch: vectorSearchQuery}, + {$limit: 10}, + {$sort: {score: {$meta: "vectorSearchScore"}}}, +]); + +dropSearchIndex(coll, {name: indexName}); diff --git a/src/mongo/db/pipeline/search/document_source_vector_search.cpp b/src/mongo/db/pipeline/search/document_source_vector_search.cpp index e9e75536449..355b33d1388 100644 --- a/src/mongo/db/pipeline/search/document_source_vector_search.cpp +++ b/src/mongo/db/pipeline/search/document_source_vector_search.cpp @@ -30,6 +30,7 @@ #include "mongo/db/pipeline/search/document_source_vector_search.h" #include "mongo/base/string_data.h" +#include "mongo/db/pipeline/document_source_sort.h" #include "mongo/db/pipeline/search/document_source_internal_search_id_lookup.h" #include "mongo/db/pipeline/search/lite_parsed_search.h" #include "mongo/db/pipeline/search/vector_search_helper.h" @@ -260,8 +261,44 @@ std::list> DocumentSourceVectorSearch::desugar() { return desugaredPipeline; } +std::pair +DocumentSourceVectorSearch::_attemptSortAfterVectorSearchOptimization( + Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container) { + auto isSortOnVectorSearchMeta = [](const SortPattern& sortPattern) -> bool { + return isSortOnSingleMetaField(sortPattern, + (1 << DocumentMetadataFields::MetaType::kVectorSearchScore)); + }; + auto optItr = std::next(itr); + if (optItr != container->end()) { + if (auto sortStage = dynamic_cast(optItr->get())) { + // A $sort stage has been found directly after this stage. + // $vectorSearch results are always sorted by 'vectorSearchScore', + // so if the $sort stage is also sorted by 'vectorSearchScore', the $sort stage + // is redundant and can safely be removed. + if (isSortOnVectorSearchMeta(sortStage->getSortKeyPattern())) { + // Optimization successful. + container->remove(*optItr); + return {itr, true}; // Return the same pointer in case there are other + // optimizations to still be applied. 
+ } + } + } + + // Optimization not possible. + return {itr, false}; +} + Pipeline::SourceContainer::iterator DocumentSourceVectorSearch::doOptimizeAt( Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container) { + // Attempt to remove a $sort on metadata after this $vectorSearch stage. + { + const auto&& [returnItr, optimizationSucceeded] = + _attemptSortAfterVectorSearchOptimization(itr, container); + if (optimizationSucceeded) { + return returnItr; + } + } + auto stageItr = std::next(itr); // Only attempt to get the limit from the query if there are further stages in the pipeline. if (stageItr != container->end()) { diff --git a/src/mongo/db/pipeline/search/document_source_vector_search.h b/src/mongo/db/pipeline/search/document_source_vector_search.h index 860bff9da45..79f5ca600dd 100644 --- a/src/mongo/db/pipeline/search/document_source_vector_search.h +++ b/src/mongo/db/pipeline/search/document_source_vector_search.h @@ -121,6 +121,23 @@ private: // Initialize metrics related to the $vectorSearch stage on the OpDebug object. void initializeOpDebugVectorSearchMetrics(); + /** + * Attempts a pipeline optimization that removes a $sort stage that comes after the output + * of mongot, if the resulting documents from mongot are sorted by the same criteria as the + * $sort ('vectorSearchScore'). + * + * Also, this optimization only applies to cases where the $sort comes directly after this + * stage. + * TODO SERVER-96068 generalize this optimization to cases where any number of stages that + * preserve sort order come between this stage and the sort. + * + * Returns a pair of the iterator to return to the optimizer, and a bool of whether or not the + * optimization was successful. If optimization was successful, the container will be modified + * appropriately. 
+ */ + std::pair _attemptSortAfterVectorSearchOptimization( + Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container); + std::unique_ptr _filterExpr; std::shared_ptr _taskExecutor; diff --git a/src/mongo/db/query/sort_pattern.cpp b/src/mongo/db/query/sort_pattern.cpp index 23f63b98ae3..488d5001b1d 100644 --- a/src/mongo/db/query/sort_pattern.cpp +++ b/src/mongo/db/query/sort_pattern.cpp @@ -198,4 +198,31 @@ bool SortPattern::isExtensionOf(const SortPattern& other) const { } return true; } + +bool isSortOnSingleMetaField(const SortPattern& sortPattern, + QueryMetadataBitSet metadataToConsider) { + // Exactly 1 expression in the sort pattern is needed. + if (sortPattern.begin() == sortPattern.end() || + std::next(sortPattern.begin()) != sortPattern.end()) { + // 0 parts, or more than 1 part. + return false; + } + const auto& firstAndOnlyPart = *sortPattern.begin(); + if (auto* expr = firstAndOnlyPart.expression.get()) { + if (auto metaExpr = dynamic_cast(expr)) { + if (metadataToConsider.none()) { + // Any metadata field. + return true; + } + for (std::size_t i = 1; i < DocumentMetadataFields::kNumFields; ++i) { + if (metadataToConsider[i] && + metaExpr->getMetaType() == static_cast(i)) { + return true; + } + } + return false; + } + } + return false; +} } // namespace mongo diff --git a/src/mongo/db/query/sort_pattern.h b/src/mongo/db/query/sort_pattern.h index 1cc6d4a4ab9..cca706a07a0 100644 --- a/src/mongo/db/query/sort_pattern.h +++ b/src/mongo/db/query/sort_pattern.h @@ -172,4 +172,17 @@ private: // The set of paths on which we're sorting. OrderedPathSet _paths; }; + +/** + * Returns true if 'sortPattern' represents a sort pattern on a single metadata field like: + * {score: {$meta: "searchScore"}}. + * + * Sort clause must only be on a single field, i.e. {score: {$meta: "searchScore"}, _id: 1} will + * return false. + * + * The 'metadataToConsider' field represents a bitset of all possible metadata fields to consider + * the sort is on. 
If the bitset is empty, any metadata will be considered. + */ +bool isSortOnSingleMetaField(const SortPattern& sortPattern, + QueryMetadataBitSet metadataToConsider = QueryMetadataBitSet{}); } // namespace mongo diff --git a/src/mongo/db/query/sort_pattern_test.cpp b/src/mongo/db/query/sort_pattern_test.cpp index 082ca6c9ca1..57b4beba9f5 100644 --- a/src/mongo/db/query/sort_pattern_test.cpp +++ b/src/mongo/db/query/sort_pattern_test.cpp @@ -36,11 +36,13 @@ #include "mongo/base/string_data.h" #include "mongo/bson/json.h" #include "mongo/db/exec/document_value/document.h" +#include "mongo/db/exec/document_value/document_metadata_fields.h" #include "mongo/db/exec/document_value/document_value_test_util.h" #include "mongo/db/namespace_string.h" #include "mongo/db/pipeline/expression.h" #include "mongo/db/pipeline/expression_context_for_test.h" #include "mongo/db/query/sort_pattern.h" +#include "mongo/idl/server_parameter_test_util.h" #include "mongo/unittest/assert.h" #include "mongo/unittest/framework.h" @@ -125,5 +127,63 @@ TEST(SortStageDefaultTest, WrongSortKeyDefinition) { ASSERT_THROWS_CODE(SortPattern(std::move(sortKeys)), AssertionException, 7472501); } +// Testing expected behavior of 'isSortOnSingleMetaField()' stateless function. +TEST(IsSortOnSingleMetaFieldTest, TestingIsSortOnSingleMetaFieldFn) { + RAIIServerParameterControllerForTest searchHybridScoringPrerequisitesController( + "featureFlagSearchHybridScoringPrerequisites", true); + + auto expCtx = getExpCtx(); + + // SortPattern must have a field. + ASSERT_FALSE(isSortOnSingleMetaField(SortPattern(fromjson("{}"), expCtx))); + + // SortPattern must have one field, but it must be a metadata field. + ASSERT_FALSE(isSortOnSingleMetaField(SortPattern(fromjson("{a: 1}"), expCtx))); + + // SortPattern cannot have multiple fields. 
+ ASSERT_FALSE(isSortOnSingleMetaField(SortPattern(fromjson("{a: 1, b: 1}"), expCtx))); + + // SortPattern on a single metadata field, without QueryMetadataBitSet specified should pass for + // any valid metadata. + ASSERT_TRUE(isSortOnSingleMetaField( + SortPattern(fromjson("{score: {$meta: 'vectorSearchScore'}}"), expCtx))); + + // SortPattern on invalid metadata type should throw. + ASSERT_THROWS_CODE(isSortOnSingleMetaField( + SortPattern(fromjson("{score: {$meta: 'notRealMetadata'}}"), expCtx)), + DBException, + 31138); + + // SortPattern on valid metadata, but with multiple fields should be false. + ASSERT_FALSE(isSortOnSingleMetaField( + SortPattern(fromjson("{score: {$meta: 'vectorSearchScore'}, a: 1}"), expCtx))); + + // Explicitly specifying the metadata to consider, matching the metadata in the SortPattern + // should pass. + ASSERT_TRUE(isSortOnSingleMetaField( + SortPattern(fromjson("{score: {$meta: 'vectorSearchScore'}}"), expCtx), + (1 << DocumentMetadataFields::MetaType::kVectorSearchScore))); + + // Explicitly specifying the metadata to consider, that does not match the metadata in the + // SortPattern should fail. + ASSERT_FALSE( + isSortOnSingleMetaField(SortPattern(fromjson("{score: {$meta: 'searchScore'}}"), expCtx), + (1 << DocumentMetadataFields::MetaType::kVectorSearchScore))); + + // Explicitly specifying multiple metadata to consider, one of them matching the metadata in the + // SortPattern should pass. + ASSERT_TRUE(isSortOnSingleMetaField( + SortPattern(fromjson("{score: {$meta: 'vectorSearchScore'}}"), expCtx), + ((1 << DocumentMetadataFields::MetaType::kSearchScore) | + (1 << DocumentMetadataFields::MetaType::kVectorSearchScore)))); + + // Explicitly specifying multiple metadata to consider, neither of them matching the metadata in + // the SortPattern should fail. 
+ ASSERT_FALSE(isSortOnSingleMetaField( + SortPattern(fromjson("{score: {$meta: 'vectorSearchScore'}}"), expCtx), + ((1 << DocumentMetadataFields::MetaType::kSearchScore) | + (1 << DocumentMetadataFields::MetaType::kScore)))); +} + } // namespace } // namespace mongo