mirror of
https://github.com/mongodb/mongo.git
synced 2024-11-21 12:39:08 +01:00
SERVER-96064: Optimize away metadata $sort directly after $vectorSearch for single node environments (#28699)
GitOrigin-RevId: 0ff41f52dc290e20210257ecfd43fa711610a414
This commit is contained in:
parent
8ca48aacef
commit
3ab7846ee8
@ -1214,3 +1214,20 @@ export function canonicalizePlan(p) {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the index of the first stage named 'stageName' in the top-level 'stages' array of an
 * aggregation explain output produced by a single node (will not work for sharded clusters).
 *
 * @param {Object} root - Root of the explain JSON.
 * @param {string} stageName - Stage key to look for, e.g. "$sort".
 * @returns {number} Zero-based index of the first matching stage, or -1 if 'root' has no own
 *     'stages' array or no stage in it has 'stageName' as an own property.
 */
export function getIndexOfStageOnSingleNode(root, stageName) {
    // Borrow hasOwnProperty from Object.prototype so that objects which shadow it, or which have
    // no prototype at all, are handled the same way as plain objects.
    const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

    // Explain output without a top-level 'stages' array cannot contain the stage.
    if (root == null || !hasOwn(root, "stages") || !Array.isArray(root.stages)) {
        return -1;
    }

    // findIndex() already yields -1 when nothing matches. Null entries are skipped, not an error.
    return root.stages.findIndex((stage) => stage != null && hasOwn(stage, stageName));
}
|
||||
|
120
jstests/with_mongot/e2e/sort_after_vector_search_optimization.js
Normal file
120
jstests/with_mongot/e2e/sort_after_vector_search_optimization.js
Normal file
@ -0,0 +1,120 @@
|
||||
/**
|
||||
* Test an aggregation pipeline optimization where a $sort stage can be removed after
|
||||
* a $vectorSearch stage, given that the $sort is on the same criteria that the $vectorSearch
|
||||
* results are sorted by (the 'vectorSearchScore').
|
||||
*
|
||||
* Also, this test should only run in single-node environments because a $sort after a $vectorSearch
|
||||
* in a sharded cluster will end up with the $vectorSearch on mongod and $sort on mongos.
|
||||
*
|
||||
* @tags: [featureFlagSearchHybridScoringPrerequisites, assumes_against_mongod_not_mongos]
|
||||
*/
|
||||
|
||||
import {getIndexOfStageOnSingleNode} from "jstests/libs/query/analyze_plan.js";
|
||||
import {createSearchIndex, dropSearchIndex} from "jstests/libs/search.js";
|
||||
|
||||
// Helper functions to check if optimization being tested for exists:
|
||||
|
||||
/**
 * Asserts that the explain output for 'aggPipeline' has $vectorSearch as its first stage and
 * still contains a $sort stage after it, i.e. the $sort was not optimized away.
 *
 * @param {Object[]} aggPipeline - Aggregation pipeline to explain against 'coll'.
 */
function assertSortExistsAfterVectorSearch(aggPipeline) {
    const explain = coll.explain().aggregate(aggPipeline);

    // $vectorSearch must be the first stage of the pipeline.
    assert(
        getIndexOfStageOnSingleNode(explain, "$vectorSearch") === 0,
        "'$vectorSearch' is not first step of the pipeline. explain for query: " + tojson(explain));

    // A $sort stage must exist somewhere in the pipeline after $vectorSearch. Index 0 is already
    // known to be $vectorSearch, so any found $sort has an index greater than 0.
    assert(getIndexOfStageOnSingleNode(explain, "$sort") > 0,
           "'$sort' does not exist in the pipeline after $vectorSearch. explain for query: " +
               tojson(explain));
}
|
||||
|
||||
/**
 * Asserts that the explain output for 'aggPipeline' has $vectorSearch as its first stage and
 * contains no $sort stage at all, i.e. every $sort was optimized away.
 *
 * @param {Object[]} aggPipeline - Aggregation pipeline to explain against 'coll'.
 */
function assertNoSortExistsAfterVectorSearch(aggPipeline) {
    const explain = coll.explain().aggregate(aggPipeline);

    // $vectorSearch must be the first stage of the pipeline.
    assert(
        getIndexOfStageOnSingleNode(explain, "$vectorSearch") === 0,
        "'$vectorSearch' is not first step of the pipeline. explain for query: " + tojson(explain));

    // No $sort stage may exist anywhere in the pipeline after $vectorSearch; the lookup helper
    // reports a missing stage with a negative index.
    assert(getIndexOfStageOnSingleNode(explain, "$sort") < 0,
           "'$sort' does exist in the pipeline after $vectorSearch. explain for query: " +
               tojson(explain));
}
|
||||
|
||||
const coll = db.foo;
|
||||
coll.drop();
|
||||
|
||||
assert.commandWorked(coll.insertMany(
|
||||
[{a: -1, v: [1, 0, 8, 1, 8]}, {a: 100, v: [2, -2, 1, 4, 4]}, {a: 10, v: [4, 10, -8, 22, 0]}]));
|
||||
|
||||
const indexName = "sort-after-vector-search-test-index";
|
||||
// Create a vector search index on the 5-dimensional 'v' field of the test documents.
|
||||
const vectorIndex = {
|
||||
name: indexName,
|
||||
type: "vectorSearch",
|
||||
definition:
|
||||
{"fields": [{"type": "vector", "numDimensions": 5, "path": "v", "similarity": "euclidean"}]}
|
||||
};
|
||||
|
||||
createSearchIndex(coll, vectorIndex);
|
||||
|
||||
const vectorSearchQuery = {
|
||||
queryVector: [2, 4, -8, 2, 10],
|
||||
path: "v",
|
||||
numCandidates: 3,
|
||||
index: indexName,
|
||||
limit: 3,
|
||||
};
|
||||
|
||||
// Run test cases:
|
||||
//
|
||||
// Cases where optimization applies and $sort should be removed:
|
||||
|
||||
// Standard case where a single sort on 'vectorSearchScore' should be removed.
|
||||
assertNoSortExistsAfterVectorSearch([
|
||||
{$vectorSearch: vectorSearchQuery},
|
||||
{$sort: {score: {$meta: "vectorSearchScore"}}},
|
||||
]);
|
||||
|
||||
// Multiple sorts in a row should all be removed.
|
||||
assertNoSortExistsAfterVectorSearch([
|
||||
{$vectorSearch: vectorSearchQuery},
|
||||
{$sort: {score: {$meta: "vectorSearchScore"}}},
|
||||
{$sort: {score: {$meta: "vectorSearchScore"}}},
|
||||
{$sort: {score: {$meta: "vectorSearchScore"}}},
|
||||
{$limit: 10},
|
||||
]);
|
||||
|
||||
// Implicit $sort after $vectorSearch from desugared $setWindowFields should get removed.
|
||||
assertNoSortExistsAfterVectorSearch([
|
||||
{$vectorSearch: vectorSearchQuery},
|
||||
{$setWindowFields: {sortBy: {score: {$meta: "vectorSearchScore"}}, output: {rank: {$rank: {}}}}}
|
||||
]);
|
||||
|
||||
// Mixed explicit and implicit $sort after $vectorSearch should both get removed.
|
||||
assertNoSortExistsAfterVectorSearch([
|
||||
{$vectorSearch: vectorSearchQuery},
|
||||
{$sort: {score: {$meta: "vectorSearchScore"}}},
|
||||
{$setWindowFields: {sortBy: {score: {$meta: "vectorSearchScore"}}, output: {rank: {$rank: {}}}}}
|
||||
]);
|
||||
|
||||
// Cases where optimization should not apply and $sort should remain:
|
||||
|
||||
// Explicit $sort that does not sort on 'vectorSearchScore' should not be removed.
|
||||
assertSortExistsAfterVectorSearch([
|
||||
{$vectorSearch: vectorSearchQuery},
|
||||
{$sort: {a: 1}},
|
||||
]);
|
||||
|
||||
// $sort with multi-field criteria on 'vectorSearchScore' and another field should not be removed.
|
||||
assertSortExistsAfterVectorSearch([
|
||||
{$vectorSearch: vectorSearchQuery},
|
||||
{$sort: {score: {$meta: "vectorSearchScore"}, a: 1}},
|
||||
]);
|
||||
|
||||
// Currently cannot optimize $sort that is not directly after $vectorSearch.
|
||||
// TODO SERVER-96068: check that $sort is removed for these types of pipelines.
|
||||
assertSortExistsAfterVectorSearch([
|
||||
{$vectorSearch: vectorSearchQuery},
|
||||
{$limit: 10},
|
||||
{$sort: {score: {$meta: "vectorSearchScore"}}},
|
||||
]);
|
||||
|
||||
dropSearchIndex(coll, {name: indexName});
|
@ -30,6 +30,7 @@
|
||||
#include "mongo/db/pipeline/search/document_source_vector_search.h"
|
||||
|
||||
#include "mongo/base/string_data.h"
|
||||
#include "mongo/db/pipeline/document_source_sort.h"
|
||||
#include "mongo/db/pipeline/search/document_source_internal_search_id_lookup.h"
|
||||
#include "mongo/db/pipeline/search/lite_parsed_search.h"
|
||||
#include "mongo/db/pipeline/search/vector_search_helper.h"
|
||||
@ -260,8 +261,44 @@ std::list<intrusive_ptr<DocumentSource>> DocumentSourceVectorSearch::desugar() {
|
||||
return desugaredPipeline;
|
||||
}
|
||||
|
||||
std::pair<Pipeline::SourceContainer::iterator, bool>
|
||||
DocumentSourceVectorSearch::_attemptSortAfterVectorSearchOptimization(
|
||||
Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container) {
|
||||
auto isSortOnVectorSearchMeta = [](const SortPattern& sortPattern) -> bool {
|
||||
return isSortOnSingleMetaField(sortPattern,
|
||||
(1 << DocumentMetadataFields::MetaType::kVectorSearchScore));
|
||||
};
|
||||
auto optItr = std::next(itr);
|
||||
if (optItr != container->end()) {
|
||||
if (auto sortStage = dynamic_cast<DocumentSourceSort*>(optItr->get())) {
|
||||
// A $sort stage has been found directly after this stage.
|
||||
// $vectorSearch results are always sorted by 'vectorSearchScore',
|
||||
// so if the $sort stage is also sorted by 'vectorSearchScore', the $sort stage
|
||||
// is redundant and can safely be removed.
|
||||
if (isSortOnVectorSearchMeta(sortStage->getSortKeyPattern())) {
|
||||
// Optimization successful.
|
||||
container->remove(*optItr);
|
||||
return {itr, true}; // Return the same pointer in case there are other
|
||||
// optimizations to still be applied.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Optimization not possible.
|
||||
return {itr, false};
|
||||
}
|
||||
|
||||
Pipeline::SourceContainer::iterator DocumentSourceVectorSearch::doOptimizeAt(
|
||||
Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container) {
|
||||
// Attempt to remove a $sort on metadata after this $vectorSearch stage.
|
||||
{
|
||||
const auto&& [returnItr, optimizationSucceeded] =
|
||||
_attemptSortAfterVectorSearchOptimization(itr, container);
|
||||
if (optimizationSucceeded) {
|
||||
return returnItr;
|
||||
}
|
||||
}
|
||||
|
||||
auto stageItr = std::next(itr);
|
||||
// Only attempt to get the limit from the query if there are further stages in the pipeline.
|
||||
if (stageItr != container->end()) {
|
||||
|
@ -121,6 +121,23 @@ private:
|
||||
// Initialize metrics related to the $vectorSearch stage on the OpDebug object.
|
||||
void initializeOpDebugVectorSearchMetrics();
|
||||
|
||||
/**
|
||||
* Attempts a pipeline optimization that removes a $sort stage that comes after the output of
|
||||
* mongot, if the resulting documents from mongot are sorted by the same criteria as the
|
||||
* $sort ('vectorSearchScore').
|
||||
*
|
||||
* Also, this optimization only applies to cases where the $sort comes directly after this
|
||||
* stage.
|
||||
* TODO SERVER-96068 generalize this optimization to cases where any number of stages that
|
||||
* preserve sort order come between this stage and the sort.
|
||||
*
|
||||
* Returns a pair of the iterator to return to the optimizer, and a bool of whether or not the
|
||||
* optimization was successful. If optimization was successful, the container will be modified
|
||||
* appropriately.
|
||||
*/
|
||||
std::pair<Pipeline::SourceContainer::iterator, bool> _attemptSortAfterVectorSearchOptimization(
|
||||
Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container);
|
||||
|
||||
std::unique_ptr<MatchExpression> _filterExpr;
|
||||
|
||||
std::shared_ptr<executor::TaskExecutor> _taskExecutor;
|
||||
|
@ -198,4 +198,31 @@ bool SortPattern::isExtensionOf(const SortPattern& other) const {
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool isSortOnSingleMetaField(const SortPattern& sortPattern,
|
||||
QueryMetadataBitSet metadataToConsider) {
|
||||
// Exactly 1 expression in the sort pattern is needed.
|
||||
if (sortPattern.begin() == sortPattern.end() ||
|
||||
std::next(sortPattern.begin()) != sortPattern.end()) {
|
||||
// 0 parts, or more than 1 part.
|
||||
return false;
|
||||
}
|
||||
const auto& firstAndOnlyPart = *sortPattern.begin();
|
||||
if (auto* expr = firstAndOnlyPart.expression.get()) {
|
||||
if (auto metaExpr = dynamic_cast<ExpressionMeta*>(expr)) {
|
||||
if (metadataToConsider.none()) {
|
||||
// Any metadata field.
|
||||
return true;
|
||||
}
|
||||
for (std::size_t i = 1; i < DocumentMetadataFields::kNumFields; ++i) {
|
||||
if (metadataToConsider[i] &&
|
||||
metaExpr->getMetaType() == static_cast<DocumentMetadataFields::MetaType>(i)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
} // namespace mongo
|
||||
|
@ -172,4 +172,17 @@ private:
|
||||
// The set of paths on which we're sorting.
|
||||
OrderedPathSet _paths;
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns true if 'sortPattern' represents a sort pattern on a single metadata field like:
|
||||
* {score: {$meta: "searchScore"}}.
|
||||
*
|
||||
* Sort clause must only be on a single field, i.e. {score: {$meta: "searchScore"}, _id: 1} will
|
||||
* return false.
|
||||
*
|
||||
* The 'metadataToConsider' field represents a bitset of all possible metadata fields to consider
|
||||
* the sort is on. If the bitset is empty, any metadata will be considered.
|
||||
*/
|
||||
bool isSortOnSingleMetaField(const SortPattern& sortPattern,
|
||||
QueryMetadataBitSet metadataToConsider = QueryMetadataBitSet{});
|
||||
} // namespace mongo
|
||||
|
@ -36,11 +36,13 @@
|
||||
#include "mongo/base/string_data.h"
|
||||
#include "mongo/bson/json.h"
|
||||
#include "mongo/db/exec/document_value/document.h"
|
||||
#include "mongo/db/exec/document_value/document_metadata_fields.h"
|
||||
#include "mongo/db/exec/document_value/document_value_test_util.h"
|
||||
#include "mongo/db/namespace_string.h"
|
||||
#include "mongo/db/pipeline/expression.h"
|
||||
#include "mongo/db/pipeline/expression_context_for_test.h"
|
||||
#include "mongo/db/query/sort_pattern.h"
|
||||
#include "mongo/idl/server_parameter_test_util.h"
|
||||
#include "mongo/unittest/assert.h"
|
||||
#include "mongo/unittest/framework.h"
|
||||
|
||||
@ -125,5 +127,63 @@ TEST(SortStageDefaultTest, WrongSortKeyDefinition) {
|
||||
ASSERT_THROWS_CODE(SortPattern(std::move(sortKeys)), AssertionException, 7472501);
|
||||
}
|
||||
|
||||
// Testing expected behavior of 'isSortOnSingleMetaField()' stateless function.
|
||||
TEST(IsSortOnSingleMetaFieldTest, TestingIsSortOnSingleMetaFieldFn) {
|
||||
RAIIServerParameterControllerForTest searchHybridScoringPrerequisitesController(
|
||||
"featureFlagSearchHybridScoringPrerequisites", true);
|
||||
|
||||
auto expCtx = getExpCtx();
|
||||
|
||||
// SortPattern must have a field.
|
||||
ASSERT_FALSE(isSortOnSingleMetaField(SortPattern(fromjson("{}"), expCtx)));
|
||||
|
||||
// SortPattern must have one field, but it must be a metadata field.
|
||||
ASSERT_FALSE(isSortOnSingleMetaField(SortPattern(fromjson("{a: 1}"), expCtx)));
|
||||
|
||||
// SortPattern cannot have multiple fields.
|
||||
ASSERT_FALSE(isSortOnSingleMetaField(SortPattern(fromjson("{a: 1, b: 1}"), expCtx)));
|
||||
|
||||
// SortPattern on a single metadata field, without QueryMetadataBitSet specified should pass for
|
||||
// any valid metadata.
|
||||
ASSERT_TRUE(isSortOnSingleMetaField(
|
||||
SortPattern(fromjson("{score: {$meta: 'vectorSearchScore'}}"), expCtx)));
|
||||
|
||||
// SortPattern on invalid metadata type should throw.
|
||||
ASSERT_THROWS_CODE(isSortOnSingleMetaField(
|
||||
SortPattern(fromjson("{score: {$meta: 'notRealMetadata'}}"), expCtx)),
|
||||
DBException,
|
||||
31138);
|
||||
|
||||
// SortPattern on valid metadata, but with multiple fields should be false.
|
||||
ASSERT_FALSE(isSortOnSingleMetaField(
|
||||
SortPattern(fromjson("{score: {$meta: 'vectorSearchScore'}, a: 1}"), expCtx)));
|
||||
|
||||
// Explicitly specifying the metadata to consider, matching the metadata in the SortPattern
|
||||
// should pass.
|
||||
ASSERT_TRUE(isSortOnSingleMetaField(
|
||||
SortPattern(fromjson("{score: {$meta: 'vectorSearchScore'}}"), expCtx),
|
||||
(1 << DocumentMetadataFields::MetaType::kVectorSearchScore)));
|
||||
|
||||
// Explicitly specifying the metadata to consider, that does not match the metadata in the
|
||||
// SortPattern should fail.
|
||||
ASSERT_FALSE(
|
||||
isSortOnSingleMetaField(SortPattern(fromjson("{score: {$meta: 'searchScore'}}"), expCtx),
|
||||
(1 << DocumentMetadataFields::MetaType::kVectorSearchScore)));
|
||||
|
||||
// Explicitly specifying multiple metadata to consider, one of them matching the metadata in the
|
||||
// SortPattern should pass.
|
||||
ASSERT_TRUE(isSortOnSingleMetaField(
|
||||
SortPattern(fromjson("{score: {$meta: 'vectorSearchScore'}}"), expCtx),
|
||||
((1 << DocumentMetadataFields::MetaType::kSearchScore) |
|
||||
(1 << DocumentMetadataFields::MetaType::kVectorSearchScore))));
|
||||
|
||||
// Explicitly specifying multiple metadata to consider, neither of them matching the metadata in
|
||||
// the SortPattern should fail.
|
||||
ASSERT_FALSE(isSortOnSingleMetaField(
|
||||
SortPattern(fromjson("{score: {$meta: 'vectorSearchScore'}}"), expCtx),
|
||||
((1 << DocumentMetadataFields::MetaType::kSearchScore) |
|
||||
(1 << DocumentMetadataFields::MetaType::kScore))));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace mongo
|
||||
|
Loading…
Reference in New Issue
Block a user