0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-11-21 12:39:08 +01:00

SERVER-96064: Optimize away metadata $sort directly after $vectorSearch for single node environments (#28699)

GitOrigin-RevId: 0ff41f52dc290e20210257ecfd43fa711610a414
This commit is contained in:
joe-mongodb 2024-11-07 12:04:37 -06:00 committed by MongoDB Bot
parent 8ca48aacef
commit 3ab7846ee8
7 changed files with 291 additions and 0 deletions

View File

@ -1214,3 +1214,20 @@ export function canonicalizePlan(p) {
});
}
}
/**
 * Locates a stage inside the 'stages' array of an aggregation explain output produced on a single
 * node (this will not work for explain output from sharded clusters).
 *
 * @param {Object} root - root of the explain JSON.
 * @param {string} stageName - name of the stage to look for, e.g. "$sort".
 * @returns {number} index of the first entry of 'root.stages' that has 'stageName' as an own
 *     property, or -1 if 'root' has no 'stages' field or no entry matches.
 */
export function getIndexOfStageOnSingleNode(root, stageName) {
    // Without a 'stages' array there is nothing to search.
    if (!root.hasOwnProperty("stages")) {
        return -1;
    }
    // findIndex() already yields -1 when no stage matches.
    return root.stages.findIndex((stage) => stage.hasOwnProperty(stageName));
}

View File

@ -0,0 +1,120 @@
/**
* Test an aggregation pipeline optimization where a $sort stage can be removed after
* a $vectorSearch stage, given that the $sort is on the same criteria that the $vectorSearch
* results are sorted by (the 'vectorSearchScore').
*
* Also, this test should only run in single-node environments because a $sort after a $vectorSearch
* in a sharded cluster will end up with the $vectorSearch on mongod and $sort on mongos.
*
* @tags: [featureFlagSearchHybridScoringPrerequisites, assumes_against_mongod_not_mongos]
*/
import {getIndexOfStageOnSingleNode} from "jstests/libs/query/analyze_plan.js";
import {createSearchIndex, dropSearchIndex} from "jstests/libs/search.js";
// Helper functions to check if optimization being tested for exists:
/**
 * Asserts that the explain output for 'aggPipeline' (run against 'coll') has $vectorSearch as its
 * first stage and still contains a $sort stage after it, i.e. the $sort was NOT optimized away.
 *
 * @param {Object[]} aggPipeline - aggregation pipeline to explain.
 */
function assertSortExistsAfterVectorSearch(aggPipeline) {
    const explain = coll.explain().aggregate(aggPipeline);
    // $vectorSearch must be the first step of the pipeline.
    assert(
        getIndexOfStageOnSingleNode(explain, "$vectorSearch") === 0,
        "'$vectorSearch' is not first step of the pipeline. explain for query: " + tojson(explain));
    // A $sort stage must exist somewhere in the pipeline after $vectorSearch (index 0), so its
    // index has to be strictly positive.
    assert(getIndexOfStageOnSingleNode(explain, "$sort") > 0,
           "'$sort' does not exist in the pipeline after $vectorSearch. explain for query: " +
               tojson(explain));
}
/**
 * Asserts that the explain output for 'aggPipeline' (run against 'coll') has $vectorSearch as its
 * first stage and contains no $sort stage at all, i.e. the $sort WAS optimized away.
 *
 * @param {Object[]} aggPipeline - aggregation pipeline to explain.
 */
function assertNoSortExistsAfterVectorSearch(aggPipeline) {
    const explain = coll.explain().aggregate(aggPipeline);
    // $vectorSearch must be the first step of the pipeline.
    assert(
        getIndexOfStageOnSingleNode(explain, "$vectorSearch") === 0,
        "'$vectorSearch' is not first step of the pipeline. explain for query: " + tojson(explain));
    // No $sort stage may exist anywhere in the pipeline after $vectorSearch; the lookup helper
    // reports a missing stage as -1.
    assert(
        getIndexOfStageOnSingleNode(explain, "$sort") < 0,
        "'$sort' does exist in the pipeline after $vectorSearch. explain for query: " +
            tojson(explain));
}
// Fixture: a fresh collection holding three documents, each with a 5-dimensional vector in 'v'.
const coll = db.foo;
coll.drop();
const testDocs = [
    {a: -1, v: [1, 0, 8, 1, 8]},
    {a: 100, v: [2, -2, 1, 4, 4]},
    {a: 10, v: [4, 10, -8, 22, 0]},
];
assert.commandWorked(coll.insertMany(testDocs));

// Create the vector search index over the 'v' field that every $vectorSearch below queries.
const indexName = "sort-after-vector-search-test-index";
createSearchIndex(coll, {
    name: indexName,
    type: "vectorSearch",
    definition: {fields: [{type: "vector", numDimensions: 5, path: "v", similarity: "euclidean"}]},
});

// Building blocks shared by the pipelines under test.
const vectorSearchStage = {
    $vectorSearch: {
        queryVector: [2, 4, -8, 2, 10],
        path: "v",
        numCandidates: 3,
        index: indexName,
        limit: 3,
    },
};
const sortOnScoreStage = {
    $sort: {score: {$meta: "vectorSearchScore"}}
};
const rankByScoreStage = {
    $setWindowFields: {sortBy: {score: {$meta: "vectorSearchScore"}}, output: {rank: {$rank: {}}}}
};

// Cases where the optimization applies and $sort should be removed.
const pipelinesWhereSortIsRemoved = [
    // Standard case where a single sort on 'vectorSearchScore' should be removed.
    [vectorSearchStage, sortOnScoreStage],
    // Multiple sorts in a row should all be removed.
    [vectorSearchStage, sortOnScoreStage, sortOnScoreStage, sortOnScoreStage, {$limit: 10}],
    // Implicit $sort after $vectorSearch from desugared $setWindowFields should get removed.
    [vectorSearchStage, rankByScoreStage],
    // Mixed explicit and implicit $sort after $vectorSearch should both get removed.
    [vectorSearchStage, sortOnScoreStage, rankByScoreStage],
];

// Cases where the optimization should not apply and $sort should remain.
const pipelinesWhereSortRemains = [
    // Explicit $sort that does not sort on 'vectorSearchScore' should not be removed.
    [vectorSearchStage, {$sort: {a: 1}}],
    // $sort with multi-field criteria on 'vectorSearchScore' and another field should not be
    // removed.
    [vectorSearchStage, {$sort: {score: {$meta: "vectorSearchScore"}, a: 1}}],
    // Currently cannot optimize $sort that is not directly after $vectorSearch.
    // TODO SERVER-96068: check that $sort is removed for these types of pipelines.
    [vectorSearchStage, {$limit: 10}, sortOnScoreStage],
];

// Run test cases:
for (const pipeline of pipelinesWhereSortIsRemoved) {
    assertNoSortExistsAfterVectorSearch(pipeline);
}
for (const pipeline of pipelinesWhereSortRemains) {
    assertSortExistsAfterVectorSearch(pipeline);
}

dropSearchIndex(coll, {name: indexName});

View File

@ -30,6 +30,7 @@
#include "mongo/db/pipeline/search/document_source_vector_search.h"
#include "mongo/base/string_data.h"
#include "mongo/db/pipeline/document_source_sort.h"
#include "mongo/db/pipeline/search/document_source_internal_search_id_lookup.h"
#include "mongo/db/pipeline/search/lite_parsed_search.h"
#include "mongo/db/pipeline/search/vector_search_helper.h"
@ -260,8 +261,44 @@ std::list<intrusive_ptr<DocumentSource>> DocumentSourceVectorSearch::desugar() {
return desugaredPipeline;
}
/**
 * Removes a $sort stage that sits directly after this $vectorSearch stage when that $sort is on
 * the single metadata field 'vectorSearchScore'.
 *
 * 'itr' is the position of this stage in 'container'. Returns {itr, true} after erasing the $sort
 * from 'container', or {itr, false} (container untouched) when there is no next stage, the next
 * stage is not a $sort, or the $sort is on anything other than 'vectorSearchScore' alone.
 */
std::pair<Pipeline::SourceContainer::iterator, bool>
DocumentSourceVectorSearch::_attemptSortAfterVectorSearchOptimization(
    Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container) {
    auto isSortOnVectorSearchMeta = [](const SortPattern& sortPattern) -> bool {
        return isSortOnSingleMetaField(sortPattern,
                                       (1 << DocumentMetadataFields::MetaType::kVectorSearchScore));
    };

    auto optItr = std::next(itr);
    if (optItr == container->end()) {
        // Nothing follows this stage; optimization not possible.
        return {itr, false};
    }

    auto sortStage = dynamic_cast<DocumentSourceSort*>(optItr->get());
    if (!sortStage || !isSortOnVectorSearchMeta(sortStage->getSortKeyPattern())) {
        // The next stage is not a $sort on 'vectorSearchScore'; optimization not possible.
        return {itr, false};
    }

    // A $sort stage has been found directly after this stage.
    // $vectorSearch results are always sorted by 'vectorSearchScore', so a $sort on
    // 'vectorSearchScore' is redundant and can safely be removed.
    //
    // Erase by position rather than by value: this touches only the $sort node, avoids scanning
    // the whole pipeline, and does not hand the container a reference to the element being
    // destroyed.
    container->erase(optItr);

    // Return the same iterator in case there are other optimizations to still be applied.
    return {itr, true};
}
Pipeline::SourceContainer::iterator DocumentSourceVectorSearch::doOptimizeAt(
Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container) {
// Attempt to remove a $sort on metadata after this $vectorSearch stage.
{
const auto&& [returnItr, optimizationSucceeded] =
_attemptSortAfterVectorSearchOptimization(itr, container);
if (optimizationSucceeded) {
return returnItr;
}
}
auto stageItr = std::next(itr);
// Only attempt to get the limit from the query if there are further stages in the pipeline.
if (stageItr != container->end()) {

View File

@ -121,6 +121,23 @@ private:
// Initialize metrics related to the $vectorSearch stage on the OpDebug object.
void initializeOpDebugVectorSearchMetrics();
/**
 * Attempts a pipeline optimization that removes a $sort stage that comes after the output of
 * mongot, if the resulting documents from mongot are sorted by the same criteria as the
 * $sort ('vectorSearchScore').
 *
 * This optimization only applies to cases where the $sort comes directly after this stage.
 * TODO SERVER-96068 generalize this optimization to cases where any number of stages that
 * preserve sort order come between this stage and the sort.
 *
 * 'itr' is the position of this stage within 'container'.
 *
 * Returns a pair of the iterator to return to the optimizer, and a bool of whether or not the
 * optimization was successful. If optimization was successful, the container will be modified
 * appropriately.
 */
std::pair<Pipeline::SourceContainer::iterator, bool> _attemptSortAfterVectorSearchOptimization(
Pipeline::SourceContainer::iterator itr, Pipeline::SourceContainer* container);
std::unique_ptr<MatchExpression> _filterExpr;
std::shared_ptr<executor::TaskExecutor> _taskExecutor;

View File

@ -198,4 +198,31 @@ bool SortPattern::isExtensionOf(const SortPattern& other) const {
}
return true;
}
bool isSortOnSingleMetaField(const SortPattern& sortPattern,
QueryMetadataBitSet metadataToConsider) {
// Exactly 1 expression in the sort pattern is needed.
if (sortPattern.begin() == sortPattern.end() ||
std::next(sortPattern.begin()) != sortPattern.end()) {
// 0 parts, or more than 1 part.
return false;
}
const auto& firstAndOnlyPart = *sortPattern.begin();
if (auto* expr = firstAndOnlyPart.expression.get()) {
if (auto metaExpr = dynamic_cast<ExpressionMeta*>(expr)) {
if (metadataToConsider.none()) {
// Any metadata field.
return true;
}
for (std::size_t i = 1; i < DocumentMetadataFields::kNumFields; ++i) {
if (metadataToConsider[i] &&
metaExpr->getMetaType() == static_cast<DocumentMetadataFields::MetaType>(i)) {
return true;
}
}
return false;
}
}
return false;
}
} // namespace mongo

View File

@ -172,4 +172,17 @@ private:
// The set of paths on which we're sorting.
OrderedPathSet _paths;
};
/**
 * Returns true if 'sortPattern' represents a sort pattern on a single metadata field like:
 * {score: {$meta: "searchScore"}}.
 *
 * The sort clause must be on exactly one field, i.e. {score: {$meta: "searchScore"}, _id: 1}
 * will return false, as will an empty sort pattern.
 *
 * 'metadataToConsider' is a bitset selecting which metadata fields count as a match for the
 * sort. If the bitset is empty (the default), a sort on any metadata field is accepted.
 */
bool isSortOnSingleMetaField(const SortPattern& sortPattern,
QueryMetadataBitSet metadataToConsider = QueryMetadataBitSet{});
} // namespace mongo

View File

@ -36,11 +36,13 @@
#include "mongo/base/string_data.h"
#include "mongo/bson/json.h"
#include "mongo/db/exec/document_value/document.h"
#include "mongo/db/exec/document_value/document_metadata_fields.h"
#include "mongo/db/exec/document_value/document_value_test_util.h"
#include "mongo/db/namespace_string.h"
#include "mongo/db/pipeline/expression.h"
#include "mongo/db/pipeline/expression_context_for_test.h"
#include "mongo/db/query/sort_pattern.h"
#include "mongo/idl/server_parameter_test_util.h"
#include "mongo/unittest/assert.h"
#include "mongo/unittest/framework.h"
@ -125,5 +127,63 @@ TEST(SortStageDefaultTest, WrongSortKeyDefinition) {
ASSERT_THROWS_CODE(SortPattern(std::move(sortKeys)), AssertionException, 7472501);
}
// Exercises the stateless helper 'isSortOnSingleMetaField()' with and without an explicit set of
// metadata fields to consider.
TEST(IsSortOnSingleMetaFieldTest, TestingIsSortOnSingleMetaFieldFn) {
    RAIIServerParameterControllerForTest searchHybridScoringPrerequisitesController(
        "featureFlagSearchHybridScoringPrerequisites", true);
    auto expCtx = getExpCtx();

    // Builds a SortPattern from its JSON spelling.
    auto parseSort = [&](const char* json) {
        return SortPattern(fromjson(json), expCtx);
    };

    // Metadata selections handed to the function under test.
    const QueryMetadataBitSet vectorScoreOnly(
        1 << DocumentMetadataFields::MetaType::kVectorSearchScore);
    const QueryMetadataBitSet searchOrVectorScore(
        (1 << DocumentMetadataFields::MetaType::kSearchScore) |
        (1 << DocumentMetadataFields::MetaType::kVectorSearchScore));
    const QueryMetadataBitSet searchScoreOrScore(
        (1 << DocumentMetadataFields::MetaType::kSearchScore) |
        (1 << DocumentMetadataFields::MetaType::kScore));

    // An empty SortPattern is rejected.
    ASSERT_FALSE(isSortOnSingleMetaField(parseSort("{}")));

    // A single field that is not a metadata field is rejected.
    ASSERT_FALSE(isSortOnSingleMetaField(parseSort("{a: 1}")));

    // Multiple fields are rejected.
    ASSERT_FALSE(isSortOnSingleMetaField(parseSort("{a: 1, b: 1}")));

    // A single metadata field with no QueryMetadataBitSet specified is accepted for any valid
    // metadata.
    ASSERT_TRUE(isSortOnSingleMetaField(parseSort("{score: {$meta: 'vectorSearchScore'}}")));

    // An invalid metadata type throws.
    ASSERT_THROWS_CODE(
        isSortOnSingleMetaField(parseSort("{score: {$meta: 'notRealMetadata'}}")),
        DBException,
        31138);

    // Valid metadata combined with another field is rejected.
    ASSERT_FALSE(
        isSortOnSingleMetaField(parseSort("{score: {$meta: 'vectorSearchScore'}, a: 1}")));

    // Explicit metadata to consider that matches the metadata in the SortPattern is accepted.
    ASSERT_TRUE(isSortOnSingleMetaField(parseSort("{score: {$meta: 'vectorSearchScore'}}"),
                                        vectorScoreOnly));

    // Explicit metadata to consider that does not match the metadata in the SortPattern is
    // rejected.
    ASSERT_FALSE(
        isSortOnSingleMetaField(parseSort("{score: {$meta: 'searchScore'}}"), vectorScoreOnly));

    // Multiple metadata to consider, one of them matching the metadata in the SortPattern, is
    // accepted.
    ASSERT_TRUE(isSortOnSingleMetaField(parseSort("{score: {$meta: 'vectorSearchScore'}}"),
                                        searchOrVectorScore));

    // Multiple metadata to consider, none of them matching the metadata in the SortPattern, is
    // rejected.
    ASSERT_FALSE(isSortOnSingleMetaField(parseSort("{score: {$meta: 'vectorSearchScore'}}"),
                                         searchScoreOrScore));
}
} // namespace
} // namespace mongo