0
0
mirror of https://github.com/mongodb/mongo.git synced 2024-12-01 09:32:32 +01:00

SERVER-74361 Add $median accumulator that desugars to $percentile

This commit is contained in:
Gil Alon 2023-03-14 17:22:05 +00:00 committed by Evergreen Agent
parent 52eeec5650
commit c09b7fc88f
5 changed files with 201 additions and 2 deletions

View File

@ -0,0 +1,83 @@
/**
* Tests that the semantics of the approximate $median accumulator match those of $percentile
* with the field 'p' set to [0.5].
* @tags: [
* featureFlagApproxPercentiles,
* # sharded collections aren't supported yet.
* assumes_unsharded_collection,
* ]
*/
(function() {
"use strict";
load("jstests/aggregation/extras/utils.js");
const coll = db[jsTestName()];
/**
* Tests for correctness without grouping. Confirms that $median computes the expected value and
* also checks that $percentile for p=0.5 computes the same value, because $median is supposed to be
* completely equivalent to the latter (e.g. we should not optimize $median independently of
* $percentile).
*/
function testWithSingleGroup({docs, medianSpec, expectedResult, msg}) {
    coll.drop();
    coll.insertMany(docs);

    // Build the $percentile spec that $median is expected to be equivalent to.
    const {input, algorithm} = medianSpec["$median"];
    const equivalentPercentileSpec = {
        $percentile: {input: input, algorithm: algorithm, p: [0.5]}
    };

    // Runs a single-group $group with the given accumulator stored under 'p'.
    const runSingleGroup = (accumulatorSpec) =>
        coll.aggregate([{$group: {_id: null, p: accumulatorSpec}}]).toArray();

    const medianRes = runSingleGroup(medianSpec);
    const percentileRes = runSingleGroup(equivalentPercentileSpec);

    assert.eq(expectedResult, medianRes[0].p, msg + ` result: ${tojson(medianRes)}`);

    // $percentile yields an array, unless all the data is non-numeric, in which case the result is
    // just null and cannot be indexed into.
    let percentileValue = percentileRes[0].p;
    if (percentileValue) {
        percentileValue = percentileValue[0];
    }
    assert.eq(percentileValue, medianRes[0].p, msg + ` result: ${tojson(medianRes)}`);
}
// Mixed input: numeric values of 'x' alongside a string value and a document without 'x'.
testWithSingleGroup({
docs: [{x: 0}, {x: "non-numeric"}, {x: 1}, {no_x: 0}, {x: 2}],
medianSpec: {$median: {input: "$x", algorithm: "approximate"}},
expectedResult: 1,
msg: "Non-numeric data should be ignored"
});
// No document carries a numeric 'x', so the expected median is null.
testWithSingleGroup({
docs: [{x: "non-numeric"}, {non_x: 1}],
medianSpec: {$median: {input: "$x", algorithm: "approximate"}},
expectedResult: null,
msg: "Median of completely non-numeric data."
});
/**
 * Tests for correctness with grouping on the field 'k'. Confirms that $median computes the
 * expected value for every group and that $percentile with p=[0.5] computes the same value.
 *
 * 'expectedResult' must list one expected median per distinct value of 'k', in ascending order of
 * 'k' (the results are sorted by the group key before being compared).
 */
function testWithMultipleGroups({docs, medianSpec, expectedResult, msg}) {
    coll.drop();
    coll.insertMany(docs);

    const medianArgs = medianSpec["$median"];
    const percentileSpec = {
        $percentile: {input: medianArgs.input, algorithm: medianArgs.algorithm, p: [0.5]}
    };

    // Group on 'k' (not on a constant) so that several groups are actually produced, and sort by
    // the group key so that the results line up with 'expectedResult' deterministically.
    const medianRes =
        coll.aggregate([{$group: {_id: "$k", p: medianSpec}}, {$sort: {_id: 1}}]).toArray();
    const percentileRes =
        coll.aggregate([{$group: {_id: "$k", p: percentileSpec}}, {$sort: {_id: 1}}]).toArray();

    assert.eq(expectedResult.length, medianRes.length, msg + ` result: ${tojson(medianRes)}`);
    assert.eq(medianRes.length, percentileRes.length);
    for (let i = 0; i < medianRes.length; i++) {
        assert.eq(expectedResult[i], medianRes[i].p, msg + ` result: ${tojson(medianRes)}`);

        // A group without numeric data yields null for $percentile, which cannot be indexed into.
        const percentileValue = percentileRes[i].p ? percentileRes[i].p[0] : percentileRes[i].p;
        assert.eq(percentileValue,
                  medianRes[i].p,
                  msg + ` result: ${tojson(medianRes)}, percentile: ${tojson(percentileRes)}`);
    }
}
// Documents carry a key 'k' in {0, 1, 2}; 'expectedResult' lists one expected median per key value
// as annotated inline. The only document with k:2 has no 'x' field.
testWithMultipleGroups({
docs: [{k: 0, x: 2}, {k: 0, x: 1}, {k: 1, x: 2}, {k: 2}, {k: 0, x: "str"}, {k: 1, x: 0}],
medianSpec: {$median: {input: "$x", algorithm: "approximate"}},
expectedResult: [/* k:0 */ 1, /* k:1 */ 0, /* k:2 */ null],
msg: "Median of multiple groups"
});
})();

View File

@ -61,6 +61,22 @@ assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: 42}},
// 'algorithm' is a string here, but not a recognized one.
assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: "fancy"}},
"Should fail if 'algorithm' isn't one of _predefined_ strings");
/**
* Tests for $median. $median desugars to $percentile with the field p:[0.5] added, and therefore
* has similar syntax to $percentile.
*/
// 'p' is implied by $median, so specifying it explicitly must be rejected.
assertInvalidSyntax({$median: {p: [0.5], input: "$x", algorithm: "approximate"}},
"Should fail if 'p' is defined");
// Both 'input' and 'algorithm' are required.
assertInvalidSyntax({$median: {algorithm: "approximate"}},
"Should fail if $median is missing 'input' field");
assertInvalidSyntax({$median: {input: "$x"}},
"Should fail if $median is missing 'algorithm' field");
// Fields other than 'input' and 'algorithm' must be rejected.
assertInvalidSyntax({$median: {input: "$x", algorithm: "approximate", extras: 42}},
"Should fail if $median contains an unexpected field");
/**
* Test that valid $percentile specifications are accepted. The results, i.e. semantics, are tested
* elsewhere and would cover all of the cases below, we are providing them here nonetheless for
@ -84,6 +100,13 @@ assertValidSyntax(
// Here 'input' is the string "x" (no '$' prefix), i.e. not a field path.
assertValidSyntax({$percentile: {p: [0.5, 0.9], input: "x", algorithm: "approximate"}},
"Non-numeric inputs should be gracefully ignored");
/**
* Tests for $median. $median desugars to $percentile with the field p:[0.5] added.
*/
// Minimal valid $median specification: only 'input' and 'algorithm', no 'p'.
assertValidSyntax({$median: {input: "$x", algorithm: "approximate"}},
"Simple base case for $median.");
/**
* Test that the "arrayness" of the result matches the "arrayness" of the specification.
*/

View File

@ -38,6 +38,11 @@ REGISTER_ACCUMULATOR_WITH_FEATURE_FLAG(percentile,
AccumulatorPercentile::parseArgs,
feature_flags::gFeatureFlagApproxPercentiles);
// Registers the 'median' accumulator with AccumulatorMedian::parseArgs as its parser, using the
// same feature flag (gFeatureFlagApproxPercentiles) as the 'percentile' registration.
REGISTER_ACCUMULATOR_WITH_FEATURE_FLAG(median,
AccumulatorMedian::parseArgs,
feature_flags::gFeatureFlagApproxPercentiles);
Status AccumulatorPercentile::validatePercentileArg(const std::vector<double>& pv) {
if (pv.empty()) {
return {ErrorCodes::BadValue, "'p' cannot be an empty array"};
@ -143,4 +148,54 @@ intrusive_ptr<AccumulatorState> AccumulatorPercentile::create(
std::unique_ptr<PercentileAlgorithm> algo) {
return new AccumulatorPercentile(expCtx, ps, std::move(algo));
}
/**
 * Parses a $median specification into an AccumulationExpression.
 *
 * 'elem' must be an object (otherwise a uassert with code 7436100 is raised); it is parsed with
 * the IDL-generated AccumulatorMedianSpec. The 'input' field is parsed as an expression operand
 * and becomes the accumulator's argument.
 */
AccumulationExpression AccumulatorMedian::parseArgs(ExpressionContext* const expCtx,
                                                    BSONElement elem,
                                                    VariablesParseState vps) {
    // Clear the SBE-compatibility flag for $group on this expression context.
    expCtx->sbeGroupCompatible = false;

    uassert(7436100,
            str::stream() << "specification must be an object; found " << elem,
            elem.type() == BSONType::Object);

    auto spec = AccumulatorMedianSpec::parse(IDLParserContext(kName), elem.Obj());
    boost::intrusive_ptr<Expression> input =
        Expression::parseOperand(expCtx, spec.getInput().getElement(), vps);

    auto factory = [expCtx] {
        // Temporary implementation! To be replaced based on the user's choice of algorithm.
        auto algo = PercentileAlgorithm::createDiscreteSortAndRank();
        return AccumulatorMedian::create(expCtx, std::move(algo));
    };

    // Use kName for the accumulator name so that it always matches the name the accumulator is
    // parsed under ("$median"), rather than a separately spelled literal.
    return {ExpressionConstant::create(expCtx, Value(BSONNULL)) /*initializer*/,
            std::move(input) /*argument*/,
            std::move(factory),
            kName /*name*/};
}
/**
 * Constructs a median accumulator by delegating to AccumulatorPercentile with the single
 * percentile 0.5 and the given algorithm.
 */
AccumulatorMedian::AccumulatorMedian(ExpressionContext* expCtx,
                                     std::unique_ptr<PercentileAlgorithm> algo)
    : AccumulatorPercentile(expCtx, {0.5} /* ps */, std::move(algo)) {}
/**
 * Allocates a new AccumulatorMedian that takes ownership of 'algo' and returns it as an
 * AccumulatorState.
 */
intrusive_ptr<AccumulatorState> AccumulatorMedian::create(
    ExpressionContext* expCtx, std::unique_ptr<PercentileAlgorithm> algo) {
    intrusive_ptr<AccumulatorState> accumulator(new AccumulatorMedian(expCtx, std::move(algo)));
    return accumulator;
}
/**
 * Returns the median as a single value. The base class produces either null or an array of
 * percentiles; for $median that array must hold exactly one element, which is unwrapped here.
 */
Value AccumulatorMedian::getValue(bool toBeMerged) {
    auto percentiles = AccumulatorPercentile::getValue(toBeMerged);

    if (percentiles.getType() != jstNULL) {
        // Only p=0.5 was requested, so anything but a one-element array is an internal error.
        tassert(7436101,
                "the percentile algorithm for median must return a single result.",
                percentiles.getArrayLength() == 1);
        return Value(percentiles.getArray().front());
    }

    // Null is passed through unchanged.
    return percentiles;
}
} // namespace mongo

View File

@ -40,7 +40,7 @@ namespace mongo {
class AccumulatorPercentile : public AccumulatorState {
public:
static constexpr auto kName = "$percentile"_sd;
const char* getOpName() const final {
const char* getOpName() const {
return kName.rawData();
}
@ -68,7 +68,7 @@ public:
* Ingressing values and computing the requested percentiles.
*/
void processInternal(const Value& input, bool merging) final;
Value getValue(bool toBeMerged) final;
Value getValue(bool toBeMerged);
/**
* Other infra for the accumulators.
@ -80,4 +80,30 @@ private:
std::unique_ptr<PercentileAlgorithm> _algo;
};
/**
 * Accumulator for computing $median. $median has the same semantics as $percentile with the 'p'
 * field set to [0.5], except that the result is a single value rather than a one-element array.
 */
class AccumulatorMedian : public AccumulatorPercentile {
public:
    static constexpr auto kName = "$median"_sd;

    AccumulatorMedian(ExpressionContext* expCtx, std::unique_ptr<PercentileAlgorithm> algo);

    /**
     * Parses a $median specification into an AccumulationExpression.
     */
    static AccumulationExpression parseArgs(ExpressionContext* expCtx,
                                            BSONElement elem,
                                            VariablesParseState vps);

    /**
     * Creates a new median accumulator that uses 'algo' to compute the result.
     */
    static boost::intrusive_ptr<AccumulatorState> create(ExpressionContext* expCtx,
                                                         std::unique_ptr<PercentileAlgorithm> algo);

    /**
     * Returns the operator name, "$median".
     */
    const char* getOpName() const final {
        return kName.rawData();
    }

    /**
     * Returns the median as a single value instead of the array produced by the base class.
     */
    Value getValue(bool toBeMerged) final;
};
} // namespace mongo

View File

@ -58,3 +58,15 @@ structs:
description: "The type of algorithm we will use to find the percentile."
type: PercentileAlgorithmType
optional: false
AccumulatorMedianSpec:
    description: "Specification for the $median accumulator."
    strict: true
    fields:
        input:
            description: "The expression to find the median of."
            type: IDLAnyType
            optional: false
        algorithm:
            description: "The type of algorithm we will use to find the median."
            type: PercentileAlgorithmType
            optional: false