mirror of
https://github.com/mongodb/mongo.git
synced 2024-12-01 09:32:32 +01:00
SERVER-74361 Add $median accumulator that desugars to $percentile
This commit is contained in:
parent
52eeec5650
commit
c09b7fc88f
83
jstests/aggregation/accumulators/median_approx.js
Normal file
83
jstests/aggregation/accumulators/median_approx.js
Normal file
@ -0,0 +1,83 @@
|
||||
/**
|
||||
* Tests that the approximate median accumulator semantics match the percentile semantics with the
|
||||
* field 'p':[0.5].
|
||||
* @tags: [
|
||||
* featureFlagApproxPercentiles,
|
||||
* # sharded collections aren't supported yet.
|
||||
* assumes_unsharded_collection,
|
||||
* ]
|
||||
*/
|
||||
(function() {
|
||||
"use strict";
|
||||
|
||||
load("jstests/aggregation/extras/utils.js");
|
||||
|
||||
const coll = db[jsTestName()];
|
||||
|
||||
/**
|
||||
* Tests for correctness without grouping. Confirms that $median computes the expected value and
|
||||
* also checks that $percentile for p=0.5 computes the same value, because $median is supposed to be
|
||||
* completely equivalent to the latter (e.g. we should not optimize $median independently of
|
||||
* $percentile).
|
||||
*/
|
||||
function testWithSingleGroup({docs, medianSpec, expectedResult, msg}) {
    // Start from a collection that holds exactly the documents of this test case.
    coll.drop();
    coll.insertMany(docs);

    // Build the $percentile specification that $median is expected to be equivalent to: same
    // 'input' and 'algorithm', plus p: [0.5].
    const {input, algorithm} = medianSpec["$median"];
    const percentileSpec = {$percentile: {input: input, algorithm: algorithm, p: [0.5]}};

    // Everything lands in one group because the group key is the constant null.
    const runInSingleGroup = (accumulatorSpec) =>
        coll.aggregate([{$group: {_id: null, p: accumulatorSpec}}]).toArray();
    const medianRes = runInSingleGroup(medianSpec);
    const percentileRes = runInSingleGroup(percentileSpec);

    const failureMsg = msg + ` result: ${tojson(medianRes)}`;
    assert.eq(expectedResult, medianRes[0].p, failureMsg);

    // $percentile yields an array that has to be unwrapped before comparing, except when all the
    // data is non-numeric: then the result is a plain null, which cannot be indexed into.
    const percentileValue = percentileRes[0].p;
    const percentileMedian = percentileValue ? percentileValue[0] : percentileValue;
    assert.eq(percentileMedian, medianRes[0].p, failureMsg);
}
|
||||
|
||||
// Only the numeric values 0, 1 and 2 are expected to take part in the computation; the string and
// the document without an 'x' field are not.
testWithSingleGroup({
    msg: "Non-numeric data should be ignored",
    docs: [{x: 0}, {x: "non-numeric"}, {x: 1}, {no_x: 0}, {x: 2}],
    medianSpec: {$median: {input: "$x", algorithm: "approximate"}},
    expectedResult: 1
});

// No document carries a numeric 'x', so the expected median is null.
testWithSingleGroup({
    msg: "Median of completely non-numeric data.",
    docs: [{x: "non-numeric"}, {non_x: 1}],
    medianSpec: {$median: {input: "$x", algorithm: "approximate"}},
    expectedResult: null
});
|
||||
|
||||
/**
 * Tests for correctness with grouping on the 'k' field. Confirms that $median computes the expected
 * value for every group and that $percentile with p=[0.5] computes the same value for that group.
 *
 * 'expectedResult' is an array holding one expected median per group, ordered by ascending group
 * key.
 */
function testWithMultipleGroups({docs, medianSpec, expectedResult, msg}) {
    coll.drop();
    coll.insertMany(docs);

    const medianArgs = medianSpec["$median"];
    const percentileSpec = {
        $percentile: {input: medianArgs.input, algorithm: medianArgs.algorithm, p: [0.5]}
    };

    // Group by 'k' so that several groups are actually produced, and sort by the group key so that
    // the i-th result lines up with the i-th entry of 'expectedResult' ($group itself gives no
    // ordering guarantee).
    const medianRes =
        coll.aggregate([{$group: {_id: "$k", p: medianSpec}}, {$sort: {_id: 1}}]).toArray();
    const percentileRes =
        coll.aggregate([{$group: {_id: "$k", p: percentileSpec}}, {$sort: {_id: 1}}]).toArray();

    // Make sure no expected group is silently left unchecked.
    assert.eq(expectedResult.length, medianRes.length, msg + ` result: ${tojson(medianRes)}`);
    assert.eq(medianRes.length, percentileRes.length);
    for (let i = 0; i < medianRes.length; i++) {
        assert.eq(expectedResult[i], medianRes[i].p, msg + ` result: ${tojson(medianRes)}`);

        // A group without any numeric data yields null for $percentile rather than an array, and
        // null cannot be indexed into.
        const percentileValue = percentileRes[i].p ? percentileRes[i].p[0] : percentileRes[i].p;
        assert.eq(percentileValue, medianRes[i].p, msg + ` result: ${tojson(medianRes)}`);
    }
}
|
||||
|
||||
// Three values of 'k': k=0 and k=1 each hold two numeric 'x' values (k=0 additionally holds a
// string), while the only k=2 document has no 'x' at all.
testWithMultipleGroups({
    msg: "Median of multiple groups",
    docs: [{k: 0, x: 2}, {k: 0, x: 1}, {k: 1, x: 2}, {k: 2}, {k: 0, x: "str"}, {k: 1, x: 0}],
    medianSpec: {$median: {input: "$x", algorithm: "approximate"}},
    expectedResult: [/* k:0 */ 1, /* k:1 */ 0, /* k:2 */ null]
});
|
||||
})();
|
@ -61,6 +61,22 @@ assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: 42}},
|
||||
assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: "fancy"}},
|
||||
"Should fail if 'algorithm' isn't one of _predefined_ strings");
|
||||
|
||||
/**
|
||||
* Tests for $median. $median desugars to $percentile with the field p:[0.5] added, and therefore
|
||||
* has similar syntax to $percentile.
|
||||
*/
|
||||
|
||||
assertInvalidSyntax({$median: {p: [0.5], input: "$x", algorithm: "approximate"}},
|
||||
"Should fail if 'p' is defined");
|
||||
|
||||
assertInvalidSyntax({$median: {algorithm: "approximate"}},
|
||||
"Should fail if $median is missing 'input' field");
|
||||
|
||||
assertInvalidSyntax({$median: {input: "$x"}},
|
||||
"Should fail if $median is missing 'algorithm' field");
|
||||
|
||||
assertInvalidSyntax({$median: {input: "$x", algorithm: "approximate", extras: 42}},
|
||||
"Should fail if $median contains an unexpected field");
|
||||
/**
|
||||
* Test that valid $percentile specifications are accepted. The results, i.e. semantics, are tested
|
||||
* elsewhere and would cover all of the cases below, we are providing them here nonetheless for
|
||||
@ -84,6 +100,13 @@ assertValidSyntax(
|
||||
assertValidSyntax({$percentile: {p: [0.5, 0.9], input: "x", algorithm: "approximate"}},
|
||||
"Non-numeric inputs should be gracefully ignored");
|
||||
|
||||
/**
|
||||
* Tests for $median. $median desugars to $percentile with the field p:[0.5] added.
|
||||
*/
|
||||
|
||||
assertValidSyntax({$median: {input: "$x", algorithm: "approximate"}},
|
||||
"Simple base case for $median.");
|
||||
|
||||
/**
|
||||
* Test that the "arrayness" of the result matches the "arrayness" of the specification.
|
||||
*/
|
||||
|
@ -38,6 +38,11 @@ REGISTER_ACCUMULATOR_WITH_FEATURE_FLAG(percentile,
|
||||
AccumulatorPercentile::parseArgs,
|
||||
feature_flags::gFeatureFlagApproxPercentiles);
|
||||
|
||||
|
||||
REGISTER_ACCUMULATOR_WITH_FEATURE_FLAG(median,
|
||||
AccumulatorMedian::parseArgs,
|
||||
feature_flags::gFeatureFlagApproxPercentiles);
|
||||
|
||||
Status AccumulatorPercentile::validatePercentileArg(const std::vector<double>& pv) {
|
||||
if (pv.empty()) {
|
||||
return {ErrorCodes::BadValue, "'p' cannot be an empty array"};
|
||||
@ -143,4 +148,54 @@ intrusive_ptr<AccumulatorState> AccumulatorPercentile::create(
|
||||
std::unique_ptr<PercentileAlgorithm> algo) {
|
||||
return new AccumulatorPercentile(expCtx, ps, std::move(algo));
|
||||
}
|
||||
|
||||
/**
 * Parses the specification of a $median accumulator and builds the matching
 * AccumulationExpression.
 *
 * 'elem' must be an object (otherwise this fails with error code 7436100); the object itself is
 * parsed by the IDL-generated AccumulatorMedianSpec parser. The 'input' field of the specification
 * is parsed as an operand expression and becomes the accumulator's argument.
 */
AccumulationExpression AccumulatorMedian::parseArgs(ExpressionContext* const expCtx,
                                                    BSONElement elem,
                                                    VariablesParseState vps) {
    // Flag the expression context as not SBE-group-compatible.
    expCtx->sbeGroupCompatible = false;

    uassert(7436100,
            str::stream() << "specification must be an object; found " << elem,
            elem.type() == BSONType::Object);

    auto spec = AccumulatorMedianSpec::parse(IDLParserContext(kName), elem.Obj());
    boost::intrusive_ptr<Expression> input =
        Expression::parseOperand(expCtx, spec.getInput().getElement(), vps);

    auto factory = [expCtx] {
        // Temporary implementation! To be replaced based on the user's choice of algorithm.
        auto algo = PercentileAlgorithm::createDiscreteSortAndRank();

        return AccumulatorMedian::create(expCtx, std::move(algo));
    };

    // Report the accumulator under its real operator name. A hand-typed literal is easy to get
    // wrong (e.g. a stray space), so reuse the class constant.
    return {ExpressionConstant::create(expCtx, Value(BSONNULL)) /*initializer*/,
            std::move(input) /*argument*/,
            std::move(factory),
            AccumulatorMedian::kName /*name*/};
}
|
||||
|
||||
/**
 * Constructs a median accumulator as a percentile accumulator fixed to the single percentile 0.5,
 * computed by the given algorithm.
 */
AccumulatorMedian::AccumulatorMedian(ExpressionContext* expCtx,
                                     std::unique_ptr<PercentileAlgorithm> algo)
    : AccumulatorPercentile(expCtx, {0.5} /* ps */, std::move(algo)) {}
|
||||
|
||||
/**
 * Allocates a new median accumulator that takes ownership of 'algo' and hands it back as a
 * reference-counted AccumulatorState.
 */
intrusive_ptr<AccumulatorState> AccumulatorMedian::create(
    ExpressionContext* expCtx, std::unique_ptr<PercentileAlgorithm> algo) {
    intrusive_ptr<AccumulatorState> median{new AccumulatorMedian(expCtx, std::move(algo))};
    return median;
}
|
||||
|
||||
/**
 * Returns the median as a single value. The base class reports its result as a one-element array,
 * which is unwrapped here; a null result from the base class is passed through untouched.
 */
Value AccumulatorMedian::getValue(bool toBeMerged) {
    auto percentiles = AccumulatorPercentile::getValue(toBeMerged);

    if (percentiles.getType() != jstNULL) {
        // Exactly one percentile (0.5) was requested, so exactly one value must come back.
        tassert(7436101,
                "the percentile algorithm for median must return a single result.",
                percentiles.getArrayLength() == 1);

        return Value(percentiles.getArray().front());
    }

    // Nothing to unwrap.
    return percentiles;
}
|
||||
} // namespace mongo
|
||||
|
@ -40,7 +40,7 @@ namespace mongo {
|
||||
class AccumulatorPercentile : public AccumulatorState {
|
||||
public:
|
||||
static constexpr auto kName = "$percentile"_sd;
|
||||
const char* getOpName() const final {
|
||||
const char* getOpName() const {
|
||||
return kName.rawData();
|
||||
}
|
||||
|
||||
@ -68,7 +68,7 @@ public:
|
||||
* Ingressing values and computing the requested percentiles.
|
||||
*/
|
||||
void processInternal(const Value& input, bool merging) final;
|
||||
Value getValue(bool toBeMerged) final;
|
||||
Value getValue(bool toBeMerged);
|
||||
|
||||
/**
|
||||
* Other infra for the accumulators.
|
||||
@ -80,4 +80,30 @@ private:
|
||||
std::unique_ptr<PercentileAlgorithm> _algo;
|
||||
};
|
||||
|
||||
/*
|
||||
* Accumulator for computing $median. $median has the same semantics as $percentile with the 'p'
|
||||
* field set to [0.5].
|
||||
*/
|
||||
class AccumulatorMedian : public AccumulatorPercentile {
|
||||
public:
|
||||
static constexpr auto kName = "$median"_sd;
|
||||
const char* getOpName() const final {
|
||||
return kName.rawData();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parsing and creating the accumulator.
|
||||
*/
|
||||
static AccumulationExpression parseArgs(ExpressionContext* expCtx,
|
||||
BSONElement elem,
|
||||
VariablesParseState vps);
|
||||
|
||||
static boost::intrusive_ptr<AccumulatorState> create(ExpressionContext* expCtx,
|
||||
std::unique_ptr<PercentileAlgorithm> algo);
|
||||
|
||||
AccumulatorMedian(ExpressionContext* expCtx, std::unique_ptr<PercentileAlgorithm> algo);
|
||||
|
||||
Value getValue(bool toBeMerged) final;
|
||||
};
|
||||
|
||||
} // namespace mongo
|
||||
|
@ -58,3 +58,15 @@ structs:
|
||||
description: "The type of algorithm we will use to find the percentile."
|
||||
type: PercentileAlgorithmType
|
||||
optional: false
|
||||
# Arguments accepted by the $median accumulator. The struct is strict, so any field other than
# 'input' and 'algorithm' (including 'p') is rejected.
AccumulatorMedianSpec:
    description: "Specification for the $median accumulator."
    strict: true
    fields:
        input:
            description: "The expression to find the median of."
            type: IDLAnyType
            optional: false
        algorithm:
            description: "The type of algorithm we will use to find the median."
            type: PercentileAlgorithmType
            optional: false
|
||||
|
Loading…
Reference in New Issue
Block a user