SERVER-74361 Add $median accumulator that desugars to $percentile

2024-12-01 09:32:32 +01:00 · 2023-03-14 17:22:05 +00:00 · 2023-03-14 17:22:05 +00:00 · c09b7fc88f
commit c09b7fc88f
parent 52eeec5650
5 changed files with 201 additions and 2 deletions
--- a/jstests/aggregation/accumulators/median_approx.js
+++ b/jstests/aggregation/accumulators/median_approx.js
@ -0,0 +1,83 @@
+/**
+ * Tests that the approximate median accumulator semantics matches the percentile semantics with the
+ * field 'p':[0.5].
+ * @tags: [
+ *   featureFlagApproxPercentiles,
+ *   # sharded collections aren't supported yet.
+ *   assumes_unsharded_collection,
+ * ]
+ */
+(function() {
+"use strict";
+
+load("jstests/aggregation/extras/utils.js");
+
+const coll = db[jsTestName()];
+
+/**
+ * Tests for correctness without grouping. Confirms that $median computes the expected value and
+ * also checks that $percentile for p=0.5 computes the same value, because $median is supposed to be
+ * completely equivalent to the latter (e.g. we should not optimize $median independently of
+ * $percentile).
+ */
+function testWithSingleGroup({docs, medianSpec, expectedResult, msg}) {
+    coll.drop();
+    coll.insertMany(docs);
+
+    let medianArgs = medianSpec["$median"];
+    const percentileSpec = {
+        $percentile: {input: medianArgs.input, algorithm: medianArgs.algorithm, p: [0.5]}
+    };
+
+    const medianRes = coll.aggregate([{$group: {_id: null, p: medianSpec}}]).toArray();
+    const percentileRes = coll.aggregate([{$group: {_id: null, p: percentileSpec}}]).toArray();
+
+    assert.eq(expectedResult, medianRes[0].p, msg + ` result: ${tojson(medianRes)}`);
+
+    // If all the data is non-numeric then the expected result is just null, and therefore cannot be
+    // indexed into.
+    assert.eq((percentileRes[0].p ? percentileRes[0].p[0] : percentileRes[0].p),
+              medianRes[0].p,
+              msg + ` result: ${tojson(medianRes)}`);
+}
+
+testWithSingleGroup({
+    docs: [{x: 0}, {x: "non-numeric"}, {x: 1}, {no_x: 0}, {x: 2}],
+    medianSpec: {$median: {input: "$x", algorithm: "approximate"}},
+    expectedResult: 1,
+    msg: "Non-numeric data should be ignored"
+});
+
+testWithSingleGroup({
+    docs: [{x: "non-numeric"}, {non_x: 1}],
+    medianSpec: {$median: {input: "$x", algorithm: "approximate"}},
+    expectedResult: null,
+    msg: "Median of completely non-numeric data."
+});
+
+function testWithMultipleGroups({docs, medianSpec, expectedResult, msg}) {
+    coll.drop();
+    coll.insertMany(docs);
+
+    let medianArgs = medianSpec["$median"];
+    const percentileSpec = {
+        $percentile: {input: medianArgs.input, algorithm: medianArgs.algorithm, p: [0.5]}
+    };
+
+    const medianRes = coll.aggregate([{$group: {_id: null, p: medianSpec}}]).toArray();
+    const percentileRes = coll.aggregate([{$group: {_id: null, p: percentileSpec}}]).toArray();
+
+    assert.eq(medianRes.length, percentileRes.length);
+    for (let i = 0; i < medianRes.length; i++) {
+        assert.eq(expectedResult[i], medianRes[i].p, msg + ` result: ${tojson(medianRes)}`);
+        assert.eq(percentileRes[i].p[0], medianRes[i].p, msg + ` result: ${tojson(medianRes)}`);
+    }
+}
+
+testWithMultipleGroups({
+    docs: [{k: 0, x: 2}, {k: 0, x: 1}, {k: 1, x: 2}, {k: 2}, {k: 0, x: "str"}, {k: 1, x: 0}],
+    medianSpec: {$median: {input: "$x", algorithm: "approximate"}},
+    expectedResult: [/* k:0 */ 1, /* k:1 */ 0, /* k:2 */ null],
+    msg: "Median of multiple groups"
+});
+})();
--- a/jstests/aggregation/accumulators/percentiles_syntax.js
+++ b/jstests/aggregation/accumulators/percentiles_syntax.js
@ -61,6 +61,22 @@ assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: 42}},
 assertInvalidSyntax({$percentile: {p: [0.5, 0.7], input: "$x", algorithm: "fancy"}},
                    "Should fail if 'algorithm' isn't one of _predefined_ strings");

+/**
+ * Tests for $median. $median desugars to $percentile with the field p:[0.5] added, and therefore
+ * has similar syntax to $percentile.
+ */
+
+assertInvalidSyntax({$median: {p: [0.5], input: "$x", algorithm: "approximate"}},
+                    "Should fail if 'p' is defined");
+
+assertInvalidSyntax({$median: {algorithm: "approximate"}},
+                    "Should fail if $median is missing 'input' field");
+
+assertInvalidSyntax({$median: {input: "$x"}},
+                    "Should fail if $median is missing 'algorithm' field");
+
+assertInvalidSyntax({$median: {input: "$x", algorithm: "approximate", extras: 42}},
+                    "Should fail if $median contains an unexpected field");
 /**
 * Test that valid $percentile specifications are accepted. The results, i.e. semantics, are tested
 * elsewhere and would cover all of the cases below, we are providing them here nonetheless for
@ -84,6 +100,13 @@ assertValidSyntax(
 assertValidSyntax({$percentile: {p: [0.5, 0.9], input: "x", algorithm: "approximate"}},
                  "Non-numeric inputs should be gracefully ignored");

+/**
+ * Tests for $median. $median desugars to $percentile with the field p:[0.5] added.
+ */
+
+assertValidSyntax({$median: {input: "$x", algorithm: "approximate"}},
+                  "Simple base case for $median.");
+
 /**
 * Test that the "arrayness" of the result matches the "arrayness" of the specification.
 */
--- a/src/mongo/db/pipeline/accumulator_percentile.cpp
+++ b/src/mongo/db/pipeline/accumulator_percentile.cpp
@ -38,6 +38,11 @@ REGISTER_ACCUMULATOR_WITH_FEATURE_FLAG(percentile,
                                       AccumulatorPercentile::parseArgs,
                                       feature_flags::gFeatureFlagApproxPercentiles);

+
+REGISTER_ACCUMULATOR_WITH_FEATURE_FLAG(median,
+                                       AccumulatorMedian::parseArgs,
+                                       feature_flags::gFeatureFlagApproxPercentiles);
+
 Status AccumulatorPercentile::validatePercentileArg(const std::vector<double>& pv) {
    if (pv.empty()) {
        return {ErrorCodes::BadValue, "'p' cannot be an empty array"};
@ -143,4 +148,54 @@ intrusive_ptr<AccumulatorState> AccumulatorPercentile::create(
    std::unique_ptr<PercentileAlgorithm> algo) {
    return new AccumulatorPercentile(expCtx, ps, std::move(algo));
 }
+
+/**
+ * Parses the specification of a $median accumulator into an AccumulationExpression.
+ *
+ * 'elem' must be an object (uasserts with 7436100 otherwise) that conforms to the strict
+ * AccumulatorMedianSpec IDL struct. Marks the expression context as not SBE-group-compatible.
+ * The 'algorithm' field is validated by the IDL parser but is not consulted yet: the factory
+ * always creates the discrete sort-and-rank algorithm.
+ */
+AccumulationExpression AccumulatorMedian::parseArgs(ExpressionContext* const expCtx,
+                                                    BSONElement elem,
+                                                    VariablesParseState vps) {
+    expCtx->sbeGroupCompatible = false;
+
+    uassert(7436100,
+            str::stream() << "specification must be an object; found " << elem,
+            elem.type() == BSONType::Object);
+
+    auto spec = AccumulatorMedianSpec::parse(IDLParserContext(kName), elem.Obj());
+    boost::intrusive_ptr<Expression> input =
+        Expression::parseOperand(expCtx, spec.getInput().getElement(), vps);
+
+    auto factory = [expCtx] {
+        // Temporary implementation! To be replaced based on the user's choice of algorithm.
+        auto algo = PercentileAlgorithm::createDiscreteSortAndRank();
+
+        return AccumulatorMedian::create(expCtx, std::move(algo));
+    };
+
+    // The name must match the registered operator name exactly ("$median", no whitespace).
+    return {ExpressionConstant::create(expCtx, Value(BSONNULL)) /*initializer*/,
+            std::move(input) /*argument*/,
+            std::move(factory),
+            kName /*name*/};
+}
+
+// Constructs a percentile accumulator that is fixed to the single percentile 0.5.
+AccumulatorMedian::AccumulatorMedian(ExpressionContext* expCtx,
+                                     std::unique_ptr<PercentileAlgorithm> algo)
+    : AccumulatorPercentile(expCtx, {0.5} /* ps */, std::move(algo)) {}
+
+// Factory: heap-allocates a median accumulator that takes ownership of 'algo'.
+intrusive_ptr<AccumulatorState> AccumulatorMedian::create(
+    ExpressionContext* expCtx, std::unique_ptr<PercentileAlgorithm> algo) {
+    intrusive_ptr<AccumulatorState> accumulator{new AccumulatorMedian(expCtx, std::move(algo))};
+    return accumulator;
+}
+
+Value AccumulatorMedian::getValue(bool toBeMerged) {
+    // The base class reports its percentiles as an array; $median exposes the lone element of
+    // that array as a scalar instead. A null result from the base class is passed through as is.
+    auto percentiles = AccumulatorPercentile::getValue(toBeMerged);
+    const bool hasPercentiles = percentiles.getType() != jstNULL;
+    if (hasPercentiles) {
+        tassert(7436101,
+                "the percentile algorithm for median must return a single result.",
+                percentiles.getArrayLength() == 1);
+
+        return Value(percentiles.getArray().front());
+    }
+
+    return percentiles;
+}
 }  // namespace mongo
--- a/src/mongo/db/pipeline/accumulator_percentile.h
+++ b/src/mongo/db/pipeline/accumulator_percentile.h
@ -40,7 +40,7 @@ namespace mongo {
 class AccumulatorPercentile : public AccumulatorState {
 public:
    static constexpr auto kName = "$percentile"_sd;
-    const char* getOpName() const final {
+    const char* getOpName() const {
        return kName.rawData();
    }

@ -68,7 +68,7 @@ public:
     * Ingressing values and computing the requested percentiles.
     */
    void processInternal(const Value& input, bool merging) final;
-    Value getValue(bool toBeMerged) final;
+    Value getValue(bool toBeMerged);

    /**
     * Other infra for the accumulators.
@ -80,4 +80,30 @@ private:
    std::unique_ptr<PercentileAlgorithm> _algo;
 };

+/**
+ * Accumulator for computing $median. It is an AccumulatorPercentile whose 'p' is fixed to [0.5],
+ * with one difference in the shape of the result: getValue() is overridden so that the median is
+ * returned as a single value rather than as a single-element array.
+ */
+class AccumulatorMedian : public AccumulatorPercentile {
+public:
+    static constexpr auto kName = "$median"_sd;
+    // Returns the operator name, "$median".
+    const char* getOpName() const final {
+        return kName.rawData();
+    }
+
+    /**
+     * Parsing and creating the accumulator. Unlike $percentile, the $median specification has no
+     * 'p' field.
+     */
+    static AccumulationExpression parseArgs(ExpressionContext* expCtx,
+                                            BSONElement elem,
+                                            VariablesParseState vps);
+
+    static boost::intrusive_ptr<AccumulatorState> create(ExpressionContext* expCtx,
+                                                         std::unique_ptr<PercentileAlgorithm> algo);
+
+    AccumulatorMedian(ExpressionContext* expCtx, std::unique_ptr<PercentileAlgorithm> algo);
+
+    /**
+     * Returns the base-class result unwrapped from its single-element array; a null base-class
+     * result is returned unchanged.
+     */
+    Value getValue(bool toBeMerged) final;
+};
+
 }  // namespace mongo
--- a/src/mongo/db/pipeline/accumulator_percentile.idl
+++ b/src/mongo/db/pipeline/accumulator_percentile.idl
@ -58,3 +58,15 @@ structs:
                description: "The type of algorithm we will use to find the percentile."
                type: PercentileAlgorithmType
                optional: false
+    AccumulatorMedianSpec:
+        description: "Specification for the $median accumulator."
+        strict: true
+        fields:
+            input:
+                description: "The expression to find the median of."
+                type: IDLAnyType
+                optional: false
+            algorithm:
+                description: "The type of algorithm we will use to find the median."
+                type: PercentileAlgorithmType
+                optional: false