diff --git a/ee/clickhouse/queries/experiments/funnel_experiment_result.py b/ee/clickhouse/queries/experiments/funnel_experiment_result.py
index 9b0087a8edd..a9a683faae5 100644
--- a/ee/clickhouse/queries/experiments/funnel_experiment_result.py
+++ b/ee/clickhouse/queries/experiments/funnel_experiment_result.py
@@ -83,7 +83,7 @@ class ClickhouseFunnelExperimentResult:
             variant.key: probability for variant, probability in zip([control_variant, *test_variants], probabilities)
         }

-        significance_code = self.are_results_significant(control_variant, test_variants, probabilities)
+        significance_code, loss = self.are_results_significant(control_variant, test_variants, probabilities)

         return {
             "insight": funnel_results,
@@ -91,6 +91,7 @@ class ClickhouseFunnelExperimentResult:
             "significant": significance_code == ExperimentSignificanceCode.SIGNIFICANT,
             "filters": self.funnel._filter.to_dict(),
             "significance_code": significance_code,
+            "expected_loss": loss,
         }

     def get_variants(self, funnel_results):
@@ -141,35 +142,35 @@ class ClickhouseFunnelExperimentResult:
     @staticmethod
     def are_results_significant(
         control_variant: Variant, test_variants: List[Variant], probabilities: List[Probability]
-    ) -> ExperimentSignificanceCode:
+    ) -> Tuple[ExperimentSignificanceCode, Probability]:
         control_sample_size = control_variant.success_count + control_variant.failure_count

         for variant in test_variants:
             # We need a feature flag distribution threshold because distribution of people
             # can skew wildly when there are few people in the experiment
             if variant.success_count + variant.failure_count < FF_DISTRIBUTION_THRESHOLD:
-                return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE
+                return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1

         if control_sample_size < FF_DISTRIBUTION_THRESHOLD:
-            return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE
+            return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1

         if (
             probabilities[0] < MIN_PROBABILITY_FOR_SIGNIFICANCE
             and sum(probabilities[1:]) < MIN_PROBABILITY_FOR_SIGNIFICANCE
         ):
             # Sum of probability of winning for all variants except control is less than 90%
-            return ExperimentSignificanceCode.LOW_WIN_PROBABILITY
+            return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1

         best_test_variant = max(
             test_variants, key=lambda variant: variant.success_count / (variant.success_count + variant.failure_count)
         )

-        expected_loss = calculate_expected_loss(best_test_variant, [control_variant],)
+        expected_loss = calculate_expected_loss(best_test_variant, [control_variant])

         if expected_loss >= EXPECTED_LOSS_SIGNIFICANCE_LEVEL:
-            return ExperimentSignificanceCode.HIGH_LOSS
+            return ExperimentSignificanceCode.HIGH_LOSS, expected_loss

-        return ExperimentSignificanceCode.SIGNIFICANT
+        return ExperimentSignificanceCode.SIGNIFICANT, expected_loss


 def calculate_expected_loss(target_variant: Variant, variants: List[Variant]) -> float:
diff --git a/ee/clickhouse/queries/experiments/test_experiment_result.py b/ee/clickhouse/queries/experiments/test_experiment_result.py
index 4d18e26c13a..2d513f45f84 100644
--- a/ee/clickhouse/queries/experiments/test_experiment_result.py
+++ b/ee/clickhouse/queries/experiments/test_experiment_result.py
@@ -139,11 +139,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
         _, probability = ClickhouseFunnelExperimentResult.calculate_results(variant_control, [variant_test])
         self.assertAlmostEqual(probability, 0.918, places=2)

-        self.assertAlmostEqual(calculate_expected_loss(variant_test, [variant_control]), 0.0016, places=3)
-
-        significant = ClickhouseFunnelExperimentResult.are_results_significant(
+        significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
             variant_control, [variant_test], [probability]
         )
+        self.assertAlmostEqual(loss, 0.0016, places=3)
         self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

     def test_simulation_result_is_close_to_closed_form_solution(self):
@@ -178,9 +177,11 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
             calculate_expected_loss(variant_test_2, [variant_control, variant_test_1]), 0.0004, places=3
         )

-        significant = ClickhouseFunnelExperimentResult.are_results_significant(
+        # this loss only checks variant 2 against control
+        significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
             variant_control, [variant_test_1, variant_test_2], probabilities
         )
+        self.assertAlmostEqual(loss, 0.00000, places=3)
         self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

     def test_calculate_results_for_two_test_variants_almost_equal(self):
@@ -205,9 +206,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
             calculate_expected_loss(variant_test_2, [variant_control, variant_test_1]), 0.022, places=2
         )

-        significant = ClickhouseFunnelExperimentResult.are_results_significant(
+        significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
             variant_control, [variant_test_1, variant_test_2], probabilities
         )
+        self.assertAlmostEqual(loss, 1, places=3)
         self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)

     def test_absolute_loss_less_than_one_percent_but_not_significant(self):
@@ -221,9 +223,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):

         self.assertAlmostEqual(calculate_expected_loss(variant_test_1, [variant_control]), 0.0010, places=3)

-        significant = ClickhouseFunnelExperimentResult.are_results_significant(
+        significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
             variant_control, [variant_test_1], probabilities
         )
+        self.assertAlmostEqual(loss, 1, places=3)
         self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)

     def test_calculate_results_for_three_test_variants(self):
@@ -251,9 +254,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
             calculate_expected_loss(variant_test_2, [variant_control, variant_test_1, variant_test_3]), 0.0004, places=2
         )

-        significant = ClickhouseFunnelExperimentResult.are_results_significant(
+        significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
             variant_control, [variant_test_1, variant_test_2, variant_test_3], probabilities
         )
+        self.assertAlmostEqual(loss, 0.0004, places=2)
         self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

     def test_calculate_results_for_three_test_variants_almost_equal(self):
@@ -281,9 +285,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
         )

         # passing in artificial probabilities to subvert the low_probability threshold
-        significant = ClickhouseFunnelExperimentResult.are_results_significant(
+        significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
             variant_control, [variant_test_1, variant_test_2, variant_test_3], [1, 0]
         )
+        self.assertAlmostEqual(loss, 0.012, places=2)
         self.assertEqual(significant, ExperimentSignificanceCode.HIGH_LOSS)

     def test_calculate_results_for_three_test_variants_much_better_than_control(self):
@@ -302,11 +307,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
         )

         self.assertAlmostEqual(probabilities[0], alternative_probability_for_control, places=1)

-        self.assertAlmostEqual(calculate_expected_loss(variant_test_3, [variant_control]), 0, places=2)
-
-        significant = ClickhouseFunnelExperimentResult.are_results_significant(
+        significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
             variant_control, [variant_test_1, variant_test_2, variant_test_3], probabilities
         )
+        self.assertAlmostEqual(loss, 0, places=2)
         self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

@@ -434,19 +438,19 @@ class TestTrendExperimentCalculator(unittest.TestCase):
         self.assertAlmostEqual(p_value, 0.001, places=3)

         # manually assign probabilities to control test case
-        significant = ClickhouseTrendExperimentResult.are_results_significant(
+        significant, p_value = ClickhouseTrendExperimentResult.are_results_significant(
             variant_a, [variant_b, variant_c], [0.5, 0.4, 0.1]
         )
+        self.assertAlmostEqual(p_value, 1, places=3)
         self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)

         # new B variant is worse, such that control probability ought to be high enough
         variant_b = CountVariant("B", 100, 1, 200)

-        p_value = calculate_p_value(variant_a, [variant_b, variant_c])
-        self.assertAlmostEqual(p_value, 0, places=3)
-        significant = ClickhouseTrendExperimentResult.are_results_significant(
+        significant, p_value = ClickhouseTrendExperimentResult.are_results_significant(
             variant_a, [variant_b, variant_c], [0.95, 0.03, 0.02]
         )
+        self.assertAlmostEqual(p_value, 0, places=3)
         self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)

     def test_results_with_different_exposures(self):
@@ -470,8 +474,9 @@ class TestTrendExperimentCalculator(unittest.TestCase):
         p_value = calculate_p_value(variant_a, [variant_b, variant_c])
         self.assertAlmostEqual(p_value, 0, places=3)

-        significant = ClickhouseTrendExperimentResult.are_results_significant(
+        significant, p_value = ClickhouseTrendExperimentResult.are_results_significant(
             variant_a, [variant_b, variant_c], probabilities
         )
+        self.assertAlmostEqual(p_value, 1, places=3)
         # False because max probability is less than 0.9
         self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)
diff --git a/ee/clickhouse/queries/experiments/trend_experiment_result.py b/ee/clickhouse/queries/experiments/trend_experiment_result.py
index b1dae403bc9..2f8f8202637 100644
--- a/ee/clickhouse/queries/experiments/trend_experiment_result.py
+++ b/ee/clickhouse/queries/experiments/trend_experiment_result.py
@@ -2,7 +2,7 @@ import dataclasses
 from datetime import datetime
 from functools import lru_cache
 from math import exp, lgamma, log
-from typing import List, Optional, Type
+from typing import List, Optional, Tuple, Type

 from numpy.random import default_rng
 from rest_framework.exceptions import ValidationError
@@ -110,7 +110,7 @@ class ClickhouseTrendExperimentResult:
             variant.key: probability for variant, probability in zip([control_variant, *test_variants], probabilities)
         }

-        significance_code = self.are_results_significant(control_variant, test_variants, probabilities)
+        significance_code, p_value = self.are_results_significant(control_variant, test_variants, probabilities)

         return {
             "insight": insight_results,
@@ -118,6 +118,7 @@ class ClickhouseTrendExperimentResult:
             "significant": significance_code == ExperimentSignificanceCode.SIGNIFICANT,
             "filters": self.query_filter.to_dict(),
             "significance_code": significance_code,
+            "p_value": p_value,
         }

     def get_variants(self, insight_results, exposure_results):
@@ -186,31 +187,31 @@ class ClickhouseTrendExperimentResult:
     @staticmethod
     def are_results_significant(
         control_variant: Variant, test_variants: List[Variant], probabilities: List[Probability]
-    ) -> ExperimentSignificanceCode:
+    ) -> Tuple[ExperimentSignificanceCode, Probability]:
         # TODO: Experiment with Expected Loss calculations for trend experiments

         for variant in test_variants:
             # We need a feature flag distribution threshold because distribution of people
             # can skew wildly when there are few people in the experiment
             if variant.absolute_exposure < FF_DISTRIBUTION_THRESHOLD:
-                return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE
+                return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1

         if control_variant.absolute_exposure < FF_DISTRIBUTION_THRESHOLD:
-            return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE
+            return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1

         if (
             probabilities[0] < MIN_PROBABILITY_FOR_SIGNIFICANCE
             and sum(probabilities[1:]) < MIN_PROBABILITY_FOR_SIGNIFICANCE
         ):
             # Sum of probability of winning for all variants except control is less than 90%
-            return ExperimentSignificanceCode.LOW_WIN_PROBABILITY
+            return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1

         p_value = calculate_p_value(control_variant, test_variants)

         if p_value >= P_VALUE_SIGNIFICANCE_LEVEL:
-            return ExperimentSignificanceCode.HIGH_P_VALUE
+            return ExperimentSignificanceCode.HIGH_P_VALUE, p_value

-        return ExperimentSignificanceCode.SIGNIFICANT
+        return ExperimentSignificanceCode.SIGNIFICANT, p_value


 def simulate_winning_variant_for_arrival_rates(target_variant: Variant, variants: List[Variant]) -> float:
diff --git a/ee/clickhouse/views/test/test_clickhouse_experiments.py b/ee/clickhouse/views/test/test_clickhouse_experiments.py
index fd07dea5500..d88fd67c440 100644
--- a/ee/clickhouse/views/test/test_clickhouse_experiments.py
+++ b/ee/clickhouse/views/test/test_clickhouse_experiments.py
@@ -4,6 +4,7 @@ from rest_framework import status
 from ee.api.test.base import APILicensedTest
 from ee.clickhouse.test.test_journeys import journeys_for
 from ee.clickhouse.util import ClickhouseTestMixin, snapshot_clickhouse_queries
+from posthog.constants import ExperimentSignificanceCode
 from posthog.models.experiment import Experiment
 from posthog.models.feature_flag import FeatureFlag

@@ -629,6 +630,8 @@ class ClickhouseTestFunnelExperimentResults(ClickhouseTestMixin, APILicensedTest
         # Variant with test: Beta(2, 3) and control: Beta(3, 1) distribution
         # The variant has very low probability of being better.
         self.assertAlmostEqual(response_data["probability"]["test"], 0.114, places=2)
+        self.assertEqual(response_data["significance_code"], ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE)
+        self.assertAlmostEqual(response_data["expected_loss"], 1, places=2)

     @snapshot_clickhouse_queries
     def test_experiment_flow_with_event_results_for_three_test_variants(self):
@@ -734,10 +737,12 @@ class ClickhouseTestFunnelExperimentResults(ClickhouseTestMixin, APILicensedTest
         self.assertEqual(result[1][1]["count"], 1)
         self.assertEqual("test", result[1][1]["breakdown_value"][0])

-        self.assertAlmostEqual(response_data["probability"]["test"], 0.031, places=2)
-        self.assertAlmostEqual(response_data["probability"]["test_1"], 0.158, places=2)
-        self.assertAlmostEqual(response_data["probability"]["test_2"], 0.324, places=2)
-        self.assertAlmostEqual(response_data["probability"]["control"], 0.486, places=2)
+        self.assertAlmostEqual(response_data["probability"]["test"], 0.031, places=1)
+        self.assertAlmostEqual(response_data["probability"]["test_1"], 0.158, places=1)
+        self.assertAlmostEqual(response_data["probability"]["test_2"], 0.324, places=1)
+        self.assertAlmostEqual(response_data["probability"]["control"], 0.486, places=1)
+        self.assertEqual(response_data["significance_code"], ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE)
+        self.assertAlmostEqual(response_data["expected_loss"], 1, places=2)


 class ClickhouseTestTrendExperimentResults(ClickhouseTestMixin, APILicensedTest):