mirror of https://github.com/PostHog/posthog.git synced 2024-11-28 09:16:49 +01:00

Surface p-value and loss from the API, so the frontend can display them when needed. (#8434)

Neil Kakkar 2022-02-04 12:12:26 +00:00 committed by GitHub
parent 261b317f86
commit 1fcf91fdfc
4 changed files with 48 additions and 36 deletions
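
For orientation, below is a minimal sketch of how a frontend or API consumer might interpret the results dict whose keys appear in the diffs that follow. Only the `significant`, `significance_code`, `expected_loss`, and `p_value` keys come from this commit; the helper name and formatting are illustrative assumptions.

```python
# Illustrative only: interpreting the results dict built by calculate_results
# below. The keys are taken from this commit; the helper itself is hypothetical.
from typing import Any, Dict


def summarize_experiment_results(results: Dict[str, Any]) -> str:
    code = results["significance_code"]
    if results["significant"]:
        # Funnel experiments surface "expected_loss"; trend experiments surface "p_value".
        detail = results.get("expected_loss", results.get("p_value"))
        return f"Significant (expected loss / p-value: {detail})"
    return f"Not significant: {code}"
```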

View File

@@ -83,7 +83,7 @@ class ClickhouseFunnelExperimentResult:
variant.key: probability for variant, probability in zip([control_variant, *test_variants], probabilities)
}
significance_code = self.are_results_significant(control_variant, test_variants, probabilities)
significance_code, loss = self.are_results_significant(control_variant, test_variants, probabilities)
return {
"insight": funnel_results,
@@ -91,6 +91,7 @@ class ClickhouseFunnelExperimentResult:
"significant": significance_code == ExperimentSignificanceCode.SIGNIFICANT,
"filters": self.funnel._filter.to_dict(),
"significance_code": significance_code,
"expected_loss": loss,
}
def get_variants(self, funnel_results):
@@ -141,35 +142,35 @@ class ClickhouseFunnelExperimentResult:
@staticmethod
def are_results_significant(
control_variant: Variant, test_variants: List[Variant], probabilities: List[Probability]
) -> ExperimentSignificanceCode:
) -> Tuple[ExperimentSignificanceCode, Probability]:
control_sample_size = control_variant.success_count + control_variant.failure_count
for variant in test_variants:
# We need a feature flag distribution threshold because distribution of people
# can skew wildly when there are few people in the experiment
if variant.success_count + variant.failure_count < FF_DISTRIBUTION_THRESHOLD:
return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE
return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1
if control_sample_size < FF_DISTRIBUTION_THRESHOLD:
return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE
return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1
if (
probabilities[0] < MIN_PROBABILITY_FOR_SIGNIFICANCE
and sum(probabilities[1:]) < MIN_PROBABILITY_FOR_SIGNIFICANCE
):
# Sum of probability of winning for all variants except control is less than 90%
return ExperimentSignificanceCode.LOW_WIN_PROBABILITY
return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1
best_test_variant = max(
test_variants, key=lambda variant: variant.success_count / (variant.success_count + variant.failure_count)
)
expected_loss = calculate_expected_loss(best_test_variant, [control_variant],)
expected_loss = calculate_expected_loss(best_test_variant, [control_variant])
if expected_loss >= EXPECTED_LOSS_SIGNIFICANCE_LEVEL:
return ExperimentSignificanceCode.HIGH_LOSS
return ExperimentSignificanceCode.HIGH_LOSS, expected_loss
return ExperimentSignificanceCode.SIGNIFICANT
return ExperimentSignificanceCode.SIGNIFICANT, expected_loss
def calculate_expected_loss(target_variant: Variant, variants: List[Variant]) -> float:
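
The body of `calculate_expected_loss` is not part of this diff. For readers unfamiliar with the metric, the sketch below shows one standard way to estimate it by Monte Carlo sampling from Beta posteriors; the `Variant` dataclass, the Beta(1 + successes, 1 + failures) prior, and the sample count are assumptions for illustration, not the repository's implementation.

```python
# Not part of this diff: a minimal sketch of estimating expected loss for
# funnel conversion rates by sampling Beta posteriors.
from dataclasses import dataclass
from typing import List

import numpy as np
from numpy.random import default_rng


@dataclass
class Variant:
    key: str
    success_count: int
    failure_count: int


def expected_loss_monte_carlo(
    target_variant: Variant, other_variants: List[Variant], simulations: int = 100_000
) -> float:
    # Expected conversion-rate loss from shipping `target_variant` when one of
    # `other_variants` might in fact convert better.
    rng = default_rng()
    target_samples = rng.beta(
        1 + target_variant.success_count, 1 + target_variant.failure_count, simulations
    )
    best_other = np.zeros(simulations)
    for variant in other_variants:
        samples = rng.beta(1 + variant.success_count, 1 + variant.failure_count, simulations)
        best_other = np.maximum(best_other, samples)
    # Loss is zero in draws where the target variant is already the best.
    return float(np.clip(best_other - target_samples, 0, None).mean())
```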

View File

@@ -139,11 +139,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
_, probability = ClickhouseFunnelExperimentResult.calculate_results(variant_control, [variant_test])
self.assertAlmostEqual(probability, 0.918, places=2)
self.assertAlmostEqual(calculate_expected_loss(variant_test, [variant_control]), 0.0016, places=3)
significant = ClickhouseFunnelExperimentResult.are_results_significant(
significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
variant_control, [variant_test], [probability]
)
self.assertAlmostEqual(loss, 0.0016, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)
def test_simulation_result_is_close_to_closed_form_solution(self):
@@ -178,9 +177,11 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
calculate_expected_loss(variant_test_2, [variant_control, variant_test_1]), 0.0004, places=3
)
significant = ClickhouseFunnelExperimentResult.are_results_significant(
# this loss only checks variant 2 against control
significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
variant_control, [variant_test_1, variant_test_2], probabilities
)
self.assertAlmostEqual(loss, 0.00000, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)
def test_calculate_results_for_two_test_variants_almost_equal(self):
@@ -205,9 +206,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
calculate_expected_loss(variant_test_2, [variant_control, variant_test_1]), 0.022, places=2
)
significant = ClickhouseFunnelExperimentResult.are_results_significant(
significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
variant_control, [variant_test_1, variant_test_2], probabilities
)
self.assertAlmostEqual(loss, 1, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)
def test_absolute_loss_less_than_one_percent_but_not_significant(self):
@@ -221,9 +223,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
self.assertAlmostEqual(calculate_expected_loss(variant_test_1, [variant_control]), 0.0010, places=3)
significant = ClickhouseFunnelExperimentResult.are_results_significant(
significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
variant_control, [variant_test_1], probabilities
)
self.assertAlmostEqual(loss, 1, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)
def test_calculate_results_for_three_test_variants(self):
@@ -251,9 +254,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
calculate_expected_loss(variant_test_2, [variant_control, variant_test_1, variant_test_3]), 0.0004, places=2
)
significant = ClickhouseFunnelExperimentResult.are_results_significant(
significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
variant_control, [variant_test_1, variant_test_2, variant_test_3], probabilities
)
self.assertAlmostEqual(loss, 0.0004, places=2)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)
def test_calculate_results_for_three_test_variants_almost_equal(self):
@@ -281,9 +285,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
)
# passing in artificial probabilities to subvert the low_probability threshold
significant = ClickhouseFunnelExperimentResult.are_results_significant(
significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
variant_control, [variant_test_1, variant_test_2, variant_test_3], [1, 0]
)
self.assertAlmostEqual(loss, 0.012, places=2)
self.assertEqual(significant, ExperimentSignificanceCode.HIGH_LOSS)
def test_calculate_results_for_three_test_variants_much_better_than_control(self):
@@ -302,11 +307,10 @@ class TestFunnelExperimentCalculator(unittest.TestCase):
)
self.assertAlmostEqual(probabilities[0], alternative_probability_for_control, places=1)
self.assertAlmostEqual(calculate_expected_loss(variant_test_3, [variant_control]), 0, places=2)
significant = ClickhouseFunnelExperimentResult.are_results_significant(
significant, loss = ClickhouseFunnelExperimentResult.are_results_significant(
variant_control, [variant_test_1, variant_test_2, variant_test_3], probabilities
)
self.assertAlmostEqual(loss, 0, places=2)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)
@@ -434,19 +438,19 @@ class TestTrendExperimentCalculator(unittest.TestCase):
self.assertAlmostEqual(p_value, 0.001, places=3)
# manually assign probabilities to control test case
significant = ClickhouseTrendExperimentResult.are_results_significant(
significant, p_value = ClickhouseTrendExperimentResult.are_results_significant(
variant_a, [variant_b, variant_c], [0.5, 0.4, 0.1]
)
self.assertAlmostEqual(p_value, 1, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)
# new B variant is worse, such that control probability ought to be high enough
variant_b = CountVariant("B", 100, 1, 200)
p_value = calculate_p_value(variant_a, [variant_b, variant_c])
self.assertAlmostEqual(p_value, 0, places=3)
significant = ClickhouseTrendExperimentResult.are_results_significant(
significant, p_value = ClickhouseTrendExperimentResult.are_results_significant(
variant_a, [variant_b, variant_c], [0.95, 0.03, 0.02]
)
self.assertAlmostEqual(p_value, 0, places=3)
self.assertEqual(significant, ExperimentSignificanceCode.SIGNIFICANT)
def test_results_with_different_exposures(self):
@@ -470,8 +474,9 @@ class TestTrendExperimentCalculator(unittest.TestCase):
p_value = calculate_p_value(variant_a, [variant_b, variant_c])
self.assertAlmostEqual(p_value, 0, places=3)
significant = ClickhouseTrendExperimentResult.are_results_significant(
significant, p_value = ClickhouseTrendExperimentResult.are_results_significant(
variant_a, [variant_b, variant_c], probabilities
)
self.assertAlmostEqual(p_value, 1, places=3)
# False because max probability is less than 0.9
self.assertEqual(significant, ExperimentSignificanceCode.LOW_WIN_PROBABILITY)

View File

@@ -2,7 +2,7 @@ import dataclasses
from datetime import datetime
from functools import lru_cache
from math import exp, lgamma, log
from typing import List, Optional, Type
from typing import List, Optional, Tuple, Type
from numpy.random import default_rng
from rest_framework.exceptions import ValidationError
@@ -110,7 +110,7 @@ class ClickhouseTrendExperimentResult:
variant.key: probability for variant, probability in zip([control_variant, *test_variants], probabilities)
}
significance_code = self.are_results_significant(control_variant, test_variants, probabilities)
significance_code, p_value = self.are_results_significant(control_variant, test_variants, probabilities)
return {
"insight": insight_results,
@@ -118,6 +118,7 @@ class ClickhouseTrendExperimentResult:
"significant": significance_code == ExperimentSignificanceCode.SIGNIFICANT,
"filters": self.query_filter.to_dict(),
"significance_code": significance_code,
"p_value": p_value,
}
def get_variants(self, insight_results, exposure_results):
@@ -186,31 +187,31 @@ class ClickhouseTrendExperimentResult:
@staticmethod
def are_results_significant(
control_variant: Variant, test_variants: List[Variant], probabilities: List[Probability]
) -> ExperimentSignificanceCode:
) -> Tuple[ExperimentSignificanceCode, Probability]:
# TODO: Experiment with Expected Loss calculations for trend experiments
for variant in test_variants:
# We need a feature flag distribution threshold because distribution of people
# can skew wildly when there are few people in the experiment
if variant.absolute_exposure < FF_DISTRIBUTION_THRESHOLD:
return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE
return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1
if control_variant.absolute_exposure < FF_DISTRIBUTION_THRESHOLD:
return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE
return ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE, 1
if (
probabilities[0] < MIN_PROBABILITY_FOR_SIGNIFICANCE
and sum(probabilities[1:]) < MIN_PROBABILITY_FOR_SIGNIFICANCE
):
# Sum of probability of winning for all variants except control is less than 90%
return ExperimentSignificanceCode.LOW_WIN_PROBABILITY
return ExperimentSignificanceCode.LOW_WIN_PROBABILITY, 1
p_value = calculate_p_value(control_variant, test_variants)
if p_value >= P_VALUE_SIGNIFICANCE_LEVEL:
return ExperimentSignificanceCode.HIGH_P_VALUE
return ExperimentSignificanceCode.HIGH_P_VALUE, p_value
return ExperimentSignificanceCode.SIGNIFICANT
return ExperimentSignificanceCode.SIGNIFICANT, p_value
def simulate_winning_variant_for_arrival_rates(target_variant: Variant, variants: List[Variant]) -> float:
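
`calculate_p_value` for trend (count) experiments is only referenced in this diff, not shown. The sketch below illustrates one common way to compute such a p-value: an exact conditional test of equal Poisson arrival rates. The `CountVariant` fields mirror the counts and `absolute_exposure` used above, while the two-variant restriction and the use of `scipy.stats.binomtest` are assumptions, not the repository's implementation.

```python
# Not the repository's calculate_p_value: a minimal sketch of an exact
# conditional test for equal Poisson arrival rates between two variants.
# Conditional on the total count, the control's count is Binomial under
# the null hypothesis of equal rates per unit of exposure.
from dataclasses import dataclass

from scipy.stats import binomtest


@dataclass
class CountVariant:
    key: str
    count: int
    absolute_exposure: int


def poisson_rate_p_value(control: CountVariant, test: CountVariant) -> float:
    total_count = control.count + test.count
    # Under equal rates, control's expected share of events equals its share of exposure.
    null_share = control.absolute_exposure / (control.absolute_exposure + test.absolute_exposure)
    return binomtest(control.count, total_count, null_share, alternative="two-sided").pvalue
```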

View File

@@ -4,6 +4,7 @@ from rest_framework import status
from ee.api.test.base import APILicensedTest
from ee.clickhouse.test.test_journeys import journeys_for
from ee.clickhouse.util import ClickhouseTestMixin, snapshot_clickhouse_queries
from posthog.constants import ExperimentSignificanceCode
from posthog.models.experiment import Experiment
from posthog.models.feature_flag import FeatureFlag
@@ -629,6 +630,8 @@ class ClickhouseTestFunnelExperimentResults(ClickhouseTestMixin, APILicensedTest
# Variant with test: Beta(2, 3) and control: Beta(3, 1) distribution
# The variant has very low probability of being better.
self.assertAlmostEqual(response_data["probability"]["test"], 0.114, places=2)
self.assertEqual(response_data["significance_code"], ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE)
self.assertAlmostEqual(response_data["expected_loss"], 1, places=2)
@snapshot_clickhouse_queries
def test_experiment_flow_with_event_results_for_three_test_variants(self):
@@ -734,10 +737,12 @@ class ClickhouseTestFunnelExperimentResults(ClickhouseTestMixin, APILicensedTest
self.assertEqual(result[1][1]["count"], 1)
self.assertEqual("test", result[1][1]["breakdown_value"][0])
self.assertAlmostEqual(response_data["probability"]["test"], 0.031, places=2)
self.assertAlmostEqual(response_data["probability"]["test_1"], 0.158, places=2)
self.assertAlmostEqual(response_data["probability"]["test_2"], 0.324, places=2)
self.assertAlmostEqual(response_data["probability"]["control"], 0.486, places=2)
self.assertAlmostEqual(response_data["probability"]["test"], 0.031, places=1)
self.assertAlmostEqual(response_data["probability"]["test_1"], 0.158, places=1)
self.assertAlmostEqual(response_data["probability"]["test_2"], 0.324, places=1)
self.assertAlmostEqual(response_data["probability"]["control"], 0.486, places=1)
self.assertEqual(response_data["significance_code"], ExperimentSignificanceCode.NOT_ENOUGH_EXPOSURE)
self.assertAlmostEqual(response_data["expected_loss"], 1, places=2)
class ClickhouseTestTrendExperimentResults(ClickhouseTestMixin, APILicensedTest):