Mirror of https://github.com/PostHog/posthog.git (synced 2024-11-24 00:47:50 +01:00)
feat(product-assistant): evaluation pipeline (#26179)
Co-authored-by: Michael Matloka <michael@posthog.com>
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
bc98fdab44
commit
d836bc860a
6  .gitignore (vendored)
@@ -69,4 +69,10 @@ plugin-transpiler/dist
*.log
# pyright config (keep this until we have a standardized one)
pyrightconfig.json
# Assistant Evaluation with Deepeval
.deepeval
.deepeval-cache.json
.deepeval_telemtry.txt
.temporal-worker-settings
temp_test_run_data.json
.temp-deepeval-cache.json
@@ -1,9 +1,9 @@
-from collections.abc import Generator
-from typing import Any, Literal, TypedDict, TypeGuard, Union
+from collections.abc import Generator, Hashable, Iterator
+from typing import Any, Literal, Optional, TypedDict, TypeGuard, Union, cast

 from langchain_core.messages import AIMessageChunk
 from langfuse.callback import CallbackHandler
-from langgraph.graph.state import StateGraph
+from langgraph.graph.state import CompiledStateGraph, StateGraph
 from pydantic import BaseModel
 from sentry_sdk import capture_exception

@@ -74,25 +74,49 @@ VISUALIZATION_NODES: dict[AssistantNodeName, type[SchemaGeneratorNode]] = {
 }


-class Assistant:
+class AssistantGraph:
     _team: Team
     _graph: StateGraph

     def __init__(self, team: Team):
         self._team = team
         self._graph = StateGraph(AssistantState)
+        self._has_start_node = False

-    def _compile_graph(self):
+    def add_edge(self, from_node: AssistantNodeName, to_node: AssistantNodeName):
+        if from_node == AssistantNodeName.START:
+            self._has_start_node = True
+        self._graph.add_edge(from_node, to_node)
+        return self
+
+    def compile(self):
+        if not self._has_start_node:
+            raise ValueError("Start node not added to the graph")
+        return self._graph.compile()
+
+    def add_start(self):
+        return self.add_edge(AssistantNodeName.START, AssistantNodeName.ROUTER)
+
+    def add_router(
+        self,
+        path_map: Optional[dict[Hashable, AssistantNodeName]] = None,
+    ):
         builder = self._graph
+
+        path_map = path_map or {
+            "trends": AssistantNodeName.TRENDS_PLANNER,
+            "funnel": AssistantNodeName.FUNNEL_PLANNER,
+        }
         router_node = RouterNode(self._team)
         builder.add_node(AssistantNodeName.ROUTER, router_node.run)
-        builder.add_edge(AssistantNodeName.START, AssistantNodeName.ROUTER)
         builder.add_conditional_edges(
             AssistantNodeName.ROUTER,
             router_node.router,
-            path_map={"trends": AssistantNodeName.TRENDS_PLANNER, "funnel": AssistantNodeName.FUNNEL_PLANNER},
+            path_map=cast(dict[Hashable, str], path_map),
         )
+        return self

+    def add_trends_planner(self, next_node: AssistantNodeName = AssistantNodeName.TRENDS_GENERATOR):
+        builder = self._graph
+
         create_trends_plan_node = TrendsPlannerNode(self._team)
         builder.add_node(AssistantNodeName.TRENDS_PLANNER, create_trends_plan_node.run)

@@ -111,26 +135,36 @@ class Assistant:
             create_trends_plan_tools_node.router,
             path_map={
                 "continue": AssistantNodeName.TRENDS_PLANNER,
-                "plan_found": AssistantNodeName.TRENDS_GENERATOR,
+                "plan_found": next_node,
             },
         )

-        generate_trends_node = TrendsGeneratorNode(self._team)
-        builder.add_node(AssistantNodeName.TRENDS_GENERATOR, generate_trends_node.run)
+        return self

-        generate_trends_tools_node = TrendsGeneratorToolsNode(self._team)
-        builder.add_node(AssistantNodeName.TRENDS_GENERATOR_TOOLS, generate_trends_tools_node.run)
+    def add_trends_generator(self, next_node: AssistantNodeName = AssistantNodeName.SUMMARIZER):
+        builder = self._graph
+
+        trends_generator = TrendsGeneratorNode(self._team)
+        builder.add_node(AssistantNodeName.TRENDS_GENERATOR, trends_generator.run)
+
+        trends_generator_tools = TrendsGeneratorToolsNode(self._team)
+        builder.add_node(AssistantNodeName.TRENDS_GENERATOR_TOOLS, trends_generator_tools.run)

         builder.add_edge(AssistantNodeName.TRENDS_GENERATOR_TOOLS, AssistantNodeName.TRENDS_GENERATOR)
         builder.add_conditional_edges(
             AssistantNodeName.TRENDS_GENERATOR,
-            generate_trends_node.router,
+            trends_generator.router,
             path_map={
                 "tools": AssistantNodeName.TRENDS_GENERATOR_TOOLS,
-                "next": AssistantNodeName.SUMMARIZER,
+                "next": next_node,
             },
         )

         return self

+    def add_funnel_planner(self, next_node: AssistantNodeName = AssistantNodeName.FUNNEL_GENERATOR):
         builder = self._graph

         funnel_planner = FunnelPlannerNode(self._team)
         builder.add_node(AssistantNodeName.FUNNEL_PLANNER, funnel_planner.run)
         builder.add_conditional_edges(

@@ -148,41 +182,69 @@ class Assistant:
             funnel_planner_tools.router,
             path_map={
                 "continue": AssistantNodeName.FUNNEL_PLANNER,
-                "plan_found": AssistantNodeName.FUNNEL_GENERATOR,
+                "plan_found": next_node,
             },
         )

         return self

+    def add_funnel_generator(self, next_node: AssistantNodeName = AssistantNodeName.SUMMARIZER):
+        builder = self._graph
+
         funnel_generator = FunnelGeneratorNode(self._team)
         builder.add_node(AssistantNodeName.FUNNEL_GENERATOR, funnel_generator.run)

-        funnel_generator_tools_node = FunnelGeneratorToolsNode(self._team)
-        builder.add_node(AssistantNodeName.FUNNEL_GENERATOR_TOOLS, funnel_generator_tools_node.run)
+        funnel_generator_tools = FunnelGeneratorToolsNode(self._team)
+        builder.add_node(AssistantNodeName.FUNNEL_GENERATOR_TOOLS, funnel_generator_tools.run)

         builder.add_edge(AssistantNodeName.FUNNEL_GENERATOR_TOOLS, AssistantNodeName.FUNNEL_GENERATOR)
         builder.add_conditional_edges(
             AssistantNodeName.FUNNEL_GENERATOR,
-            generate_trends_node.router,
+            funnel_generator.router,
             path_map={
                 "tools": AssistantNodeName.FUNNEL_GENERATOR_TOOLS,
-                "next": AssistantNodeName.SUMMARIZER,
+                "next": next_node,
             },
         )

         return self

+    def add_summarizer(self, next_node: AssistantNodeName = AssistantNodeName.END):
+        builder = self._graph
         summarizer_node = SummarizerNode(self._team)
         builder.add_node(AssistantNodeName.SUMMARIZER, summarizer_node.run)
-        builder.add_edge(AssistantNodeName.SUMMARIZER, AssistantNodeName.END)
+        builder.add_edge(AssistantNodeName.SUMMARIZER, next_node)
         return self

-        return builder.compile()
+    def compile_full_graph(self):
+        return (
+            self.add_start()
+            .add_router()
+            .add_trends_planner()
+            .add_trends_generator()
+            .add_funnel_planner()
+            .add_funnel_generator()
+            .add_summarizer()
+            .compile()
+        )


+class Assistant:
+    _team: Team
+    _graph: CompiledStateGraph
+
+    def __init__(self, team: Team):
+        self._team = team
+        self._graph = AssistantGraph(team).compile_full_graph()

     def stream(self, conversation: Conversation) -> Generator[BaseModel, None, None]:
-        assistant_graph = self._compile_graph()
         callbacks = [langfuse_handler] if langfuse_handler else []
         messages = [message.root for message in conversation.messages]

         chunks = AIMessageChunk(content="")
         state: AssistantState = {"messages": messages, "intermediate_steps": None, "plan": None}

-        generator = assistant_graph.stream(
+        generator: Iterator[Any] = self._graph.stream(
             state,
             config={"recursion_limit": 24, "callbacks": callbacks},
             stream_mode=["messages", "values", "updates"],
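The refactor above splits graph construction from execution: AssistantGraph is a chainable builder (each add_* method returns self and takes a next_node), so callers can compile partial graphs that stop at any node, while Assistant now compiles the full pipeline once and streams it. A minimal sketch of both uses, relying only on methods introduced in this diff; `team` is assumed to be a posthog Team instance:

# Illustration only, not part of the commit; mirrors how the eval tests below compose sub-graphs.
from ee.hogai.assistant import AssistantGraph
from ee.hogai.utils import AssistantNodeName

# Partial graph: route a conversation and stop immediately on either branch.
router_only = (
    AssistantGraph(team)
    .add_start()
    .add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END})
    .compile()
)

# Full production pipeline, equivalent to what Assistant.__init__ now builds.
full_graph = AssistantGraph(team).compile_full_graph()

Note that compile() raises a ValueError unless an edge from AssistantNodeName.START was added, so a graph without an entry point fails fast instead of compiling silently.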
0  ee/hogai/eval/__init__.py (new file)
179  ee/hogai/eval/test_eval_funnel_planner.py (new file)
@@ -0,0 +1,179 @@
from deepeval import assert_test
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langgraph.graph.state import CompiledStateGraph

from ee.hogai.assistant import AssistantGraph
from ee.hogai.eval.utils import EvalBaseTest
from ee.hogai.utils import AssistantNodeName
from posthog.schema import HumanMessage


class TestEvalFunnelPlanner(EvalBaseTest):
    def _get_plan_correctness_metric(self):
        return GEval(
            name="Funnel Plan Correctness",
            criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a funnel insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about funnel insights.",
            evaluation_steps=[
                "A plan must define at least two series in the sequence, but it is not required to define any filters, exclusion steps, or a breakdown.",
                "Compare events, properties, math types, and property values of 'expected output' and 'actual output'.",
                "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.",
                # The criteria for aggregations must be more specific because there isn't a way to bypass them.
                "Check if the math types in 'actual output' match those in 'expected output.' If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.",
                "If 'expected output' contains exclusion steps, check if 'actual output' contains those, and heavily penalize if the exclusion steps are not present or different.",
                "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different. Plans may only have one breakdown.",
                # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time.
                "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.",
            ],
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
            ],
            threshold=0.7,
        )

    def _call_node(self, query):
        graph: CompiledStateGraph = (
            AssistantGraph(self.team)
            .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_PLANNER)
            .add_funnel_planner(AssistantNodeName.END)
            .compile()
        )
        state = graph.invoke({"messages": [HumanMessage(content=query)]})
        return state["plan"]

    def test_basic_funnel(self):
        query = "what was the conversion from a page view to sign up?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. $pageview
            2. signed_up
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_outputs_at_least_two_events(self):
        """
        Ambiguous query. The funnel must return at least two events.
        """
        query = "how many users paid a bill?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. any event
            2. upgrade_plan
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_no_excessive_property_filters(self):
        query = "Show the user conversion from a sign up to a file download"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. signed_up
            2. downloaded_file
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_basic_filtering(self):
        query = (
            "What was the conversion from uploading a file to downloading it from Chrome and Safari in the last 30d?"
        )
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. uploaded_file
                - property filter 1:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Chrome
                - property filter 2:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Safari
            2. downloaded_file
                - property filter 1:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Chrome
                - property filter 2:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Safari
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_exclusion_steps(self):
        query = "What was the conversion from uploading a file to downloading it in the last 30d excluding users that deleted a file?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. uploaded_file
            2. downloaded_file

            Exclusions:
            - deleted_file
                - start index: 0
                - end index: 1
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_breakdown(self):
        query = "Show a conversion from uploading a file to downloading it segmented by a user's email"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. uploaded_file
            2. downloaded_file

            Breakdown by:
            - entity: person
            - property name: email
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_needle_in_a_haystack(self):
        query = "What was the conversion from a sign up to a paying customer on the personal-pro plan?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. signed_up
            2. paid_bill
                - property filter 1:
                    - entity: event
                    - property name: plan
                    - property type: String
                    - operator: equals
                    - property value: personal/pro
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])
59  ee/hogai/eval/test_eval_router.py (new file)
@@ -0,0 +1,59 @@
from langgraph.graph.state import CompiledStateGraph

from ee.hogai.assistant import AssistantGraph
from ee.hogai.eval.utils import EvalBaseTest
from ee.hogai.utils import AssistantNodeName
from posthog.schema import HumanMessage, RouterMessage


class TestEvalRouter(EvalBaseTest):
    def _call_node(self, query: str | list):
        graph: CompiledStateGraph = (
            AssistantGraph(self.team)
            .add_start()
            .add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END})
            .compile()
        )
        messages = [HumanMessage(content=query)] if isinstance(query, str) else query
        state = graph.invoke({"messages": messages})
        return state["messages"][-1].content

    def test_outputs_basic_trends_insight(self):
        query = "Show the $pageview trend"
        res = self._call_node(query)
        self.assertEqual(res, "trends")

    def test_outputs_basic_funnel_insight(self):
        query = "What is the conversion rate of users who uploaded a file to users who paid for a plan?"
        res = self._call_node(query)
        self.assertEqual(res, "funnel")

    def test_converts_trends_to_funnel(self):
        conversation = [
            HumanMessage(content="Show trends of $pageview and $identify"),
            RouterMessage(content="trends"),
            HumanMessage(content="Convert this insight to a funnel"),
        ]
        res = self._call_node(conversation[:1])
        self.assertEqual(res, "trends")
        res = self._call_node(conversation)
        self.assertEqual(res, "funnel")

    def test_converts_funnel_to_trends(self):
        conversation = [
            HumanMessage(content="What is the conversion from a page view to a sign up?"),
            RouterMessage(content="funnel"),
            HumanMessage(content="Convert this insight to a trends"),
        ]
        res = self._call_node(conversation[:1])
        self.assertEqual(res, "funnel")
        res = self._call_node(conversation)
        self.assertEqual(res, "trends")

    def test_outputs_single_trends_insight(self):
        """
        Must display a trends insight because it's not possible to build a funnel with a single series.
        """
        query = "how many users upgraded their plan to personal pro?"
        res = self._call_node(query)
        self.assertEqual(res, "trends")
163  ee/hogai/eval/test_eval_trends_planner.py (new file)
@@ -0,0 +1,163 @@
from deepeval import assert_test
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langgraph.graph.state import CompiledStateGraph

from ee.hogai.assistant import AssistantGraph
from ee.hogai.eval.utils import EvalBaseTest
from ee.hogai.utils import AssistantNodeName
from posthog.schema import HumanMessage


class TestEvalTrendsPlanner(EvalBaseTest):
    def _get_plan_correctness_metric(self):
        return GEval(
            name="Trends Plan Correctness",
            criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a trends insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about trends insights.",
            evaluation_steps=[
                "A plan must define at least one event and a math type, but it is not required to define any filters, breakdowns, or formulas.",
                "Compare events, properties, math types, and property values of 'expected output' and 'actual output'.",
                "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.",
                # The criteria for aggregations must be more specific because there isn't a way to bypass them.
                "Check if the math types in 'actual output' match those in 'expected output'. Math types sometimes are interchangeable, so use your judgement. If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.",
                "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different.",
                "If 'expected output' contains a formula, check if 'actual output' contains a similar formula, and heavily penalize if the formula is not present or different.",
                # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time.
                "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.",
            ],
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
            ],
            threshold=0.7,
        )

    def _call_node(self, query):
        graph: CompiledStateGraph = (
            AssistantGraph(self.team)
            .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_PLANNER)
            .add_trends_planner(AssistantNodeName.END)
            .compile()
        )
        state = graph.invoke({"messages": [HumanMessage(content=query)]})
        return state["plan"]

    def test_no_excessive_property_filters(self):
        query = "Show the $pageview trend"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - $pageview
                - math operation: total count
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_no_excessive_property_filters_for_a_defined_math_type(self):
        query = "What is the MAU?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - $pageview
                - math operation: unique users
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_basic_filtering(self):
        query = "can you compare how many Chrome vs Safari users uploaded a file in the last 30d?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - uploaded_file
                - math operation: total count
                - property filter 1:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Chrome
                - property filter 2:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Safari

            Breakdown by:
            - breakdown 1:
                - entity: event
                - property name: $browser
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_formula_mode(self):
        query = "i want to see a ratio of identify divided by page views"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - $identify
                - math operation: total count
            - $pageview
                - math operation: total count

            Formula:
            `A/B`, where `A` is the total count of `$identify` and `B` is the total count of `$pageview`
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_math_type_by_a_property(self):
        query = "what is the average session duration?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - All Events
                - math operation: average by `$session_duration`
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_math_type_by_a_user(self):
        query = "What is the median page view count for a user?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - $pageview
                - math operation: median by users
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_needle_in_a_haystack(self):
        query = "How frequently do people pay for a personal-pro plan?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - paid_bill
                - math operation: total count
                - property filter 1:
                    - entity: event
                    - property name: plan
                    - property type: String
                    - operator: contains
                    - property value: personal/pro
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])
28  ee/hogai/eval/utils.py (new file)
@@ -0,0 +1,28 @@
import datetime as dt
import os

import pytest
from flaky import flaky

from posthog.demo.matrix.manager import MatrixManager
from posthog.tasks.demo_create_data import HedgeboxMatrix
from posthog.test.base import BaseTest


@pytest.mark.skipif(os.environ.get("DEEPEVAL") != "YES", reason="Only runs for the assistant evaluation")
@flaky(max_runs=3, min_passes=1)
class EvalBaseTest(BaseTest):
    @classmethod
    def setUpTestData(cls):
        super().setUpTestData()
        matrix = HedgeboxMatrix(
            seed="b1ef3c66-5f43-488a-98be-6b46d92fbcef",  # this seed generates all events
            now=dt.datetime.now(dt.UTC) - dt.timedelta(days=25),
            days_past=60,
            days_future=30,
            n_clusters=60,
            group_type_index_offset=0,
        )
        matrix_manager = MatrixManager(matrix, print_steps=True)
        existing_user = cls.team.organization.members.first()
        matrix_manager.run_on_team(cls.team, existing_user)
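EvalBaseTest makes the eval suite opt-in and reproducible: the skipif gate keeps it out of normal CI unless the DEEPEVAL environment variable is set to YES, flaky retries each LLM-backed test up to three times, and the fixed HedgeboxMatrix seed generates the same demo events for every run. The exact invocation is not part of this diff, but given the gate above it would look something like `DEEPEVAL=YES pytest ee/hogai/eval` (assuming pytest is the runner).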
@@ -23,7 +23,7 @@ from ee.hogai.schema_generator.prompts import (
    QUESTION_PROMPT,
)
from ee.hogai.schema_generator.utils import SchemaGeneratorOutput
-from ee.hogai.utils import AssistantState, AssistantNode, filter_visualization_conversation
+from ee.hogai.utils import AssistantNode, AssistantState, filter_visualization_conversation
from posthog.models.group_type_mapping import GroupTypeMapping
from posthog.schema import (
    FailureMessage,
@@ -712,6 +712,22 @@
  '''
# ---
# name: TestDecide.test_flag_with_behavioural_cohorts.5
  '''
  SELECT "posthog_group"."id",
         "posthog_group"."team_id",
         "posthog_group"."group_key",
         "posthog_group"."group_type_index",
         "posthog_group"."group_properties",
         "posthog_group"."created_at",
         "posthog_group"."properties_last_updated_at",
         "posthog_group"."properties_last_operation",
         "posthog_group"."version"
  FROM "posthog_group"
  WHERE "posthog_group"."team_id" = 99999
  LIMIT 21
  '''
# ---
# name: TestDecide.test_flag_with_behavioural_cohorts.6
  '''
  SELECT "posthog_cohort"."id",
         "posthog_cohort"."name",

@@ -736,6 +752,22 @@
         AND "posthog_cohort"."team_id" = 99999)
  '''
# ---
# name: TestDecide.test_flag_with_behavioural_cohorts.7
  '''
  SELECT "posthog_group"."id",
         "posthog_group"."team_id",
         "posthog_group"."group_key",
         "posthog_group"."group_type_index",
         "posthog_group"."group_properties",
         "posthog_group"."created_at",
         "posthog_group"."properties_last_updated_at",
         "posthog_group"."properties_last_operation",
         "posthog_group"."version"
  FROM "posthog_group"
  WHERE "posthog_group"."team_id" = 99999
  LIMIT 21
  '''
# ---
# name: TestDecide.test_flag_with_regular_cohorts
  '''
  SELECT "posthog_hogfunction"."id",
@@ -2624,12 +2624,12 @@ class TestDecide(BaseTest, QueryMatchingTest):
            created_by=self.user,
        )

-        with self.assertNumQueries(5):
+        with self.assertNumQueries(6):
            response = self._post_decide(api_version=3, distinct_id="example_id_1")
            self.assertEqual(response.json()["featureFlags"], {})
            self.assertEqual(response.json()["errorsWhileComputingFlags"], True)

-        with self.assertNumQueries(5):
+        with self.assertNumQueries(6):
            response = self._post_decide(api_version=3, distinct_id="another_id")
            self.assertEqual(response.json()["featureFlags"], {})
            self.assertEqual(response.json()["errorsWhileComputingFlags"], True)
@@ -56,3 +56,4 @@ flaky==3.7.0
aioresponses==0.7.6
prance==23.06.21.0
openapi-spec-validator==0.7.1  # Needed for prance as a validation backend
+deepeval==1.5.5
@@ -4,6 +4,10 @@ aiohttp==3.9.3
    # via
    #   -c requirements.txt
    #   aioresponses
    #   datasets
    #   fsspec
    #   langchain
    #   langchain-community
aioresponses==0.7.6
    # via -r requirements-dev.in
aiosignal==1.2.0

@@ -14,6 +18,13 @@ annotated-types==0.7.0
    # via
    #   -c requirements.txt
    #   pydantic
anyio==4.6.2.post1
    # via
    #   -c requirements.txt
    #   httpx
    #   openai
appdirs==1.4.4
    # via ragas
argcomplete==2.0.0
    # via datamodel-code-generator
asgiref==3.7.2

@@ -45,7 +56,10 @@ botocore-stubs==1.34.84
certifi==2019.11.28
    # via
    #   -c requirements.txt
    #   httpcore
    #   httpx
    #   requests
    #   sentry-sdk
cffi==1.16.0
    # via
    #   -c requirements.txt

@@ -61,6 +75,7 @@ click==8.1.7
    #   -c requirements.txt
    #   black
    #   inline-snapshot
    #   typer
colorama==0.4.4
    # via pytest-watch
coverage==5.5

@@ -69,8 +84,26 @@ cryptography==39.0.2
    # via
    #   -c requirements.txt
    #   types-paramiko
dataclasses-json==0.6.7
    # via langchain-community
datamodel-code-generator==0.26.1
    # via -r requirements-dev.in
datasets==2.19.1
    # via ragas
deepeval==1.5.5
    # via -r requirements-dev.in
deprecated==1.2.15
    # via
    #   opentelemetry-api
    #   opentelemetry-exporter-otlp-proto-grpc
dill==0.3.8
    # via
    #   datasets
    #   multiprocess
distro==1.9.0
    # via
    #   -c requirements.txt
    #   openai
django==4.2.15
    # via
    #   -c requirements.txt

@@ -93,6 +126,8 @@ dnspython==2.2.1
    #   email-validator
docopt==0.6.2
    # via pytest-watch
docx2txt==0.8
    # via deepeval
email-validator==2.0.0.post2
    # via pydantic
execnet==2.1.1

@@ -103,6 +138,11 @@ faker==17.5.0
    # via -r requirements-dev.in
fakeredis==2.23.3
    # via -r requirements-dev.in
filelock==3.12.0
    # via
    #   -c requirements.txt
    #   datasets
    #   huggingface-hub
flaky==3.7.0
    # via -r requirements-dev.in
freezegun==1.2.2

@@ -112,16 +152,51 @@ frozenlist==1.4.1
    #   -c requirements.txt
    #   aiohttp
    #   aiosignal
fsspec==2023.10.0
    # via
    #   -c requirements.txt
    #   datasets
    #   huggingface-hub
genson==1.2.2
    # via datamodel-code-generator
googleapis-common-protos==1.60.0
    # via
    #   -c requirements.txt
    #   opentelemetry-exporter-otlp-proto-grpc
grpcio==1.63.2
    # via
    #   -c requirements.txt
    #   deepeval
    #   opentelemetry-exporter-otlp-proto-grpc
h11==0.13.0
    # via
    #   -c requirements.txt
    #   httpcore
httpcore==1.0.2
    # via
    #   -c requirements.txt
    #   httpx
httpx==0.26.0
    # via
    #   -c requirements.txt
    #   langsmith
    #   openai
huggingface-hub==0.26.2
    # via datasets
icdiff==2.0.5
    # via pytest-icdiff
idna==3.10
    # via
    #   -c requirements.txt
    #   anyio
    #   email-validator
    #   httpx
    #   requests
    #   yarl
importlib-metadata==7.0.0
    # via
    #   deepeval
    #   opentelemetry-api
inflect==5.6.2
    # via datamodel-code-generator
iniconfig==1.1.1

@@ -132,6 +207,18 @@ isort==5.2.2
    # via datamodel-code-generator
jinja2==3.1.4
    # via datamodel-code-generator
jiter==0.5.0
    # via
    #   -c requirements.txt
    #   openai
jsonpatch==1.33
    # via
    #   -c requirements.txt
    #   langchain-core
jsonpointer==3.0.0
    # via
    #   -c requirements.txt
    #   jsonpatch
jsonschema==4.20.0
    # via
    #   -c requirements.txt

@@ -144,6 +231,38 @@ jsonschema-specifications==2023.12.1
    #   -c requirements.txt
    #   jsonschema
    #   openapi-schema-validator
langchain==0.3.3
    # via
    #   -c requirements.txt
    #   deepeval
    #   langchain-community
    #   ragas
langchain-community==0.3.2
    # via ragas
langchain-core==0.3.10
    # via
    #   -c requirements.txt
    #   deepeval
    #   langchain
    #   langchain-community
    #   langchain-openai
    #   langchain-text-splitters
    #   ragas
langchain-openai==0.2.2
    # via
    #   -c requirements.txt
    #   deepeval
    #   ragas
langchain-text-splitters==0.3.0
    # via
    #   -c requirements.txt
    #   langchain
langsmith==0.1.132
    # via
    #   -c requirements.txt
    #   langchain
    #   langchain-community
    #   langchain-core
lazy-object-proxy==1.10.0
    # via openapi-spec-validator
lupa==2.2

@@ -152,6 +271,8 @@ markdown-it-py==3.0.0
    # via rich
markupsafe==2.1.5
    # via jinja2
marshmallow==3.23.1
    # via dataclasses-json
mdurl==0.1.2
    # via markdown-it-py
multidict==6.0.2

@@ -159,6 +280,8 @@ multidict==6.0.2
    #   -c requirements.txt
    #   aiohttp
    #   yarl
multiprocess==0.70.16
    # via datasets
mypy==1.11.1
    # via -r requirements-dev.in
mypy-baseline==0.7.0

@@ -170,18 +293,66 @@ mypy-extensions==1.0.0
    #   -r requirements-dev.in
    #   black
    #   mypy
    #   typing-inspect
nest-asyncio==1.6.0
    # via ragas
numpy==1.23.3
    # via
    #   -c requirements.txt
    #   datasets
    #   langchain
    #   langchain-community
    #   pandas
    #   pyarrow
    #   ragas
openai==1.51.2
    # via
    #   -c requirements.txt
    #   langchain-openai
    #   ragas
openapi-schema-validator==0.6.2
    # via openapi-spec-validator
openapi-spec-validator==0.7.1
    # via -r requirements-dev.in
opentelemetry-api==1.24.0
    # via
    #   deepeval
    #   opentelemetry-exporter-otlp-proto-grpc
    #   opentelemetry-sdk
opentelemetry-exporter-otlp-proto-common==1.24.0
    # via opentelemetry-exporter-otlp-proto-grpc
opentelemetry-exporter-otlp-proto-grpc==1.24.0
    # via deepeval
opentelemetry-proto==1.24.0
    # via
    #   opentelemetry-exporter-otlp-proto-common
    #   opentelemetry-exporter-otlp-proto-grpc
opentelemetry-sdk==1.24.0
    # via
    #   deepeval
    #   opentelemetry-exporter-otlp-proto-grpc
opentelemetry-semantic-conventions==0.45b0
    # via opentelemetry-sdk
orjson==3.10.7
    # via
    #   -c requirements.txt
    #   langsmith
packaging==24.1
    # via
    #   -c requirements.txt
    #   -r requirements-dev.in
    #   black
    #   datamodel-code-generator
    #   datasets
    #   huggingface-hub
    #   langchain-core
    #   marshmallow
    #   prance
    #   pytest
pandas==2.2.0
    # via
    #   -c requirements.txt
    #   datasets
parameterized==0.9.0
    # via -r requirements-dev.in
pathable==0.4.3

@@ -196,10 +367,24 @@ pluggy==1.5.0
    # via
    #   -c requirements.txt
    #   pytest
portalocker==2.10.1
    # via deepeval
pprintpp==0.4.0
    # via pytest-icdiff
prance==23.6.21.0
    # via -r requirements-dev.in
protobuf==4.22.1
    # via
    #   -c requirements.txt
    #   deepeval
    #   googleapis-common-protos
    #   opentelemetry-proto
pyarrow==17.0.0
    # via
    #   -c requirements.txt
    #   datasets
pyarrow-hotfix==0.6
    # via datasets
pycparser==2.20
    # via
    #   -c requirements.txt

@@ -208,21 +393,34 @@ pydantic==2.9.2
    # via
    #   -c requirements.txt
    #   datamodel-code-generator
    #   deepeval
    #   langchain
    #   langchain-core
    #   langsmith
    #   openai
    #   pydantic-settings
    #   ragas
pydantic-core==2.23.4
    # via
    #   -c requirements.txt
    #   pydantic
pydantic-settings==2.6.1
    # via langchain-community
pygments==2.18.0
    # via rich
pysbd==0.3.4
    # via ragas
pytest==8.0.2
    # via
    #   -r requirements-dev.in
    #   deepeval
    #   pytest-asyncio
    #   pytest-cov
    #   pytest-django
    #   pytest-env
    #   pytest-icdiff
    #   pytest-mock
    #   pytest-repeat
    #   pytest-split
    #   pytest-watch
    #   pytest-xdist

@@ -239,24 +437,44 @@ pytest-icdiff==0.6
    # via -r requirements-dev.in
pytest-mock==3.11.1
    # via -r requirements-dev.in
pytest-repeat==0.9.3
    # via deepeval
pytest-split==0.9.0
    # via -r requirements-dev.in
pytest-watch==4.2.0
    # via -r requirements-dev.in
pytest-xdist==3.6.1
-    # via -r requirements-dev.in
+    # via
+    #   -r requirements-dev.in
+    #   deepeval
python-dateutil==2.8.2
    # via
    #   -c requirements.txt
    #   -r requirements-dev.in
    #   faker
    #   freezegun
    #   pandas
python-dotenv==0.21.0
    # via
    #   -c requirements.txt
    #   pydantic-settings
pytz==2023.3
    # via
    #   -c requirements.txt
    #   pandas
pyyaml==6.0.1
    # via
    #   -c requirements.txt
    #   datamodel-code-generator
    #   datasets
    #   huggingface-hub
    #   jsonschema-path
    #   langchain
    #   langchain-community
    #   langchain-core
    #   responses
ragas==0.2.5
    # via deepeval
redis==4.5.4
    # via
    #   -c requirements.txt

@@ -267,19 +485,39 @@ referencing==0.31.1
    #   jsonschema
    #   jsonschema-path
    #   jsonschema-specifications
regex==2023.12.25
    # via
    #   -c requirements.txt
    #   tiktoken
requests==2.32.0
    # via
    #   -c requirements.txt
    #   datasets
    #   deepeval
    #   djangorestframework-stubs
    #   fsspec
    #   huggingface-hub
    #   jsonschema-path
    #   langchain
    #   langchain-community
    #   langsmith
    #   prance
    #   requests-toolbelt
    #   responses
    #   tiktoken
requests-toolbelt==1.0.0
    # via
    #   -c requirements.txt
    #   langsmith
responses==0.23.1
    # via -r requirements-dev.in
rfc3339-validator==0.1.4
    # via openapi-schema-validator
rich==13.7.1
-    # via inline-snapshot
+    # via
+    #   deepeval
+    #   inline-snapshot
+    #   typer
rpds-py==0.16.2
    # via
    #   -c requirements.txt

@@ -291,6 +529,12 @@ ruamel-yaml-clib==0.2.8
    # via ruamel-yaml
ruff==0.6.1
    # via -r requirements-dev.in
sentry-sdk==1.44.1
    # via
    #   -c requirements.txt
    #   deepeval
shellingham==1.5.4
    # via typer
six==1.16.0
    # via
    #   -c requirements.txt

@@ -298,20 +542,54 @@ six==1.16.0
    #   prance
    #   python-dateutil
    #   rfc3339-validator
sniffio==1.3.1
    # via
    #   -c requirements.txt
    #   anyio
    #   httpx
    #   openai
sortedcontainers==2.4.0
    # via
    #   -c requirements.txt
    #   fakeredis
sqlalchemy==2.0.31
    # via
    #   -c requirements.txt
    #   langchain
    #   langchain-community
sqlparse==0.4.4
    # via
    #   -c requirements.txt
    #   django
syrupy==4.6.4
    # via -r requirements-dev.in
tabulate==0.9.0
    # via deepeval
tenacity==8.4.2
    # via
    #   -c requirements.txt
    #   deepeval
    #   langchain
    #   langchain-community
    #   langchain-core
tiktoken==0.8.0
    # via
    #   -c requirements.txt
    #   langchain-openai
    #   ragas
toml==0.10.2
    # via
    #   coverage
    #   inline-snapshot
tqdm==4.64.1
    # via
    #   -c requirements.txt
    #   datasets
    #   deepeval
    #   huggingface-hub
    #   openai
typer==0.13.0
    # via deepeval
types-awscrt==0.20.9
    # via botocore-stubs
types-freezegun==1.1.10

@@ -355,21 +633,43 @@ typing-extensions==4.12.2
    #   django-stubs
    #   django-stubs-ext
    #   djangorestframework-stubs
    #   huggingface-hub
    #   inline-snapshot
    #   langchain-core
    #   mypy
    #   mypy-boto3-s3
    #   openai
    #   opentelemetry-sdk
    #   pydantic
    #   pydantic-core
    #   sqlalchemy
    #   typer
    #   typing-inspect
typing-inspect==0.9.0
    # via dataclasses-json
tzdata==2023.3
    # via
    #   -c requirements.txt
    #   pandas
urllib3==1.26.18
    # via
    #   -c requirements.txt
    #   requests
    #   responses
    #   sentry-sdk
watchdog==2.1.8
    # via
    #   -r requirements-dev.in
    #   pytest-watch
wrapt==1.15.0
    # via
    #   -c requirements.txt
    #   deprecated
xxhash==3.5.0
    # via datasets
yarl==1.9.4
    # via
    #   -c requirements.txt
    #   aiohttp
zipp==3.21.0
    # via importlib-metadata
@@ -111,3 +111,5 @@ zxcvbn==4.4.28
zstd==1.5.5.1
xmlsec==1.3.13  # Do not change this version - it will break SAML
lxml==4.9.4  # Do not change this version - it will break SAML
+grpcio~=1.63.2  # Version constrained so that `deepeval` can be installed in dev
+tenacity~=8.4.2  # Version constrained so that `deepeval` can be installed in dev
@@ -267,8 +267,9 @@ googleapis-common-protos==1.60.0
    # via
    #   google-api-core
    #   grpcio-status
-grpcio==1.57.0
+grpcio==1.63.2
    # via
    #   -r requirements.in
    #   google-api-core
    #   grpcio-status
    #   sqlalchemy-bigquery

@@ -702,8 +703,9 @@ structlog==23.2.0
    #   django-structlog
temporalio==1.7.1
    # via -r requirements.in
-tenacity==8.2.3
+tenacity==8.4.2
    # via
    #   -r requirements.in
    #   celery-redbeat
    #   dlt
    #   langchain