feat(product-assistant): evaluation pipeline (#26179)

Co-authored-by: Michael Matloka <michael@posthog.com>
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>

Parent: bc98fdab44
Commit: d836bc860a
.gitignore (vendored) | 8 +++++++-

@@ -69,4 +69,10 @@ plugin-transpiler/dist
 *.log
 # pyright config (keep this until we have a standardized one)
 pyrightconfig.json
-.temporal-worker-settings
+# Assistant Evaluation with Deepeval
+.deepeval
+.deepeval-cache.json
+.deepeval_telemtry.txt
+.temporal-worker-settings
+temp_test_run_data.json
+.temp-deepeval-cache.json
ee/hogai/assistant.py:

@@ -1,9 +1,9 @@
-from collections.abc import Generator
-from typing import Any, Literal, TypedDict, TypeGuard, Union
+from collections.abc import Generator, Hashable, Iterator
+from typing import Any, Literal, Optional, TypedDict, TypeGuard, Union, cast
 
 from langchain_core.messages import AIMessageChunk
 from langfuse.callback import CallbackHandler
-from langgraph.graph.state import StateGraph
+from langgraph.graph.state import CompiledStateGraph, StateGraph
 from pydantic import BaseModel
 from sentry_sdk import capture_exception
 

@@ -74,25 +74,49 @@ VISUALIZATION_NODES: dict[AssistantNodeName, type[SchemaGeneratorNode]] = {
 }
 
 
-class Assistant:
+class AssistantGraph:
     _team: Team
     _graph: StateGraph
 
     def __init__(self, team: Team):
        self._team = team
        self._graph = StateGraph(AssistantState)
+        self._has_start_node = False
 
-    def _compile_graph(self):
+    def add_edge(self, from_node: AssistantNodeName, to_node: AssistantNodeName):
+        if from_node == AssistantNodeName.START:
+            self._has_start_node = True
+        self._graph.add_edge(from_node, to_node)
+        return self
+
+    def compile(self):
+        if not self._has_start_node:
+            raise ValueError("Start node not added to the graph")
+        return self._graph.compile()
+
+    def add_start(self):
+        return self.add_edge(AssistantNodeName.START, AssistantNodeName.ROUTER)
+
+    def add_router(
+        self,
+        path_map: Optional[dict[Hashable, AssistantNodeName]] = None,
+    ):
         builder = self._graph
+        path_map = path_map or {
+            "trends": AssistantNodeName.TRENDS_PLANNER,
+            "funnel": AssistantNodeName.FUNNEL_PLANNER,
+        }
         router_node = RouterNode(self._team)
         builder.add_node(AssistantNodeName.ROUTER, router_node.run)
-        builder.add_edge(AssistantNodeName.START, AssistantNodeName.ROUTER)
         builder.add_conditional_edges(
             AssistantNodeName.ROUTER,
             router_node.router,
-            path_map={"trends": AssistantNodeName.TRENDS_PLANNER, "funnel": AssistantNodeName.FUNNEL_PLANNER},
+            path_map=cast(dict[Hashable, str], path_map),
         )
+        return self
+
+    def add_trends_planner(self, next_node: AssistantNodeName = AssistantNodeName.TRENDS_GENERATOR):
+        builder = self._graph
 
         create_trends_plan_node = TrendsPlannerNode(self._team)
         builder.add_node(AssistantNodeName.TRENDS_PLANNER, create_trends_plan_node.run)

@@ -111,26 +135,36 @@ class Assistant:
             create_trends_plan_tools_node.router,
             path_map={
                 "continue": AssistantNodeName.TRENDS_PLANNER,
-                "plan_found": AssistantNodeName.TRENDS_GENERATOR,
+                "plan_found": next_node,
             },
         )
 
-        generate_trends_node = TrendsGeneratorNode(self._team)
-        builder.add_node(AssistantNodeName.TRENDS_GENERATOR, generate_trends_node.run)
+        return self
 
-        generate_trends_tools_node = TrendsGeneratorToolsNode(self._team)
-        builder.add_node(AssistantNodeName.TRENDS_GENERATOR_TOOLS, generate_trends_tools_node.run)
+    def add_trends_generator(self, next_node: AssistantNodeName = AssistantNodeName.SUMMARIZER):
+        builder = self._graph
+
+        trends_generator = TrendsGeneratorNode(self._team)
+        builder.add_node(AssistantNodeName.TRENDS_GENERATOR, trends_generator.run)
+
+        trends_generator_tools = TrendsGeneratorToolsNode(self._team)
+        builder.add_node(AssistantNodeName.TRENDS_GENERATOR_TOOLS, trends_generator_tools.run)
 
         builder.add_edge(AssistantNodeName.TRENDS_GENERATOR_TOOLS, AssistantNodeName.TRENDS_GENERATOR)
         builder.add_conditional_edges(
             AssistantNodeName.TRENDS_GENERATOR,
-            generate_trends_node.router,
+            trends_generator.router,
             path_map={
                 "tools": AssistantNodeName.TRENDS_GENERATOR_TOOLS,
-                "next": AssistantNodeName.SUMMARIZER,
+                "next": next_node,
             },
         )
 
+        return self
+
+    def add_funnel_planner(self, next_node: AssistantNodeName = AssistantNodeName.FUNNEL_GENERATOR):
+        builder = self._graph
+
         funnel_planner = FunnelPlannerNode(self._team)
         builder.add_node(AssistantNodeName.FUNNEL_PLANNER, funnel_planner.run)
         builder.add_conditional_edges(

@@ -148,41 +182,69 @@ class Assistant:
             funnel_planner_tools.router,
             path_map={
                 "continue": AssistantNodeName.FUNNEL_PLANNER,
-                "plan_found": AssistantNodeName.FUNNEL_GENERATOR,
+                "plan_found": next_node,
             },
         )
 
+        return self
+
+    def add_funnel_generator(self, next_node: AssistantNodeName = AssistantNodeName.SUMMARIZER):
+        builder = self._graph
+
         funnel_generator = FunnelGeneratorNode(self._team)
         builder.add_node(AssistantNodeName.FUNNEL_GENERATOR, funnel_generator.run)
 
-        funnel_generator_tools_node = FunnelGeneratorToolsNode(self._team)
-        builder.add_node(AssistantNodeName.FUNNEL_GENERATOR_TOOLS, funnel_generator_tools_node.run)
+        funnel_generator_tools = FunnelGeneratorToolsNode(self._team)
+        builder.add_node(AssistantNodeName.FUNNEL_GENERATOR_TOOLS, funnel_generator_tools.run)
 
         builder.add_edge(AssistantNodeName.FUNNEL_GENERATOR_TOOLS, AssistantNodeName.FUNNEL_GENERATOR)
         builder.add_conditional_edges(
             AssistantNodeName.FUNNEL_GENERATOR,
-            generate_trends_node.router,
+            funnel_generator.router,
             path_map={
                 "tools": AssistantNodeName.FUNNEL_GENERATOR_TOOLS,
-                "next": AssistantNodeName.SUMMARIZER,
+                "next": next_node,
             },
         )
 
+        return self
+
+    def add_summarizer(self, next_node: AssistantNodeName = AssistantNodeName.END):
+        builder = self._graph
         summarizer_node = SummarizerNode(self._team)
         builder.add_node(AssistantNodeName.SUMMARIZER, summarizer_node.run)
-        builder.add_edge(AssistantNodeName.SUMMARIZER, AssistantNodeName.END)
+        builder.add_edge(AssistantNodeName.SUMMARIZER, next_node)
+        return self
 
-        return builder.compile()
+    def compile_full_graph(self):
+        return (
+            self.add_start()
+            .add_router()
+            .add_trends_planner()
+            .add_trends_generator()
+            .add_funnel_planner()
+            .add_funnel_generator()
+            .add_summarizer()
+            .compile()
+        )
+
+
+class Assistant:
+    _team: Team
+    _graph: CompiledStateGraph
+
+    def __init__(self, team: Team):
+        self._team = team
+        self._graph = AssistantGraph(team).compile_full_graph()
 
     def stream(self, conversation: Conversation) -> Generator[BaseModel, None, None]:
-        assistant_graph = self._compile_graph()
         callbacks = [langfuse_handler] if langfuse_handler else []
         messages = [message.root for message in conversation.messages]
 
         chunks = AIMessageChunk(content="")
         state: AssistantState = {"messages": messages, "intermediate_steps": None, "plan": None}
 
-        generator = assistant_graph.stream(
+        generator: Iterator[Any] = self._graph.stream(
             state,
             config={"recursion_limit": 24, "callbacks": callbacks},
             stream_mode=["messages", "values", "updates"],
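The refactor above splits graph construction (AssistantGraph, a chainable builder) from the streaming entry point (Assistant), so a subset of nodes can be compiled and invoked on its own. A minimal sketch of what that enables, using only names that appear in this diff (the `team` value is assumed to be an existing Team instance, as in the eval tests below):

    # Sketch only: compile just the trends planner sub-graph and run it once.
    graph = (
        AssistantGraph(team)
        .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_PLANNER)
        .add_trends_planner(AssistantNodeName.END)
        .compile()
    )
    state = graph.invoke({"messages": [HumanMessage(content="Show the $pageview trend")]})
    print(state["plan"])  # the plan text that the eval metrics grade

This mirrors the `_call_node` helpers in the new eval modules added below.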
ee/hogai/eval/__init__.py | 0 (new, empty file)
ee/hogai/eval/test_eval_funnel_planner.py | 179 (new file)

from deepeval import assert_test
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langgraph.graph.state import CompiledStateGraph

from ee.hogai.assistant import AssistantGraph
from ee.hogai.eval.utils import EvalBaseTest
from ee.hogai.utils import AssistantNodeName
from posthog.schema import HumanMessage


class TestEvalFunnelPlanner(EvalBaseTest):
    def _get_plan_correctness_metric(self):
        return GEval(
            name="Funnel Plan Correctness",
            criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a funnel insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about funnel insights.",
            evaluation_steps=[
                "A plan must define at least two series in the sequence, but it is not required to define any filters, exclusion steps, or a breakdown.",
                "Compare events, properties, math types, and property values of 'expected output' and 'actual output'.",
                "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.",
                # The criteria for aggregations must be more specific because there isn't a way to bypass them.
                "Check if the math types in 'actual output' match those in 'expected output.' If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.",
                "If 'expected output' contains exclusion steps, check if 'actual output' contains those, and heavily penalize if the exclusion steps are not present or different.",
                "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different. Plans may only have one breakdown.",
                # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time.
                "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.",
            ],
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
            ],
            threshold=0.7,
        )

    def _call_node(self, query):
        graph: CompiledStateGraph = (
            AssistantGraph(self.team)
            .add_edge(AssistantNodeName.START, AssistantNodeName.FUNNEL_PLANNER)
            .add_funnel_planner(AssistantNodeName.END)
            .compile()
        )
        state = graph.invoke({"messages": [HumanMessage(content=query)]})
        return state["plan"]

    def test_basic_funnel(self):
        query = "what was the conversion from a page view to sign up?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. $pageview
            2. signed_up
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_outputs_at_least_two_events(self):
        """
        Ambigious query. The funnel must return at least two events.
        """
        query = "how many users paid a bill?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. any event
            2. upgrade_plan
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_no_excessive_property_filters(self):
        query = "Show the user conversion from a sign up to a file download"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. signed_up
            2. downloaded_file
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_basic_filtering(self):
        query = (
            "What was the conversion from uploading a file to downloading it from Chrome and Safari in the last 30d?"
        )
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. uploaded_file
                - property filter 1:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Chrome
                - property filter 2:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Safari
            2. downloaded_file
                - property filter 1:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Chrome
                - property filter 2:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Safari
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_exclusion_steps(self):
        query = "What was the conversion from uploading a file to downloading it in the last 30d excluding users that deleted a file?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. uploaded_file
            2. downloaded_file

            Exclusions:
            - deleted_file
                - start index: 0
                - end index: 1
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_breakdown(self):
        query = "Show a conversion from uploading a file to downloading it segmented by a user's email"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. uploaded_file
            2. downloaded_file

            Breakdown by:
            - entity: person
            - property name: email
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_needle_in_a_haystack(self):
        query = "What was the conversion from a sign up to a paying customer on the personal-pro plan?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Sequence:
            1. signed_up
            2. paid_bill
                - property filter 1:
                    - entity: event
                    - property name: plan
                    - property type: String
                    - operator: equals
                    - property value: personal/pro
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])
ee/hogai/eval/test_eval_router.py | 59 (new file)

from langgraph.graph.state import CompiledStateGraph

from ee.hogai.assistant import AssistantGraph
from ee.hogai.eval.utils import EvalBaseTest
from ee.hogai.utils import AssistantNodeName
from posthog.schema import HumanMessage, RouterMessage


class TestEvalRouter(EvalBaseTest):
    def _call_node(self, query: str | list):
        graph: CompiledStateGraph = (
            AssistantGraph(self.team)
            .add_start()
            .add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END})
            .compile()
        )
        messages = [HumanMessage(content=query)] if isinstance(query, str) else query
        state = graph.invoke({"messages": messages})
        return state["messages"][-1].content

    def test_outputs_basic_trends_insight(self):
        query = "Show the $pageview trend"
        res = self._call_node(query)
        self.assertEqual(res, "trends")

    def test_outputs_basic_funnel_insight(self):
        query = "What is the conversion rate of users who uploaded a file to users who paid for a plan?"
        res = self._call_node(query)
        self.assertEqual(res, "funnel")

    def test_converts_trends_to_funnel(self):
        conversation = [
            HumanMessage(content="Show trends of $pageview and $identify"),
            RouterMessage(content="trends"),
            HumanMessage(content="Convert this insight to a funnel"),
        ]
        res = self._call_node(conversation[:1])
        self.assertEqual(res, "trends")
        res = self._call_node(conversation)
        self.assertEqual(res, "funnel")

    def test_converts_funnel_to_trends(self):
        conversation = [
            HumanMessage(content="What is the conversion from a page view to a sign up?"),
            RouterMessage(content="funnel"),
            HumanMessage(content="Convert this insight to a trends"),
        ]
        res = self._call_node(conversation[:1])
        self.assertEqual(res, "funnel")
        res = self._call_node(conversation)
        self.assertEqual(res, "trends")

    def test_outputs_single_trends_insight(self):
        """
        Must display a trends insight because it's not possible to build a funnel with a single series.
        """
        query = "how many users upgraded their plan to personal pro?"
        res = self._call_node(query)
        self.assertEqual(res, "trends")
ee/hogai/eval/test_eval_trends_planner.py | 163 (new file)

from deepeval import assert_test
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langgraph.graph.state import CompiledStateGraph

from ee.hogai.assistant import AssistantGraph
from ee.hogai.eval.utils import EvalBaseTest
from ee.hogai.utils import AssistantNodeName
from posthog.schema import HumanMessage


class TestEvalTrendsPlanner(EvalBaseTest):
    def _get_plan_correctness_metric(self):
        return GEval(
            name="Trends Plan Correctness",
            criteria="You will be given expected and actual generated plans to provide a taxonomy to answer a user's question with a trends insight. Compare the plans to determine whether the taxonomy of the actual plan matches the expected plan. Do not apply general knowledge about trends insights.",
            evaluation_steps=[
                "A plan must define at least one event and a math type, but it is not required to define any filters, breakdowns, or formulas.",
                "Compare events, properties, math types, and property values of 'expected output' and 'actual output'.",
                "Check if the combination of events, properties, and property values in 'actual output' can answer the user's question according to the 'expected output'.",
                # The criteria for aggregations must be more specific because there isn't a way to bypass them.
                "Check if the math types in 'actual output' match those in 'expected output'. Math types sometimes are interchangeable, so use your judgement. If the aggregation type is specified by a property, user, or group in 'expected output', the same property, user, or group must be used in 'actual output'.",
                "If 'expected output' contains a breakdown, check if 'actual output' contains a similar breakdown, and heavily penalize if the breakdown is not present or different.",
                "If 'expected output' contains a formula, check if 'actual output' contains a similar formula, and heavily penalize if the formula is not present or different.",
                # We don't want to see in the output unnecessary property filters. The assistant tries to use them all the time.
                "Heavily penalize if the 'actual output' contains any excessive output not present in the 'expected output'. For example, the `is set` operator in filters should not be used unless the user explicitly asks for it.",
            ],
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.EXPECTED_OUTPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
            ],
            threshold=0.7,
        )

    def _call_node(self, query):
        graph: CompiledStateGraph = (
            AssistantGraph(self.team)
            .add_edge(AssistantNodeName.START, AssistantNodeName.TRENDS_PLANNER)
            .add_trends_planner(AssistantNodeName.END)
            .compile()
        )
        state = graph.invoke({"messages": [HumanMessage(content=query)]})
        return state["plan"]

    def test_no_excessive_property_filters(self):
        query = "Show the $pageview trend"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - $pageview
                - math operation: total count
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_no_excessive_property_filters_for_a_defined_math_type(self):
        query = "What is the MAU?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - $pageview
                - math operation: unique users
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_basic_filtering(self):
        query = "can you compare how many Chrome vs Safari users uploaded a file in the last 30d?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - uploaded_file
                - math operation: total count
                - property filter 1:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Chrome
                - property filter 2:
                    - entity: event
                    - property name: $browser
                    - property type: String
                    - operator: equals
                    - property value: Safari

            Breakdown by:
            - breakdown 1:
                - entity: event
                - property name: $browser
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_formula_mode(self):
        query = "i want to see a ratio of identify divided by page views"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - $identify
                - math operation: total count
            - $pageview
                - math operation: total count

            Formula:
            `A/B`, where `A` is the total count of `$identify` and `B` is the total count of `$pageview`
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_math_type_by_a_property(self):
        query = "what is the average session duration?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - All Events
                - math operation: average by `$session_duration`
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_math_type_by_a_user(self):
        query = "What is the median page view count for a user?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - $pageview
                - math operation: median by users
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])

    def test_needle_in_a_haystack(self):
        query = "How frequently do people pay for a personal-pro plan?"
        test_case = LLMTestCase(
            input=query,
            expected_output="""
            Events:
            - paid_bill
                - math operation: total count
                - property filter 1:
                    - entity: event
                    - property name: plan
                    - property type: String
                    - operator: contains
                    - property value: personal/pro
            """,
            actual_output=self._call_node(query),
        )
        assert_test(test_case, [self._get_plan_correctness_metric()])
ee/hogai/eval/utils.py | 28 (new file)

import datetime as dt
import os

import pytest
from flaky import flaky

from posthog.demo.matrix.manager import MatrixManager
from posthog.tasks.demo_create_data import HedgeboxMatrix
from posthog.test.base import BaseTest


@pytest.mark.skipif(os.environ.get("DEEPEVAL") != "YES", reason="Only runs for the assistant evaluation")
@flaky(max_runs=3, min_passes=1)
class EvalBaseTest(BaseTest):
    @classmethod
    def setUpTestData(cls):
        super().setUpTestData()
        matrix = HedgeboxMatrix(
            seed="b1ef3c66-5f43-488a-98be-6b46d92fbcef",  # this seed generates all events
            now=dt.datetime.now(dt.UTC) - dt.timedelta(days=25),
            days_past=60,
            days_future=30,
            n_clusters=60,
            group_type_index_offset=0,
        )
        matrix_manager = MatrixManager(matrix, print_steps=True)
        existing_user = cls.team.organization.members.first()
        matrix_manager.run_on_team(cls.team, existing_user)
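EvalBaseTest seeds a Hedgebox demo project once per test class and is skipped unless the DEEPEVAL environment variable is set to "YES"; failing tests are retried up to three times via flaky. A rough sketch of adding another eval module on top of it (the module name, query, and outputs below are illustrative placeholders, not part of this commit):

    # Hypothetical ee/hogai/eval/test_eval_example.py, sketch only.
    from deepeval import assert_test
    from deepeval.metrics import GEval
    from deepeval.test_case import LLMTestCase, LLMTestCaseParams

    from ee.hogai.eval.utils import EvalBaseTest


    class TestEvalExample(EvalBaseTest):
        def test_example(self):
            # GEval needs an LLM judge configured (e.g. an OpenAI key) to actually score the case.
            metric = GEval(
                name="Example Correctness",
                criteria="Compare the expected and actual outputs.",
                evaluation_params=[
                    LLMTestCaseParams.INPUT,
                    LLMTestCaseParams.EXPECTED_OUTPUT,
                    LLMTestCaseParams.ACTUAL_OUTPUT,
                ],
                threshold=0.7,
            )
            # actual_output would normally come from invoking a compiled AssistantGraph sub-graph.
            test_case = LLMTestCase(input="query", expected_output="plan", actual_output="plan")
            assert_test(test_case, [metric])

Running the suite with DEEPEVAL=YES lets the skipif guard above admit the class.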
@@ -23,7 +23,7 @@ from ee.hogai.schema_generator.prompts import (
     QUESTION_PROMPT,
 )
 from ee.hogai.schema_generator.utils import SchemaGeneratorOutput
-from ee.hogai.utils import AssistantState, AssistantNode, filter_visualization_conversation
+from ee.hogai.utils import AssistantNode, AssistantState, filter_visualization_conversation
 from posthog.models.group_type_mapping import GroupTypeMapping
 from posthog.schema import (
     FailureMessage,
TestDecide snapshots (.ambr):

@@ -712,6 +712,22 @@
   '''
 # ---
 # name: TestDecide.test_flag_with_behavioural_cohorts.5
+  '''
+  SELECT "posthog_group"."id",
+         "posthog_group"."team_id",
+         "posthog_group"."group_key",
+         "posthog_group"."group_type_index",
+         "posthog_group"."group_properties",
+         "posthog_group"."created_at",
+         "posthog_group"."properties_last_updated_at",
+         "posthog_group"."properties_last_operation",
+         "posthog_group"."version"
+  FROM "posthog_group"
+  WHERE "posthog_group"."team_id" = 99999
+  LIMIT 21
+  '''
+# ---
+# name: TestDecide.test_flag_with_behavioural_cohorts.6
   '''
   SELECT "posthog_cohort"."id",
          "posthog_cohort"."name",

@@ -736,6 +752,22 @@
          AND "posthog_cohort"."team_id" = 99999)
   '''
 # ---
+# name: TestDecide.test_flag_with_behavioural_cohorts.7
+  '''
+  SELECT "posthog_group"."id",
+         "posthog_group"."team_id",
+         "posthog_group"."group_key",
+         "posthog_group"."group_type_index",
+         "posthog_group"."group_properties",
+         "posthog_group"."created_at",
+         "posthog_group"."properties_last_updated_at",
+         "posthog_group"."properties_last_operation",
+         "posthog_group"."version"
+  FROM "posthog_group"
+  WHERE "posthog_group"."team_id" = 99999
+  LIMIT 21
+  '''
+# ---
 # name: TestDecide.test_flag_with_regular_cohorts
   '''
   SELECT "posthog_hogfunction"."id",

TestDecide tests:

@@ -2624,12 +2624,12 @@ class TestDecide(BaseTest, QueryMatchingTest):
             created_by=self.user,
         )
 
-        with self.assertNumQueries(5):
+        with self.assertNumQueries(6):
             response = self._post_decide(api_version=3, distinct_id="example_id_1")
             self.assertEqual(response.json()["featureFlags"], {})
             self.assertEqual(response.json()["errorsWhileComputingFlags"], True)
 
-        with self.assertNumQueries(5):
+        with self.assertNumQueries(6):
             response = self._post_decide(api_version=3, distinct_id="another_id")
             self.assertEqual(response.json()["featureFlags"], {})
             self.assertEqual(response.json()["errorsWhileComputingFlags"], True)
requirements-dev.in:

@@ -56,3 +56,4 @@ flaky==3.7.0
 aioresponses==0.7.6
 prance==23.06.21.0
 openapi-spec-validator==0.7.1 # Needed for prance as a validation backend
+deepeval==1.5.5
requirements-dev.txt:

@@ -4,6 +4,10 @@ aiohttp==3.9.3
     # via
     #   -c requirements.txt
     #   aioresponses
+    #   datasets
+    #   fsspec
+    #   langchain
+    #   langchain-community
 aioresponses==0.7.6
     # via -r requirements-dev.in
 aiosignal==1.2.0

@@ -14,6 +18,13 @@ annotated-types==0.7.0
     # via
     #   -c requirements.txt
     #   pydantic
+anyio==4.6.2.post1
+    # via
+    #   -c requirements.txt
+    #   httpx
+    #   openai
+appdirs==1.4.4
+    # via ragas
 argcomplete==2.0.0
     # via datamodel-code-generator
 asgiref==3.7.2

@@ -45,7 +56,10 @@ botocore-stubs==1.34.84
 certifi==2019.11.28
     # via
     #   -c requirements.txt
+    #   httpcore
+    #   httpx
     #   requests
+    #   sentry-sdk
 cffi==1.16.0
     # via
     #   -c requirements.txt

@@ -61,6 +75,7 @@ click==8.1.7
     #   -c requirements.txt
     #   black
     #   inline-snapshot
+    #   typer
 colorama==0.4.4
     # via pytest-watch
 coverage==5.5

@@ -69,8 +84,26 @@ cryptography==39.0.2
     # via
     #   -c requirements.txt
     #   types-paramiko
+dataclasses-json==0.6.7
+    # via langchain-community
 datamodel-code-generator==0.26.1
     # via -r requirements-dev.in
+datasets==2.19.1
+    # via ragas
+deepeval==1.5.5
+    # via -r requirements-dev.in
+deprecated==1.2.15
+    # via
+    #   opentelemetry-api
+    #   opentelemetry-exporter-otlp-proto-grpc
+dill==0.3.8
+    # via
+    #   datasets
+    #   multiprocess
+distro==1.9.0
+    # via
+    #   -c requirements.txt
+    #   openai
 django==4.2.15
     # via
     #   -c requirements.txt

@@ -93,6 +126,8 @@ dnspython==2.2.1
     #   email-validator
 docopt==0.6.2
     # via pytest-watch
+docx2txt==0.8
+    # via deepeval
 email-validator==2.0.0.post2
     # via pydantic
 execnet==2.1.1

@@ -103,6 +138,11 @@ faker==17.5.0
     # via -r requirements-dev.in
 fakeredis==2.23.3
     # via -r requirements-dev.in
+filelock==3.12.0
+    # via
+    #   -c requirements.txt
+    #   datasets
+    #   huggingface-hub
 flaky==3.7.0
     # via -r requirements-dev.in
 freezegun==1.2.2

@@ -112,16 +152,51 @@ frozenlist==1.4.1
     #   -c requirements.txt
     #   aiohttp
     #   aiosignal
+fsspec==2023.10.0
+    # via
+    #   -c requirements.txt
+    #   datasets
+    #   huggingface-hub
 genson==1.2.2
     # via datamodel-code-generator
+googleapis-common-protos==1.60.0
+    # via
+    #   -c requirements.txt
+    #   opentelemetry-exporter-otlp-proto-grpc
+grpcio==1.63.2
+    # via
+    #   -c requirements.txt
+    #   deepeval
+    #   opentelemetry-exporter-otlp-proto-grpc
+h11==0.13.0
+    # via
+    #   -c requirements.txt
+    #   httpcore
+httpcore==1.0.2
+    # via
+    #   -c requirements.txt
+    #   httpx
+httpx==0.26.0
+    # via
+    #   -c requirements.txt
+    #   langsmith
+    #   openai
+huggingface-hub==0.26.2
+    # via datasets
 icdiff==2.0.5
     # via pytest-icdiff
 idna==3.10
     # via
     #   -c requirements.txt
+    #   anyio
     #   email-validator
+    #   httpx
     #   requests
     #   yarl
+importlib-metadata==7.0.0
+    # via
+    #   deepeval
+    #   opentelemetry-api
 inflect==5.6.2
     # via datamodel-code-generator
 iniconfig==1.1.1

@@ -132,6 +207,18 @@ isort==5.2.2
     # via datamodel-code-generator
 jinja2==3.1.4
     # via datamodel-code-generator
+jiter==0.5.0
+    # via
+    #   -c requirements.txt
+    #   openai
+jsonpatch==1.33
+    # via
+    #   -c requirements.txt
+    #   langchain-core
+jsonpointer==3.0.0
+    # via
+    #   -c requirements.txt
+    #   jsonpatch
 jsonschema==4.20.0
     # via
     #   -c requirements.txt

@@ -144,6 +231,38 @@ jsonschema-specifications==2023.12.1
     #   -c requirements.txt
     #   jsonschema
     #   openapi-schema-validator
+langchain==0.3.3
+    # via
+    #   -c requirements.txt
+    #   deepeval
+    #   langchain-community
+    #   ragas
+langchain-community==0.3.2
+    # via ragas
+langchain-core==0.3.10
+    # via
+    #   -c requirements.txt
+    #   deepeval
+    #   langchain
+    #   langchain-community
+    #   langchain-openai
+    #   langchain-text-splitters
+    #   ragas
+langchain-openai==0.2.2
+    # via
+    #   -c requirements.txt
+    #   deepeval
+    #   ragas
+langchain-text-splitters==0.3.0
+    # via
+    #   -c requirements.txt
+    #   langchain
+langsmith==0.1.132
+    # via
+    #   -c requirements.txt
+    #   langchain
+    #   langchain-community
+    #   langchain-core
 lazy-object-proxy==1.10.0
     # via openapi-spec-validator
 lupa==2.2

@@ -152,6 +271,8 @@ markdown-it-py==3.0.0
     # via rich
 markupsafe==2.1.5
     # via jinja2
+marshmallow==3.23.1
+    # via dataclasses-json
 mdurl==0.1.2
     # via markdown-it-py
 multidict==6.0.2

@@ -159,6 +280,8 @@ multidict==6.0.2
     #   -c requirements.txt
     #   aiohttp
     #   yarl
+multiprocess==0.70.16
+    # via datasets
 mypy==1.11.1
     # via -r requirements-dev.in
 mypy-baseline==0.7.0

@@ -170,18 +293,66 @@ mypy-extensions==1.0.0
     #   -r requirements-dev.in
     #   black
     #   mypy
+    #   typing-inspect
+nest-asyncio==1.6.0
+    # via ragas
+numpy==1.23.3
+    # via
+    #   -c requirements.txt
+    #   datasets
+    #   langchain
+    #   langchain-community
+    #   pandas
+    #   pyarrow
+    #   ragas
+openai==1.51.2
+    # via
+    #   -c requirements.txt
+    #   langchain-openai
+    #   ragas
 openapi-schema-validator==0.6.2
     # via openapi-spec-validator
 openapi-spec-validator==0.7.1
     # via -r requirements-dev.in
+opentelemetry-api==1.24.0
+    # via
+    #   deepeval
+    #   opentelemetry-exporter-otlp-proto-grpc
+    #   opentelemetry-sdk
+opentelemetry-exporter-otlp-proto-common==1.24.0
+    # via opentelemetry-exporter-otlp-proto-grpc
+opentelemetry-exporter-otlp-proto-grpc==1.24.0
+    # via deepeval
+opentelemetry-proto==1.24.0
+    # via
+    #   opentelemetry-exporter-otlp-proto-common
+    #   opentelemetry-exporter-otlp-proto-grpc
+opentelemetry-sdk==1.24.0
+    # via
+    #   deepeval
+    #   opentelemetry-exporter-otlp-proto-grpc
+opentelemetry-semantic-conventions==0.45b0
+    # via opentelemetry-sdk
+orjson==3.10.7
+    # via
+    #   -c requirements.txt
+    #   langsmith
 packaging==24.1
     # via
     #   -c requirements.txt
     #   -r requirements-dev.in
     #   black
     #   datamodel-code-generator
+    #   datasets
+    #   huggingface-hub
+    #   langchain-core
+    #   marshmallow
     #   prance
     #   pytest
+pandas==2.2.0
+    # via
+    #   -c requirements.txt
+    #   datasets
 parameterized==0.9.0
     # via -r requirements-dev.in
 pathable==0.4.3

@@ -196,10 +367,24 @@ pluggy==1.5.0
     # via
     #   -c requirements.txt
     #   pytest
+portalocker==2.10.1
+    # via deepeval
 pprintpp==0.4.0
     # via pytest-icdiff
 prance==23.6.21.0
     # via -r requirements-dev.in
+protobuf==4.22.1
+    # via
+    #   -c requirements.txt
+    #   deepeval
+    #   googleapis-common-protos
+    #   opentelemetry-proto
+pyarrow==17.0.0
+    # via
+    #   -c requirements.txt
+    #   datasets
+pyarrow-hotfix==0.6
+    # via datasets
 pycparser==2.20
     # via
     #   -c requirements.txt

@@ -208,21 +393,34 @@ pydantic==2.9.2
     # via
     #   -c requirements.txt
     #   datamodel-code-generator
+    #   deepeval
+    #   langchain
+    #   langchain-core
+    #   langsmith
+    #   openai
+    #   pydantic-settings
+    #   ragas
 pydantic-core==2.23.4
     # via
     #   -c requirements.txt
     #   pydantic
+pydantic-settings==2.6.1
+    # via langchain-community
 pygments==2.18.0
     # via rich
+pysbd==0.3.4
+    # via ragas
 pytest==8.0.2
     # via
     #   -r requirements-dev.in
+    #   deepeval
     #   pytest-asyncio
     #   pytest-cov
     #   pytest-django
     #   pytest-env
     #   pytest-icdiff
     #   pytest-mock
+    #   pytest-repeat
     #   pytest-split
     #   pytest-watch
     #   pytest-xdist

@@ -239,24 +437,44 @@ pytest-icdiff==0.6
     # via -r requirements-dev.in
 pytest-mock==3.11.1
     # via -r requirements-dev.in
+pytest-repeat==0.9.3
+    # via deepeval
 pytest-split==0.9.0
     # via -r requirements-dev.in
 pytest-watch==4.2.0
     # via -r requirements-dev.in
 pytest-xdist==3.6.1
-    # via -r requirements-dev.in
+    # via
+    #   -r requirements-dev.in
+    #   deepeval
 python-dateutil==2.8.2
     # via
     #   -c requirements.txt
     #   -r requirements-dev.in
     #   faker
     #   freezegun
+    #   pandas
+python-dotenv==0.21.0
+    # via
+    #   -c requirements.txt
+    #   pydantic-settings
+pytz==2023.3
+    # via
+    #   -c requirements.txt
+    #   pandas
 pyyaml==6.0.1
     # via
     #   -c requirements.txt
     #   datamodel-code-generator
+    #   datasets
+    #   huggingface-hub
     #   jsonschema-path
+    #   langchain
+    #   langchain-community
+    #   langchain-core
     #   responses
+ragas==0.2.5
+    # via deepeval
 redis==4.5.4
     # via
     #   -c requirements.txt

@@ -267,19 +485,39 @@ referencing==0.31.1
     #   jsonschema
     #   jsonschema-path
     #   jsonschema-specifications
+regex==2023.12.25
+    # via
+    #   -c requirements.txt
+    #   tiktoken
 requests==2.32.0
     # via
     #   -c requirements.txt
+    #   datasets
+    #   deepeval
     #   djangorestframework-stubs
+    #   fsspec
+    #   huggingface-hub
     #   jsonschema-path
+    #   langchain
+    #   langchain-community
+    #   langsmith
     #   prance
+    #   requests-toolbelt
     #   responses
+    #   tiktoken
+requests-toolbelt==1.0.0
+    # via
+    #   -c requirements.txt
+    #   langsmith
 responses==0.23.1
     # via -r requirements-dev.in
 rfc3339-validator==0.1.4
     # via openapi-schema-validator
 rich==13.7.1
-    # via inline-snapshot
+    # via
+    #   deepeval
+    #   inline-snapshot
+    #   typer
 rpds-py==0.16.2
     # via
     #   -c requirements.txt

@@ -291,6 +529,12 @@ ruamel-yaml-clib==0.2.8
     # via ruamel-yaml
 ruff==0.6.1
     # via -r requirements-dev.in
+sentry-sdk==1.44.1
+    # via
+    #   -c requirements.txt
+    #   deepeval
+shellingham==1.5.4
+    # via typer
 six==1.16.0
     # via
     #   -c requirements.txt

@@ -298,20 +542,54 @@ six==1.16.0
     #   prance
     #   python-dateutil
     #   rfc3339-validator
+sniffio==1.3.1
+    # via
+    #   -c requirements.txt
+    #   anyio
+    #   httpx
+    #   openai
 sortedcontainers==2.4.0
     # via
     #   -c requirements.txt
     #   fakeredis
+sqlalchemy==2.0.31
+    # via
+    #   -c requirements.txt
+    #   langchain
+    #   langchain-community
 sqlparse==0.4.4
     # via
     #   -c requirements.txt
     #   django
 syrupy==4.6.4
     # via -r requirements-dev.in
+tabulate==0.9.0
+    # via deepeval
+tenacity==8.4.2
+    # via
+    #   -c requirements.txt
+    #   deepeval
+    #   langchain
+    #   langchain-community
+    #   langchain-core
+tiktoken==0.8.0
+    # via
+    #   -c requirements.txt
+    #   langchain-openai
+    #   ragas
 toml==0.10.2
     # via
     #   coverage
     #   inline-snapshot
+tqdm==4.64.1
+    # via
+    #   -c requirements.txt
+    #   datasets
+    #   deepeval
+    #   huggingface-hub
+    #   openai
+typer==0.13.0
+    # via deepeval
 types-awscrt==0.20.9
     # via botocore-stubs
 types-freezegun==1.1.10

@@ -355,21 +633,43 @@ typing-extensions==4.12.2
     #   django-stubs
     #   django-stubs-ext
     #   djangorestframework-stubs
+    #   huggingface-hub
     #   inline-snapshot
+    #   langchain-core
     #   mypy
     #   mypy-boto3-s3
+    #   openai
+    #   opentelemetry-sdk
     #   pydantic
     #   pydantic-core
+    #   sqlalchemy
+    #   typer
+    #   typing-inspect
+typing-inspect==0.9.0
+    # via dataclasses-json
+tzdata==2023.3
+    # via
+    #   -c requirements.txt
+    #   pandas
 urllib3==1.26.18
     # via
     #   -c requirements.txt
     #   requests
     #   responses
+    #   sentry-sdk
 watchdog==2.1.8
     # via
     #   -r requirements-dev.in
     #   pytest-watch
+wrapt==1.15.0
+    # via
+    #   -c requirements.txt
+    #   deprecated
+xxhash==3.5.0
+    # via datasets
 yarl==1.9.4
     # via
     #   -c requirements.txt
     #   aiohttp
+zipp==3.21.0
+    # via importlib-metadata
requirements.in:

@@ -111,3 +111,5 @@ zxcvbn==4.4.28
 zstd==1.5.5.1
 xmlsec==1.3.13 # Do not change this version - it will break SAML
 lxml==4.9.4 # Do not change this version - it will break SAML
+grpcio~=1.63.2 # Version constrained so that `deepeval` can be installed in in dev
+tenacity~=8.4.2 # Version constrained so that `deepeval` can be installed in in dev
requirements.txt:

@@ -267,8 +267,9 @@ googleapis-common-protos==1.60.0
     # via
     #   google-api-core
     #   grpcio-status
-grpcio==1.57.0
+grpcio==1.63.2
     # via
+    #   -r requirements.in
     #   google-api-core
     #   grpcio-status
     #   sqlalchemy-bigquery

@@ -702,8 +703,9 @@ structlog==23.2.0
     #   django-structlog
 temporalio==1.7.1
     # via -r requirements.in
-tenacity==8.2.3
+tenacity==8.4.2
     # via
+    #   -r requirements.in
     #   celery-redbeat
     #   dlt
     #   langchain