# 0
# 0
# mirror of https://github.com/PostHog/posthog.git synced 2024-11-27 16:26:50 +01:00
# posthog/ee/surveys/summaries/summarize_surveys.py
# 2024-11-07 15:45:16 +00:00
#
# 136 lines
# 4.5 KiB
# Python

import json
import openai
from datetime import datetime
from typing import Optional, cast
from posthog.hogql import ast
from posthog.hogql.parser import parse_select
from posthog.hogql_queries.insights.paginators import HogQLHasMorePaginator
from posthog.schema import HogQLQueryResponse
from posthog.utils import get_instance_region
from prometheus_client import Histogram
from posthog.api.activity_log import ServerTimingsGathered
from posthog.models import Team, User
import structlog
# Module-level structured logger, named after this module.
logger = structlog.get_logger(__name__)

# Histogram of prompt sizes (in tokens) observed when generating survey summaries.
# Bucket upper bounds are grouped by order of magnitude below.
TOKENS_IN_PROMPT_HISTOGRAM = Histogram(
    name="posthog_survey_summary_tokens_in_prompt_histogram",
    documentation="histogram of the number of tokens in the prompt used to generate a survey summary",
    buckets=[
        # tiny prompts
        0, 10, 50, 100, 500,
        # 1k steps up to 8k
        1_000, 2_000, 3_000, 4_000, 5_000, 6_000, 7_000, 8_000,
        # 10k steps up to 50k
        10_000, 20_000, 30_000, 40_000, 50_000,
        # NOTE(review): 128_000 presumably mirrors the 128k-token limit mentioned
        # next to the model name where the completion is requested -- confirm.
        100_000, 128_000,
        # catch-all for anything larger
        float("inf"),
    ],
)
def prepare_data(query_response: HogQLQueryResponse) -> list[str]:
    """Flatten the survey answers contained in a query result into one list.

    Every row in ``query_response.results`` must carry a JSON-encoded object at
    index 1. Each row is decoded, and every truthy value stored under a key that
    starts with ``$survey_response`` is collected.

    Args:
        query_response: Query result whose ``results`` rows hold the JSON
            properties string in their second column.

    Returns:
        The collected values, ordered by row and then by key order within the
        row. Empty strings, ``None`` and other falsy values are skipped.

    Raises:
        json.JSONDecodeError: If a row's second column is not valid JSON.
    """
    # Decode every row up front so a malformed row fails before any filtering.
    decoded_rows: list[dict] = [json.loads(row[1]) for row in query_response.results]

    return [
        answer
        for event_properties in decoded_rows
        for name, answer in event_properties.items()
        if name.startswith("$survey_response") and answer
    ]
def summarize_survey_responses(
    survey_id: str, question_index: Optional[int], survey_start: datetime, survey_end: datetime, team: Team, user: User
):
    """Summarise the free-text answers to one survey question with an LLM.

    The function queries ``survey sent`` events for the survey, extracts the
    answers with ``prepare_data``, sends them to the OpenAI chat completions
    API and returns the generated markdown summary.

    Args:
        survey_id: Value matched against ``properties.$survey_id``.
        question_index: Selects the response property to filter on. A falsy
            value (``None`` or ``0``) uses ``$survey_response``; any other value
            uses ``$survey_response_<question_index>``.
        survey_start: Inclusive lower bound on the event timestamp.
        survey_end: Inclusive upper bound on the event timestamp.
        team: Team the HogQL query is executed for.
        user: Its ``pk`` is combined with the instance region to build the
            ``user`` identifier sent along with the completion request.

    Returns:
        A dict with ``"content"`` (the summary text, ``""`` when the model
        returned none) and ``"timings"`` (whatever the timings gatherer
        reports for the stages below).
    """
    timings = ServerTimingsGathered()

    with timings("prepare_query"):
        # The paginator is configured for a single window: limit 100, offset 0.
        pager = HogQLHasMorePaginator(limit=100, offset=0)

        responses_sql = (
            "\n"
            "SELECT distinct_id, properties\n"
            "FROM events\n"
            "WHERE event == 'survey sent'\n"
            "AND properties.$survey_id = {survey_id}\n"
            "-- e.g. `$survey_response` or `$survey_response_2`\n"
            "AND trim(JSONExtractString(properties, {survey_response_property})) != ''\n"
            "AND timestamp >= {start_date}\n"
            "AND timestamp <= {end_date}\n"
        )

        # A falsy index (None or 0) maps to the un-suffixed property name.
        if question_index:
            response_property = f"$survey_response_{question_index}"
        else:
            response_property = "$survey_response"

        placeholders = {
            "survey_id": ast.Constant(value=survey_id),
            "survey_response_property": ast.Constant(value=response_property),
            "start_date": ast.Constant(value=survey_start),
            "end_date": ast.Constant(value=survey_end),
        }
        select_query = parse_select(responses_sql, placeholders)

    with timings("run_query"):
        query_response = pager.execute_hogql_query(
            team=team,
            query_type="survey_response_list_query",
            query=cast(ast.SelectQuery, select_query),
        )

    with timings("llm_api_prep"):
        # Fall back to "HOBBY" when no instance region is reported.
        region = get_instance_region() or "HOBBY"
        answers = prepare_data(query_response)

    with timings("openai_completion"):
        system_prompt = (
            "\n"
            "You are a product manager's assistant. "
            "You summarise survey responses from users for the product manager.\n"
            "You don't do any other tasks.\n"
        )
        task_prompt = (
            "\n"
            "generate a one or two paragraph summary of the survey response.\n"
            "only summarize, the goal is to identify real user pain points and needs\n"
            "use bullet points to identify the themes, and highlights of quotes to bring them to life\n"
            "we're trying to identify what to work on\n"
            "use as concise and simple language as is possible.\n"
            "generate no text other than the summary.\n"
            "the aim is to let people see themes in the responses received. "
            "return the text in github flavoured markdown format"
        )
        chat_messages = [
            {"role": "system", "content": system_prompt},
            # The answers are interpolated as the list's Python repr.
            {"role": "user", "content": f"the survey responses are {answers}."},
            {"role": "user", "content": task_prompt},
        ]
        completion = openai.chat.completions.create(
            model="gpt-4o-mini",  # allows 128k tokens
            temperature=0.7,
            messages=chat_messages,
            user=f"{region}/{user.pk}",
        )

    # Record the prompt size only when usage data is present and non-zero.
    token_usage = completion.usage
    if token_usage:
        prompt_tokens = token_usage.prompt_tokens
        if prompt_tokens:
            TOKENS_IN_PROMPT_HISTOGRAM.observe(prompt_tokens)

    summary: str = completion.choices[0].message.content or ""
    return {"content": summary, "timings": timings.get_all_timings()}