From a6c3edb2041a036e1951a43a2e1cf47e85263d55 Mon Sep 17 00:00:00 2001 From: Paul D'Ambra Date: Wed, 17 Jan 2024 09:40:51 +0000 Subject: [PATCH] feat: session summaries (#19773) A quick spike of summarizing sessions to seek feedback from everyone. It's relatively slow and costs money, so I've wrapped it in a flag and am only summarizing one recording at a time. --- frontend/src/lib/api.ts | 6 + frontend/src/lib/constants.tsx | 1 + .../playlist/SessionRecordingPreview.tsx | 54 ++- .../playlist/SessionRecordingsPlaylist.tsx | 11 + .../sessionRecordingsPlaylistLogic.ts | 38 ++ frontend/src/types.ts | 1 + .../queries/session_replay_events.py | 36 ++ .../session_recording_api.py | 56 ++- .../session_summary/summarize_session.py | 344 ++++++++++++++++++ .../test/test_summarize_session.py | 116 ++++++ 10 files changed, 655 insertions(+), 8 deletions(-) create mode 100644 posthog/session_recordings/session_summary/summarize_session.py create mode 100644 posthog/session_recordings/session_summary/test/test_summarize_session.py diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index 854e5b657e4e1..efb943fbe7d1d 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -1537,6 +1537,12 @@ const api = { return await new ApiRequest().recording(recordingId).withAction('persist').create() }, + async summarize( + recordingId: SessionRecordingType['id'] + ): Promise<{ content: string; ai_result: Record }> { + return await new ApiRequest().recording(recordingId).withAction('summarize').create() + }, + async delete(recordingId: SessionRecordingType['id']): Promise<{ success: boolean }> { return await new ApiRequest().recording(recordingId).delete() }, diff --git a/frontend/src/lib/constants.tsx b/frontend/src/lib/constants.tsx index 7efe560091311..aca5707fccc69 100644 --- a/frontend/src/lib/constants.tsx +++ b/frontend/src/lib/constants.tsx @@ -194,6 +194,7 @@ export const FEATURE_FLAGS = { REDIRECT_WEB_PRODUCT_ANALYTICS_ONBOARDING: 
'redirect-web-product-analytics-onboarding', // owner: @biancayang RECRUIT_ANDROID_MOBILE_BETA_TESTERS: 'recruit-android-mobile-beta-testers', // owner: #team-replay SIDEPANEL_STATUS: 'sidepanel-status', // owner: @benjackwhite + AI_SESSION_SUMMARY: 'ai-session-summary', // owner: #team-replay } as const export type FeatureFlagKey = (typeof FEATURE_FLAGS)[keyof typeof FEATURE_FLAGS] diff --git a/frontend/src/scenes/session-recordings/playlist/SessionRecordingPreview.tsx b/frontend/src/scenes/session-recordings/playlist/SessionRecordingPreview.tsx index dfafdabf8add7..c05b9985a258c 100644 --- a/frontend/src/scenes/session-recordings/playlist/SessionRecordingPreview.tsx +++ b/frontend/src/scenes/session-recordings/playlist/SessionRecordingPreview.tsx @@ -1,12 +1,17 @@ import clsx from 'clsx' import { useValues } from 'kea' +import { FlaggedFeature } from 'lib/components/FlaggedFeature' import { PropertyIcon } from 'lib/components/PropertyIcon' import { TZLabel } from 'lib/components/TZLabel' -import { IconAutocapture, IconKeyboard, IconPinFilled, IconSchedule } from 'lib/lemon-ui/icons' +import { FEATURE_FLAGS } from 'lib/constants' +import { IconAutoAwesome, IconAutocapture, IconKeyboard, IconPinFilled, IconSchedule } from 'lib/lemon-ui/icons' +import { LemonButton } from 'lib/lemon-ui/LemonButton' import { LemonSkeleton } from 'lib/lemon-ui/LemonSkeleton' +import { Popover } from 'lib/lemon-ui/Popover' +import { Spinner } from 'lib/lemon-ui/Spinner' import { Tooltip } from 'lib/lemon-ui/Tooltip' import { colonDelimitedDuration } from 'lib/utils' -import { Fragment } from 'react' +import { Fragment, useState } from 'react' import { DraggableToNotebook } from 'scenes/notebooks/AddToNotebook/DraggableToNotebook' import { asDisplay } from 'scenes/persons/person-utils' import { playerSettingsLogic } from 'scenes/session-recordings/player/playerSettingsLogic' @@ -22,6 +27,8 @@ export interface SessionRecordingPreviewProps { isActive?: boolean onClick?: () => void 
pinned?: boolean + summariseFn?: (recording: SessionRecordingType) => void + sessionSummaryLoading?: boolean } function RecordingDuration({ @@ -228,18 +235,61 @@ export function SessionRecordingPreview({ onClick, onPropertyClick, pinned, + summariseFn, + sessionSummaryLoading, }: SessionRecordingPreviewProps): JSX.Element { const { durationTypeToShow } = useValues(playerSettingsLogic) const iconClassnames = clsx('SessionRecordingPreview__property-icon text-base text-muted-alt') + const [summaryPopoverIsVisible, setSummaryPopoverIsVisible] = useState(false) + + const [summaryButtonIsVisible, setSummaryButtonIsVisible] = useState(false) + return (
onClick?.()} + onMouseEnter={() => setSummaryButtonIsVisible(true)} + onMouseLeave={() => setSummaryButtonIsVisible(false)} > + + {summariseFn && ( + setSummaryPopoverIsVisible(false)} + overlay={ + sessionSummaryLoading ? ( + + ) : ( +
{recording.summary}
+ ) + } + > + } + onClick={(e) => { + e.preventDefault() + e.stopPropagation() + setSummaryPopoverIsVisible(!summaryPopoverIsVisible) + if (!recording.summary) { + summariseFn(recording) + } + }} + /> +
+ )} +
diff --git a/frontend/src/scenes/session-recordings/playlist/SessionRecordingsPlaylist.tsx b/frontend/src/scenes/session-recordings/playlist/SessionRecordingsPlaylist.tsx index 8e28741482152..9bd33405a4d38 100644 --- a/frontend/src/scenes/session-recordings/playlist/SessionRecordingsPlaylist.tsx +++ b/frontend/src/scenes/session-recordings/playlist/SessionRecordingsPlaylist.tsx @@ -92,6 +92,8 @@ function RecordingsLists(): JSX.Element { logicProps, showOtherRecordings, recordingsCount, + sessionSummaryLoading, + sessionBeingSummarized, } = useValues(sessionRecordingsPlaylistLogic) const { setSelectedRecordingId, @@ -102,6 +104,7 @@ function RecordingsLists(): JSX.Element { resetFilters, setShowAdvancedFilters, toggleShowOtherRecordings, + summarizeSession, } = useActions(sessionRecordingsPlaylistLogic) const onRecordingClick = (recording: SessionRecordingType): void => { @@ -112,6 +115,10 @@ function RecordingsLists(): JSX.Element { setFilters(defaultPageviewPropertyEntityFilter(filters, property, value)) } + const onSummarizeClick = (recording: SessionRecordingType): void => { + summarizeSession(recording.id) + } + const lastScrollPositionRef = useRef(0) const contentRef = useRef(null) @@ -248,6 +255,10 @@ function RecordingsLists(): JSX.Element { onPropertyClick={onPropertyClick} isActive={activeSessionRecordingId === rec.id} pinned={false} + summariseFn={onSummarizeClick} + sessionSummaryLoading={ + sessionSummaryLoading && sessionBeingSummarized === rec.id + } />
)) diff --git a/frontend/src/scenes/session-recordings/playlist/sessionRecordingsPlaylistLogic.ts b/frontend/src/scenes/session-recordings/playlist/sessionRecordingsPlaylistLogic.ts index 961178e00f9f0..3090a9707ef0c 100644 --- a/frontend/src/scenes/session-recordings/playlist/sessionRecordingsPlaylistLogic.ts +++ b/frontend/src/scenes/session-recordings/playlist/sessionRecordingsPlaylistLogic.ts @@ -205,6 +205,11 @@ export interface SessionRecordingPlaylistLogicProps { onPinnedChange?: (recording: SessionRecordingType, pinned: boolean) => void } +export interface SessionSummaryResponse { + id: SessionRecordingType['id'] + content: string +} + export const sessionRecordingsPlaylistLogic = kea([ path((key) => ['scenes', 'session-recordings', 'playlist', 'sessionRecordingsPlaylistLogic', key]), props({} as SessionRecordingPlaylistLogicProps), @@ -239,6 +244,7 @@ export const sessionRecordingsPlaylistLogic = kea ({ direction }), maybeLoadSessionRecordings: (direction?: 'newer' | 'older') => ({ direction }), + summarizeSession: (id: SessionRecordingType['id']) => ({ id }), loadNext: true, loadPrev: true, toggleShowOtherRecordings: (show?: boolean) => ({ show }), @@ -255,6 +261,15 @@ export const sessionRecordingsPlaylistLogic = kea ({ + sessionSummary: { + summarizeSession: async ({ id }): Promise => { + if (!id) { + return null + } + const response = await api.recordings.summarize(id) + return { content: response.content, id: id } + }, + }, eventsHaveSessionId: [ {} as Record, { @@ -342,6 +357,13 @@ export const sessionRecordingsPlaylistLogic = kea ({ + sessionBeingSummarized: [ + null as null | SessionRecordingType['id'], + { + summarizeSession: (_, { id }) => id, + sessionSummarySuccess: () => null, + }, + ], // If we initialise with pinned recordings then we don't show others by default // but if we go down to 0 pinned recordings then we show others showOtherRecordings: [ @@ -428,6 +450,7 @@ export const sessionRecordingsPlaylistLogic = kea state.map((s) => { if 
(s.id === id) { @@ -439,6 +462,21 @@ export const sessionRecordingsPlaylistLogic = kea { + return sessionSummary + ? state.map((s) => { + if (s.id === sessionSummary.id) { + return { + ...s, + summary: sessionSummary.content, + } + } else { + return s + } + }) + : state + }, }, ], selectedRecordingId: [ diff --git a/frontend/src/types.ts b/frontend/src/types.ts index 89396df0c5114..836e272388f38 100644 --- a/frontend/src/types.ts +++ b/frontend/src/types.ts @@ -1100,6 +1100,7 @@ export interface SessionRecordingType { console_error_count?: number /** Where this recording information was loaded from */ storage?: 'object_storage_lts' | 'object_storage' + summary?: string } export interface SessionRecordingPropertiesType { diff --git a/posthog/session_recordings/queries/session_replay_events.py b/posthog/session_recordings/queries/session_replay_events.py index e77d923fa4709..fb1f2ea30329e 100644 --- a/posthog/session_recordings/queries/session_replay_events.py +++ b/posthog/session_recordings/queries/session_replay_events.py @@ -6,8 +6,10 @@ from posthog.clickhouse.client import sync_execute from posthog.cloud_utils import is_cloud from posthog.constants import AvailableFeature + from posthog.models.instance_setting import get_instance_setting from posthog.models.team import Team + from posthog.session_recordings.models.metadata import ( RecordingMetadata, ) @@ -102,6 +104,40 @@ def get_metadata( console_error_count=replay[11], ) + def get_events( + self, session_id: str, team: Team, metadata: RecordingMetadata, events_to_ignore: List[str] | None + ) -> Tuple[List | None, List | None]: + from posthog.schema import HogQLQuery, HogQLQueryResponse + from posthog.hogql_queries.hogql_query_runner import HogQLQueryRunner + + q = """ + select event, timestamp, elements_chain, properties.$window_id, properties.$current_url, properties.$event_type + from events + where timestamp >= {start_time} and timestamp <= {end_time} + and $session_id = {session_id} + """ + if 
events_to_ignore: + q += " and event not in {events_to_ignore}" + + q += " order by timestamp asc" + + hq = HogQLQuery( + query=q, + values={ + "start_time": metadata["start_time"], + "end_time": metadata["end_time"], + "session_id": session_id, + "events_to_ignore": events_to_ignore, + }, + ) + + result: HogQLQueryResponse = HogQLQueryRunner( + team=team, + query=hq, + ).calculate() + + return result.columns, result.results + def ttl_days(team: Team) -> int: ttl_days = (get_instance_setting("RECORDINGS_TTL_WEEKS") or 3) * 7 diff --git a/posthog/session_recordings/session_recording_api.py b/posthog/session_recordings/session_recording_api.py index 31fe842a5ba2d..eb5293b251830 100644 --- a/posthog/session_recordings/session_recording_api.py +++ b/posthog/session_recordings/session_recording_api.py @@ -1,13 +1,16 @@ +import os import time from datetime import datetime, timedelta, timezone import json from typing import Any, List, Type, cast, Dict, Tuple + from django.conf import settings import posthoganalytics import requests from django.contrib.auth.models import AnonymousUser +from django.core.cache import cache from django.http import JsonResponse, HttpResponse from drf_spectacular.utils import extend_schema from loginas.utils import is_impersonated_session @@ -19,6 +22,7 @@ from posthog.api.person import MinimalPersonSerializer from posthog.api.routing import StructuredViewSetMixin from posthog.auth import SharingAccessTokenAuthentication +from posthog.cloud_utils import is_cloud from posthog.constants import SESSION_RECORDINGS_FILTER_IDS from posthog.models import User from posthog.models.filters.session_recordings_filter import SessionRecordingsFilter @@ -46,12 +50,14 @@ ) from posthog.session_recordings.queries.session_replay_events import SessionReplayEvents from posthog.session_recordings.realtime_snapshots import get_realtime_snapshots, publish_subscription +from posthog.session_recordings.session_summary.summarize_session import summarize_recording from 
posthog.session_recordings.snapshots.convert_legacy_snapshots import ( convert_original_version_lts_recording, ) from posthog.storage import object_storage from prometheus_client import Counter + SNAPSHOT_SOURCE_REQUESTED = Counter( "session_snapshots_requested_counter", "When calling the API and providing a concrete snapshot type to load.", @@ -61,8 +67,9 @@ # context manager for gathering a sequence of server timings class ServerTimingsGathered: - # Class level dictionary to store timings - timings_dict: Dict[str, float] = {} + def __init__(self): + # Instance level dictionary to store timings + self.timings_dict = {} def __call__(self, name): self.name = name @@ -77,11 +84,10 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): end_time = time.perf_counter() * 1000 elapsed_time = end_time - self.start_time - ServerTimingsGathered.timings_dict[self.name] = elapsed_time + self.timings_dict[self.name] = elapsed_time - @classmethod - def get_all_timings(cls): - return cls.timings_dict + def get_all_timings(self): + return self.timings_dict class SessionRecordingSerializer(serializers.ModelSerializer): @@ -475,6 +481,44 @@ def properties(self, request: request.Request, **kwargs): return Response({"results": session_recording_serializer.data}) + @action(methods=["POST"], detail=True) + def summarize(self, request: request.Request, **kwargs): + if not request.user.is_authenticated: + raise exceptions.NotAuthenticated() + + user = cast(User, request.user) + + cache_key = f'summarize_recording_{self.team.pk}_{self.kwargs["pk"]}' + # Check if the response is cached + cached_response = cache.get(cache_key) + if cached_response is not None: + return Response(cached_response) + + recording = self.get_object() + + if not SessionReplayEvents().exists(session_id=str(recording.session_id), team=self.team): + raise exceptions.NotFound("Recording not found") + + environment_is_allowed = settings.DEBUG or is_cloud() + has_openai_api_key = 
bool(os.environ.get("OPENAI_API_KEY")) + if not environment_is_allowed or not has_openai_api_key: + raise exceptions.ValidationError("session summary is only supported in PostHog Cloud") + + if not posthoganalytics.feature_enabled("ai-session-summary", str(user.distinct_id)): + raise exceptions.ValidationError("session summary is not enabled for this user") + + summary = summarize_recording(recording, user, self.team) + timings = summary.pop("timings", None) + cache.set(cache_key, summary, timeout=30) + + # let the browser cache for half the time we cache on the server + r = Response(summary, headers={"Cache-Control": "max-age=15"}) + if timings: + r.headers["Server-Timing"] = ", ".join( + f"{key};dur={round(duration, ndigits=2)}" for key, duration in timings.items() + ) + return r + def list_recordings( filter: SessionRecordingsFilter, request: request.Request, context: Dict[str, Any] diff --git a/posthog/session_recordings/session_summary/summarize_session.py b/posthog/session_recordings/session_summary/summarize_session.py new file mode 100644 index 0000000000000..503a3c97f3ab0 --- /dev/null +++ b/posthog/session_recordings/session_summary/summarize_session.py @@ -0,0 +1,344 @@ +import dataclasses +from datetime import datetime + +from typing import List, Dict, Any + +import openai +from prometheus_client import Histogram + +from posthog.api.activity_log import ServerTimingsGathered +from posthog.models import User, Team +from posthog.models.element import chain_to_elements +from posthog.session_recordings.models.session_recording import SessionRecording + +from posthog.session_recordings.queries.session_replay_events import SessionReplayEvents + +from posthog.utils import get_instance_region + +TOKENS_IN_PROMPT_HISTOGRAM = Histogram( + "posthog_session_summary_tokens_in_prompt_histogram", + "histogram of the number of tokens in the prompt used to generate a session summary", + buckets=[ + 0, + 10, + 50, + 100, + 500, + 1000, + 2000, + 3000, + 4000, + 5000, + 
6000, + 7000, + 8000, + 10000, + 20000, + 30000, + 40000, + 50000, + 100000, + 128000, + float("inf"), + ], +) + + +@dataclasses.dataclass +class SessionSummaryPromptData: + # we may allow customisation of columns included in the future, + # and we alter the columns present as we process the data + # so want to stay as loose as possible here + columns: List[str] = dataclasses.field(default_factory=list) + results: List[List[Any]] = dataclasses.field(default_factory=list) + # in order to reduce the number of tokens in the prompt + # we replace URLs with a placeholder and then pass this mapping of placeholder to URL into the prompt + url_mapping: Dict[str, str] = dataclasses.field(default_factory=dict) + + def is_empty(self) -> bool: + return not self.columns or not self.results + + def column_index(self, column: str) -> int | None: + for i, c in enumerate(self.columns): + if c == column: + return i + return None + + +def reduce_elements_chain(session_events: SessionSummaryPromptData) -> SessionSummaryPromptData: + if session_events.is_empty(): + return session_events + + # find elements_chain column index + elements_chain_index = session_events.column_index("elements_chain") + + reduced_results = [] + for result in session_events.results: + if elements_chain_index is None: + reduced_results.append(result) + continue + + elements_chain: str | None = result[elements_chain_index] + if not elements_chain: + reduced_results.append(result) + continue + + # the elements chain has lots of information that we don't need + elements = [e for e in chain_to_elements(elements_chain) if e.tag_name in e.USEFUL_ELEMENTS] + + result_list = list(result) + result_list[elements_chain_index] = [{"tag": e.tag_name, "text": e.text, "href": e.href} for e in elements] + reduced_results.append(result_list) + + return dataclasses.replace(session_events, results=reduced_results) + + +def simplify_window_id(session_events: SessionSummaryPromptData) -> SessionSummaryPromptData: + if 
session_events.is_empty(): + return session_events + + # find window_id column index + window_id_index = session_events.column_index("$window_id") + + window_id_mapping: Dict[str, int] = {} + simplified_results = [] + for result in session_events.results: + if window_id_index is None: + simplified_results.append(result) + continue + + window_id: str | None = result[window_id_index] + if not window_id: + simplified_results.append(result) + continue + + if window_id not in window_id_mapping: + window_id_mapping[window_id] = len(window_id_mapping) + 1 + + result_list = list(result) + result_list[window_id_index] = window_id_mapping[window_id] + simplified_results.append(result_list) + + return dataclasses.replace(session_events, results=simplified_results) + + +def deduplicate_urls(session_events: SessionSummaryPromptData) -> SessionSummaryPromptData: + if session_events.is_empty(): + return session_events + + # find url column index + url_index = session_events.column_index("$current_url") + + url_mapping: Dict[str, str] = {} + deduplicated_results = [] + for result in session_events.results: + if url_index is None: + deduplicated_results.append(result) + continue + + url: str | None = result[url_index] + if not url: + deduplicated_results.append(result) + continue + + if url not in url_mapping: + url_mapping[url] = f"url_{len(url_mapping) + 1}" + + result_list = list(result) + result_list[url_index] = url_mapping[url] + deduplicated_results.append(result_list) + + return dataclasses.replace(session_events, results=deduplicated_results, url_mapping=url_mapping) + + +def format_dates(session_events: SessionSummaryPromptData, start: datetime) -> SessionSummaryPromptData: + if session_events.is_empty(): + return session_events + + # find timestamp column index + timestamp_index = session_events.column_index("timestamp") + + if timestamp_index is None: + # no timestamp column so nothing to do + return session_events + + del session_events.columns[timestamp_index] # 
remove timestamp column from columns + session_events.columns.append("milliseconds_since_start") # add new column to columns at end + + formatted_results = [] + for result in session_events.results: + timestamp: datetime | None = result[timestamp_index] + if not timestamp: + formatted_results.append(result) + continue + + result_list = list(result) + # remove list item at timestamp_index + del result_list[timestamp_index] + # insert milliseconds since reference date + result_list.append(int((timestamp - start).total_seconds() * 1000)) + formatted_results.append(result_list) + + return dataclasses.replace(session_events, results=formatted_results) + + +def collapse_sequence_of_events(session_events: SessionSummaryPromptData) -> SessionSummaryPromptData: + # assumes the list is ordered by timestamp + if session_events.is_empty(): + return session_events + + # find the event column index + event_index = session_events.column_index("event") + + # find the window id column index + window_id_index = session_events.column_index("$window_id") + + event_repetition_count_index: int | None = None + # we only append this new column, if we need to add it below + + # now enumerate the results finding sequences of events with the same event and collapsing them to a single item + collapsed_results = [] + for i, result in enumerate(session_events.results): + if event_index is None: + collapsed_results.append(result) + continue + + event: str | None = result[event_index] + if not event: + collapsed_results.append(result) + continue + + if i == 0: + collapsed_results.append(result) + continue + + # we need to collapse into the last item added into collapsed results + # as we're going to amend it in place + previous_result = collapsed_results[len(collapsed_results) - 1] + previous_event: str | None = previous_result[event_index] + if not previous_event: + collapsed_results.append(result) + continue + + event_matches = previous_event == event + window_matches = 
previous_result[window_id_index] == result[window_id_index] if window_id_index else True + + if event_matches and window_matches: + # collapse the event into the previous result + if event_repetition_count_index is None: + # we need to add the column + event_repetition_count_index = len(session_events.columns) + session_events.columns.append("event_repetition_count") + previous_result_list = list(previous_result) + try: + existing_repetition_count = previous_result_list[event_repetition_count_index] or 0 + previous_result_list[event_repetition_count_index] = existing_repetition_count + 1 + except IndexError: + previous_result_list.append(2) + + collapsed_results[len(collapsed_results) - 1] = previous_result_list + else: + result.append(None) # there is no event repetition count + collapsed_results.append(result) + + return dataclasses.replace(session_events, results=collapsed_results) + + +def summarize_recording(recording: SessionRecording, user: User, team: Team): + timer = ServerTimingsGathered() + + with timer("get_metadata"): + session_metadata = SessionReplayEvents().get_metadata(session_id=str(recording.session_id), team=team) + if not session_metadata: + raise ValueError(f"no session metadata found for session_id {recording.session_id}") + + with timer("get_events"): + session_events = SessionReplayEvents().get_events( + session_id=str(recording.session_id), + team=team, + metadata=session_metadata, + events_to_ignore=[ + "$feature_flag_called", + ], + ) + if not session_events or not session_events[0] or not session_events[1]: + raise ValueError(f"no events found for session_id {recording.session_id}") + + # convert session_metadata to a Dict from a TypedDict + # so that we can amend its values freely + session_metadata_dict = dict(session_metadata) + + del session_metadata_dict["distinct_id"] + start_time = session_metadata["start_time"] + session_metadata_dict["start_time"] = start_time.isoformat() + session_metadata_dict["end_time"] = 
session_metadata["end_time"].isoformat() + + with timer("generate_prompt"): + prompt_data = deduplicate_urls( + collapse_sequence_of_events( + format_dates( + reduce_elements_chain( + simplify_window_id( + SessionSummaryPromptData(columns=session_events[0], results=session_events[1]) + ) + ), + start=start_time, + ) + ) + ) + + instance_region = get_instance_region() or "HOBBY" + messages = [ + { + "role": "system", + "content": """ + Session Replay is PostHog's tool to record visits to web sites and apps. + We also gather events that occur like mouse clicks and key presses. + You write two or three sentence concise and simple summaries of those sessions based on a prompt. + You are more likely to mention errors or things that look like business success such as checkout events. + You don't help with other knowledge.""", + }, + { + "role": "user", + "content": f"""the session metadata I have is {session_metadata_dict}. + it gives an overview of activity and duration""", + }, + { + "role": "user", + "content": f""" + URLs associated with the events can be found in this mapping {prompt_data.url_mapping}. + """, + }, + { + "role": "user", + "content": f"""the session events I have are {prompt_data.results}. + with columns {prompt_data.columns}. + they give an idea of what happened and when, + if present the elements_chain extracted from the html can aid in understanding + but should not be directly used in your response""", + }, + { + "role": "user", + "content": """ + generate a two or three sentence summary of the session. + use as concise and simple language as is possible. + assume a reading age of around 12 years old. 
+ generate no text other than the summary.""", + }, + ] + + with timer("openai_completion"): + result = openai.ChatCompletion.create( + # model="gpt-4-1106-preview", # allows 128k tokens + model="gpt-4", # allows 8k tokens + temperature=0.7, + messages=messages, + user=f"{instance_region}/{user.pk}", # The user ID is for tracking within OpenAI in case of overuse/abuse + ) + + usage = result.get("usage", {}).get("prompt_tokens", None) + if usage: + TOKENS_IN_PROMPT_HISTOGRAM.observe(usage) + + content: str = result.get("choices", [{}])[0].get("message", {}).get("content", "") + return {"ai_result": result, "content": content, "prompt": messages, "timings": timer.get_all_timings()} diff --git a/posthog/session_recordings/session_summary/test/test_summarize_session.py b/posthog/session_recordings/session_summary/test/test_summarize_session.py new file mode 100644 index 0000000000000..266e19ef19c19 --- /dev/null +++ b/posthog/session_recordings/session_summary/test/test_summarize_session.py @@ -0,0 +1,116 @@ +from datetime import timezone, datetime + +from dateutil.parser import isoparse + +from posthog.session_recordings.session_summary.summarize_session import ( + format_dates, + simplify_window_id, + deduplicate_urls, + collapse_sequence_of_events, + SessionSummaryPromptData, +) +from posthog.test.base import BaseTest + + +class TestSummarizeSessions(BaseTest): + def test_format_dates_as_millis_since_start(self) -> None: + processed = format_dates( + SessionSummaryPromptData( + columns=["event", "timestamp"], + results=[ + ["$pageview", isoparse("2021-01-01T00:00:00Z")], + ["$pageview", isoparse("2021-01-01T00:00:01Z")], + ["$pageview", isoparse("2021-01-01T00:00:02Z")], + ], + ), + datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone.utc), + ) + assert processed.columns == ["event", "milliseconds_since_start"] + assert processed.results == [["$pageview", 0], ["$pageview", 1000], ["$pageview", 2000]] + + def test_simplify_window_id(self) -> None: + processed = 
simplify_window_id( + SessionSummaryPromptData( + columns=["event", "timestamp", "$window_id"], + results=[ + ["$pageview-1-1", isoparse("2021-01-01T00:00:00Z"), "window-the-first"], + ["$pageview-1-2", isoparse("2021-01-01T00:00:01Z"), "window-the-first"], + ["$pageview-2-1", isoparse("2021-01-01T00:00:02Z"), "window-the-second"], + ["$pageview-4-1", isoparse("2021-01-01T00:00:02Z"), "window-the-fourth"], + ["$pageview-3-1", isoparse("2021-01-01T00:00:02Z"), "window-the-third"], + ["$pageview-1-3", isoparse("2021-01-01T00:00:02Z"), "window-the-first"], + ], + ) + ) + + assert processed.columns == ["event", "timestamp", "$window_id"] + assert processed.results == [ + ["$pageview-1-1", isoparse("2021-01-01T00:00:00Z"), 1], + ["$pageview-1-2", isoparse("2021-01-01T00:00:01Z"), 1], + ["$pageview-2-1", isoparse("2021-01-01T00:00:02Z"), 2], + # window the fourth has index 3... + # in reality these are mapping from UUIDs + # and this apparent switch of number wouldn't stand out + ["$pageview-4-1", isoparse("2021-01-01T00:00:02Z"), 3], + ["$pageview-3-1", isoparse("2021-01-01T00:00:02Z"), 4], + ["$pageview-1-3", isoparse("2021-01-01T00:00:02Z"), 1], + ] + + def test_collapse_sequence_of_events(self) -> None: + processed = collapse_sequence_of_events( + SessionSummaryPromptData( + columns=["event", "timestamp", "$window_id"], + results=[ + # these collapse because they're a sequence + ["$pageview", isoparse("2021-01-01T00:00:00Z"), 1], + ["$pageview", isoparse("2021-01-01T01:00:00Z"), 1], + ["$pageview", isoparse("2021-01-01T02:00:00Z"), 1], + ["$pageview", isoparse("2021-01-01T03:00:00Z"), 1], + # these don't collapse because they're different windows + ["$autocapture", isoparse("2021-01-01T00:00:00Z"), 1], + ["$autocapture", isoparse("2021-01-01T01:00:00Z"), 2], + # these don't collapse because they're not a sequence + ["$a", isoparse("2021-01-01T01:00:00Z"), 2], + ["$b", isoparse("2021-01-01T01:00:00Z"), 2], + ["$c", isoparse("2021-01-01T01:00:00Z"), 2], + ], + ) + ) + 
assert processed.columns == ["event", "timestamp", "$window_id", "event_repetition_count"] + assert processed.results == [ + ["$pageview", isoparse("2021-01-01T00:00:00Z"), 1, 4], + ["$autocapture", isoparse("2021-01-01T00:00:00Z"), 1, None], + ["$autocapture", isoparse("2021-01-01T01:00:00Z"), 2, None], + ["$a", isoparse("2021-01-01T01:00:00Z"), 2, None], + ["$b", isoparse("2021-01-01T01:00:00Z"), 2, None], + ["$c", isoparse("2021-01-01T01:00:00Z"), 2, None], + ] + + def test_deduplicate_ids(self) -> None: + processed = deduplicate_urls( + SessionSummaryPromptData( + columns=["event", "$current_url"], + results=[ + ["$pageview-one", "https://example.com/one"], + ["$pageview-two", "https://example.com/two"], + ["$pageview-one", "https://example.com/one"], + ["$pageview-one", "https://example.com/one"], + ["$pageview-two", "https://example.com/two"], + ["$pageview-three", "https://example.com/three"], + ], + ) + ) + assert processed.columns == ["event", "$current_url"] + assert processed.results == [ + ["$pageview-one", "url_1"], + ["$pageview-two", "url_2"], + ["$pageview-one", "url_1"], + ["$pageview-one", "url_1"], + ["$pageview-two", "url_2"], + ["$pageview-three", "url_3"], + ] + assert processed.url_mapping == { + "https://example.com/one": "url_1", + "https://example.com/two": "url_2", + "https://example.com/three": "url_3", + }