chore(dev): Allow generating demo data for existing projects (#17913)

Twixes authored Oct 11, 2023
1 parent 8bc9602 commit 1a0ec16
Showing 4 changed files with 155 additions and 106 deletions.
31 changes: 23 additions & 8 deletions posthog/demo/matrix/manager.py
@@ -5,7 +5,7 @@

from django.conf import settings
from django.core import exceptions
from django.db import transaction
from django.db import transaction, IntegrityError

from posthog.client import query_with_columns, sync_execute
from posthog.models import (
@@ -69,8 +69,6 @@ def ensure_account_and_save(
organization, email, password, first_name, OrganizationMembership.Level.ADMIN, is_staff=is_staff
)
team = self.create_team(organization)
if self.print_steps:
print(f"Saving simulated data...")
self.run_on_team(team, new_user)
return (organization, team, new_user)
elif existing_user.is_staff:
@@ -106,6 +104,8 @@ def create_team(organization: Organization, **kwargs) -> Team:
return team

def run_on_team(self, team: Team, user: User):
if self.print_steps:
print(f"Saving simulated data...")
does_clickhouse_data_need_saving = True
if self.use_pre_save:
does_clickhouse_data_need_saving = not self._is_demo_data_pre_saved()
@@ -127,15 +127,21 @@ def run_on_team(self, team: Team, user: User):
def _save_analytics_data(self, data_team: Team):
sim_persons = self.matrix.people
bulk_group_type_mappings = []
if len(self.matrix.groups.keys()) + self.matrix.group_type_index_offset > 5:
raise ValueError("Too many group types! The maximum for a project is 5.")
for group_type_index, (group_type, groups) in enumerate(self.matrix.groups.items()):
group_type_index += self.matrix.group_type_index_offset # Adjust
bulk_group_type_mappings.append(
GroupTypeMapping(team=data_team, group_type_index=group_type_index, group_type=group_type)
)
for group_key, group in groups.items():
self._save_sim_group(
data_team, cast(Literal[0, 1, 2, 3, 4], group_type_index), group_key, group, self.matrix.now
)
GroupTypeMapping.objects.bulk_create(bulk_group_type_mappings)
try:
GroupTypeMapping.objects.bulk_create(bulk_group_type_mappings)
except IntegrityError as e:
print(f"SKIPPING GROUP TYPE MAPPING CREATION: {e}")
for sim_person in sim_persons:
self._save_sim_person(data_team, sim_person)
# We need to wait a bit for data just queued into Kafka to show up in CH
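A note on the pattern above: the try/except around bulk_create is what lets generation re-run against a project that already has demo data. On a second run, the GroupTypeMapping rows created by the first run trip the table's uniqueness constraints, and the commit chooses to skip the batch rather than abort. A minimal sketch of the pattern, using a generic helper that is illustrative rather than part of this commit:

from django.db import IntegrityError

def bulk_create_tolerantly(model, rows, label):
    # If any row collides with data left behind by a previous run
    # (a unique-constraint violation), skip the whole batch instead
    # of aborting generation.
    try:
        model.objects.bulk_create(rows)
    except IntegrityError as e:
        print(f"SKIPPING {label} CREATION: {e}")

The trade-off is all-or-nothing: bulk_create issues a single statement, so one colliding row discards the entire batch. That is acceptable here, since a collision implies the batch was already saved by an earlier run.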
@@ -202,6 +208,7 @@ def _sync_postgres_with_clickhouse_data(cls, source_team_id: int, target_team_id
# This sets the pk in the bulk_persons dict so we can use them later
Person.objects.bulk_create(bulk_persons.values())
# Person distinct IDs
pre_existing_id_count = PersonDistinctId.objects.filter(team_id=target_team_id).count()
clickhouse_distinct_ids = query_with_columns(
SELECT_PERSON_DISTINCT_ID2S_OF_TEAM,
list_params,
@@ -211,17 +218,25 @@
bulk_person_distinct_ids = []
for row in clickhouse_distinct_ids:
person_uuid = row.pop("person_uuid")
bulk_person_distinct_ids.append(
PersonDistinctId(team_id=target_team_id, person_id=bulk_persons[person_uuid].pk, **row)
)
try:
bulk_person_distinct_ids.append(
PersonDistinctId(team_id=target_team_id, person_id=bulk_persons[person_uuid].pk, **row)
)
except KeyError:
pre_existing_id_count -= 1
if pre_existing_id_count > 0:
print(f"{pre_existing_id_count} IDS UNACCOUNTED FOR")
PersonDistinctId.objects.bulk_create(bulk_person_distinct_ids, ignore_conflicts=True)
# Groups
clickhouse_groups = query_with_columns(SELECT_GROUPS_OF_TEAM, list_params, ["team_id", "_timestamp", "_offset"])
bulk_groups = []
for row in clickhouse_groups:
group_properties = json.loads(row.pop("group_properties", "{}"))
bulk_groups.append(Group(team_id=target_team_id, version=0, group_properties=group_properties, **row))
Group.objects.bulk_create(bulk_groups)
try:
Group.objects.bulk_create(bulk_groups)
except IntegrityError as e:
print(f"SKIPPING GROUP CREATION: {e}")

def _save_sim_person(self, team: Team, subject: SimPerson):
# We only want to save directly if there are past events
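The distinct-ID bookkeeping in _sync_postgres_with_clickhouse_data deserves a gloss. On a re-run, ClickHouse already holds rows for people that exist in Postgres, so bulk_persons (keyed by the UUIDs created in this run) will not contain every person_uuid the query returns. Each KeyError is expected to correspond to one pre-existing Postgres row, and the counter verifies that assumption. A simplified model of the accounting, with plain dicts standing in for the ORM (names are illustrative, not from the commit):

def account_for_distinct_ids(ch_rows, new_persons_by_uuid, pre_existing_count):
    # ch_rows: distinct-ID rows read back from ClickHouse.
    # new_persons_by_uuid: persons created by this run, keyed by UUID.
    # pre_existing_count: PersonDistinctId rows already in Postgres.
    to_create = []
    for row in ch_rows:
        if row["person_uuid"] in new_persons_by_uuid:
            to_create.append(row)  # belongs to a person from this run
        else:
            pre_existing_count -= 1  # presumed saved by an earlier run
    if pre_existing_count > 0:
        # More pre-existing rows than ClickHouse misses: data is inconsistent.
        print(f"{pre_existing_count} IDS UNACCOUNTED FOR")
    return to_create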
5 changes: 4 additions & 1 deletion posthog/demo/matrix/matrix.py
@@ -199,6 +199,7 @@ class Matrix(ABC):
start: dt.datetime
now: dt.datetime
end: dt.datetime
group_type_index_offset: int
# A mapping of groups. The first key is the group type, the second key is the group key.
groups: DefaultDict[str, DefaultDict[str, Dict[str, Any]]]
distinct_id_to_person: Dict[str, SimPerson]
@@ -224,12 +225,14 @@ def __init__(
days_past: int = 180,
days_future: int = 30,
n_clusters: int = settings.DEMO_MATRIX_N_CLUSTERS,
group_type_index_offset: int = 0,
):
if now is None:
now = timezone.now()
self.now = now
self.start = (now - dt.timedelta(days=days_past)).replace(hour=0, minute=0, second=0, microsecond=0)
self.end = (now + dt.timedelta(days=days_future)).replace(hour=0, minute=0, second=0, microsecond=0)
self.group_type_index_offset = group_type_index_offset
# We initialize random data providers here and pass it down as a performance measure
# Provider initialization is a bit intensive, as it loads some JSON data,
# so doing it at cluster or person level could be overly taxing
@@ -272,6 +275,6 @@ def _update_group(self, group_type: str, group_key: str, set_properties: Dict[st

def _get_group_type_index(self, group_type: str) -> Optional[int]:
try:
return list(self.groups.keys()).index(group_type)
return list(self.groups.keys()).index(group_type) + self.group_type_index_offset
except ValueError:
return None
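The offset exists because group type indices are project-global and capped at five, so demo group types have to be appended after whatever the project already defines. A worked example, assuming two pre-existing group types (the names below are illustrative):

existing_group_type_count = 2       # already mapped in the target project
offset = existing_group_type_count  # passed as group_type_index_offset

demo_group_types = ["organization", "instance"]

for local_index, group_type in enumerate(demo_group_types):
    # _get_group_type_index returns the local position plus the offset:
    print(group_type, "->", local_index + offset)  # organization -> 2, instance -> 3

# The cap check from _save_analytics_data, in the same terms (2 + 2 <= 5, so fine):
if len(demo_group_types) + offset > 5:
    raise ValueError("Too many group types! The maximum for a project is 5.")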
190 changes: 99 additions & 91 deletions posthog/demo/products/hedgebox/matrix.py
@@ -2,6 +2,8 @@
from dataclasses import dataclass
from typing import Optional

from django.db import IntegrityError

from posthog.constants import INSIGHT_TRENDS, PAGEVIEW_EVENT, RETENTION_FIRST_TIME, TRENDS_LINEAR, TRENDS_WORLD_MAP
from posthog.demo.matrix.matrix import Cluster, Matrix
from posthog.demo.matrix.randomization import Industry
@@ -553,101 +555,107 @@ def set_project_up(self, team, user):
)

# InsightViewed
InsightViewed.objects.bulk_create(
(
InsightViewed(
team=team,
user=user,
insight=insight,
last_viewed_at=(
self.now - dt.timedelta(days=self.random.randint(0, 3), minutes=self.random.randint(5, 60))
),
)
for insight in Insight.objects.filter(team=team)
try:
InsightViewed.objects.bulk_create(
(
InsightViewed(
team=team,
user=user,
insight=insight,
last_viewed_at=(
self.now - dt.timedelta(days=self.random.randint(0, 3), minutes=self.random.randint(5, 60))
),
)
for insight in Insight.objects.filter(team=team)
),
)
)
except IntegrityError:
pass # This can happen if demo data generation is re-run for the same project

# Feature flags
new_signup_page_flag = FeatureFlag.objects.create(
team=team,
key=FILE_PREVIEWS_FLAG_KEY,
name="File previews (ticket #2137). Work-in-progress, so only visible internally at the moment",
filters={
"groups": [
{
"properties": [
{
"key": "email",
"type": "person",
"value": [
"[email protected]",
"[email protected]",
"[email protected]",
"[email protected]",
],
"operator": "exact",
}
]
}
]
},
created_by=user,
created_at=self.now - dt.timedelta(days=15),
)
try:
new_signup_page_flag = FeatureFlag.objects.create(
team=team,
key=FILE_PREVIEWS_FLAG_KEY,
name="File previews (ticket #2137). Work-in-progress, so only visible internally at the moment",
filters={
"groups": [
{
"properties": [
{
"key": "email",
"type": "person",
"value": [
"[email protected]",
"[email protected]",
"[email protected]",
"[email protected]",
],
"operator": "exact",
}
]
}
]
},
created_by=user,
created_at=self.now - dt.timedelta(days=15),
)

# Experiments
new_signup_page_flag = FeatureFlag.objects.create(
team=team,
key=NEW_SIGNUP_PAGE_FLAG_KEY,
name="New sign-up flow",
filters={
"groups": [{"properties": [], "rollout_percentage": None}],
"multivariate": {
"variants": [
# Experiments
new_signup_page_flag = FeatureFlag.objects.create(
team=team,
key=NEW_SIGNUP_PAGE_FLAG_KEY,
name="New sign-up flow",
filters={
"groups": [{"properties": [], "rollout_percentage": None}],
"multivariate": {
"variants": [
{"key": "control", "rollout_percentage": 100 - NEW_SIGNUP_PAGE_FLAG_ROLLOUT_PERCENT},
{"key": "test", "rollout_percentage": NEW_SIGNUP_PAGE_FLAG_ROLLOUT_PERCENT},
]
},
},
created_by=user,
created_at=self.new_signup_page_experiment_start - dt.timedelta(hours=1),
)
Experiment.objects.create(
team=team,
name="New sign-up flow",
description="We've rebuilt our sign-up page to offer a more personalized experience. Let's see if this version performs better with potential users.",
feature_flag=new_signup_page_flag,
created_by=user,
filters={
"events": [
{
"id": "$pageview",
"name": "$pageview",
"type": "events",
"order": 0,
"properties": [
{"key": "$current_url", "type": "event", "value": URL_SIGNUP, "operator": "exact"}
],
},
{"id": "signed_up", "name": "signed_up", "type": "events", "order": 1},
],
"actions": [],
"display": "FunnelViz",
"insight": "FUNNELS",
"interval": "day",
"funnel_viz_type": "steps",
"filter_test_accounts": True,
},
parameters={
"feature_flag_variants": [
{"key": "control", "rollout_percentage": 100 - NEW_SIGNUP_PAGE_FLAG_ROLLOUT_PERCENT},
{"key": "test", "rollout_percentage": NEW_SIGNUP_PAGE_FLAG_ROLLOUT_PERCENT},
]
],
"recommended_sample_size": int(len(self.clusters) * 0.274),
"recommended_running_time": None,
"minimum_detectable_effect": 1,
},
},
created_by=user,
created_at=self.new_signup_page_experiment_start - dt.timedelta(hours=1),
)
Experiment.objects.create(
team=team,
name="New sign-up flow",
description="We've rebuilt our sign-up page to offer a more personalized experience. Let's see if this version performs better with potential users.",
feature_flag=new_signup_page_flag,
created_by=user,
filters={
"events": [
{
"id": "$pageview",
"name": "$pageview",
"type": "events",
"order": 0,
"properties": [
{"key": "$current_url", "type": "event", "value": URL_SIGNUP, "operator": "exact"}
],
},
{"id": "signed_up", "name": "signed_up", "type": "events", "order": 1},
],
"actions": [],
"display": "FunnelViz",
"insight": "FUNNELS",
"interval": "day",
"funnel_viz_type": "steps",
"filter_test_accounts": True,
},
parameters={
"feature_flag_variants": [
{"key": "control", "rollout_percentage": 100 - NEW_SIGNUP_PAGE_FLAG_ROLLOUT_PERCENT},
{"key": "test", "rollout_percentage": NEW_SIGNUP_PAGE_FLAG_ROLLOUT_PERCENT},
],
"recommended_sample_size": int(len(self.clusters) * 0.274),
"recommended_running_time": None,
"minimum_detectable_effect": 1,
},
start_date=self.new_signup_page_experiment_start,
end_date=self.new_signup_page_experiment_end,
created_at=new_signup_page_flag.created_at,
)
start_date=self.new_signup_page_experiment_start,
end_date=self.new_signup_page_experiment_end,
created_at=new_signup_page_flag.created_at,
)
except IntegrityError:
pass # This can happen if demo data generation is re-run for the same project
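Taken together, re-running generation for an existing project would presumably look something like the sketch below: compute the offset from the mappings already present, build the matrix with it, and point the manager at the existing team. Only n_clusters, group_type_index_offset, and run_on_team are visible in this diff; the object lookups, the HedgeboxMatrix class name, and the MatrixManager constructor arguments are assumptions based on the file paths shown above:

from posthog.demo.matrix.manager import MatrixManager
from posthog.demo.products.hedgebox.matrix import HedgeboxMatrix
from posthog.models import GroupTypeMapping, Team, User

team = Team.objects.get(pk=1)                       # illustrative existing project
user = User.objects.get(email="admin@example.com")  # illustrative user

# Shift demo group types past the ones the project already has.
offset = GroupTypeMapping.objects.filter(team=team).count()

matrix = HedgeboxMatrix(n_clusters=10, group_type_index_offset=offset)
manager = MatrixManager(matrix)  # further options not shown by this diff
manager.run_on_team(team, user)  # signature shown in manager.py above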