From 0cef06ebe39fa3198f6dd6667771b1f21a224abd Mon Sep 17 00:00:00 2001 From: James Greenhill Date: Fri, 17 Nov 2023 11:44:12 -0800 Subject: [PATCH 1/5] chore: Use system.parts table to get count of rows. much cheaper --- posthog/celery.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/posthog/celery.py b/posthog/celery.py index 46b1e3b402f71..8d1f5c7d5f1f8 100644 --- a/posthog/celery.py +++ b/posthog/celery.py @@ -688,9 +688,8 @@ def clickhouse_row_count(): ) for table in CLICKHOUSE_TABLES: try: - QUERY = ( - """select count(1) freq from {table} where _timestamp >= toStartOfDay(date_sub(DAY, 2, now()));""" - ) + QUERY = """SELECT sum(rows) rows from system.parts + WHERE table = '{table}' and is_active;""" query = QUERY.format(table=table) rows = sync_execute(query)[0][0] row_count_gauge.labels(table_name=table).set(rows) From 3051a5918f99c2a2f2e89ba1d7824b12a41961c5 Mon Sep 17 00:00:00 2001 From: James Greenhill Date: Fri, 17 Nov 2023 11:51:09 -0800 Subject: [PATCH 2/5] clean up another query that was doing full table scans --- posthog/celery.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/posthog/celery.py b/posthog/celery.py index 8d1f5c7d5f1f8..bb318860e2e8a 100644 --- a/posthog/celery.py +++ b/posthog/celery.py @@ -540,9 +540,9 @@ def clickhouse_lag(): ) for table in CLICKHOUSE_TABLES: try: - QUERY = ( - """select max(_timestamp) observed_ts, now() now_ts, now() - max(_timestamp) as lag from {table};""" - ) + QUERY = """SELECT max(_timestamp) observed_ts, now() now_ts, now() - max(_timestamp) as lag + FROM {table} + WHERE timestamp >= now() - toIntervalDay(3);""" query = QUERY.format(table=table) lag = sync_execute(query)[0][2] statsd.gauge( @@ -744,10 +744,10 @@ def clickhouse_part_count(): from posthog.client import sync_execute QUERY = """ - select table, count(1) freq - from system.parts - group by table - order by freq desc; + SELECT table, count(1) freq + FROM system.parts + GROUP BY table + ORDER BY freq DESC; """ rows = sync_execute(QUERY) From 9688b96fe26ca0a072d02573c3d65faf8b224f82 Mon Sep 17 00:00:00 2001 From: James Greenhill Date: Fri, 17 Nov 2023 14:28:40 -0800 Subject: [PATCH 3/5] is_active -> active --- posthog/celery.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/posthog/celery.py b/posthog/celery.py index bb318860e2e8a..760bdaadb7b25 100644 --- a/posthog/celery.py +++ b/posthog/celery.py @@ -689,7 +689,7 @@ def clickhouse_row_count(): for table in CLICKHOUSE_TABLES: try: QUERY = """SELECT sum(rows) rows from system.parts - WHERE table = '{table}' and is_active;""" + WHERE table = '{table}' and active;""" query = QUERY.format(table=table) rows = sync_execute(query)[0][0] row_count_gauge.labels(table_name=table).set(rows) @@ -746,6 +746,7 @@ def clickhouse_part_count(): QUERY = """ SELECT table, count(1) freq FROM system.parts + WHERE active GROUP BY table ORDER BY freq DESC; """ From 1314db27da8ed2e0061608f945fd9e248f6bffeb Mon Sep 17 00:00:00 2001 From: James Greenhill Date: Fri, 17 Nov 2023 15:33:21 -0800 Subject: [PATCH 4/5] added a projection to make things faster --- posthog/celery.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/posthog/celery.py b/posthog/celery.py index 760bdaadb7b25..2d323beb4703b 100644 --- a/posthog/celery.py +++ b/posthog/celery.py @@ -541,8 +541,7 @@ def clickhouse_lag(): for table in CLICKHOUSE_TABLES: try: QUERY = """SELECT max(_timestamp) observed_ts, now() now_ts, now() - max(_timestamp) as lag - FROM {table} - WHERE timestamp >= now() - toIntervalDay(3);""" + FROM {table}""" query = QUERY.format(table=table) lag = sync_execute(query)[0][2] statsd.gauge( From 4866963172e4166fba75db594fd20e70c35a3899 Mon Sep 17 00:00:00 2001 From: James Greenhill Date: Sat, 18 Nov 2023 15:22:07 -0800 Subject: [PATCH 5/5] look at rows for sharded tables --- posthog/celery.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/posthog/celery.py b/posthog/celery.py index 2d323beb4703b..374c90db7e4ec 100644 --- a/posthog/celery.py +++ b/posthog/celery.py @@ -515,10 +515,10 @@ def pg_row_count(): CLICKHOUSE_TABLES = [ - "events", + "sharded_events", "person", "person_distinct_id2", - "session_replay_events", + "sharded_session_replay_events", "log_entries", ] if not is_cloud():