From dd5be88f1add7b1ff5c9e65745ad9f64caa3e4bf Mon Sep 17 00:00:00 2001
From: James Greenhill
Date: Sat, 18 Nov 2023 15:37:08 -0800
Subject: [PATCH] chore: Use system.parts table to get count of rows. much
 cheaper (#18730)

* chore: Use system.parts table to get count of rows. much cheaper

* clean up another query that was doing full table scans

* is_active -> active

* added a projection to make things faster

* look at rows for sharded tables
---
 posthog/celery.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/posthog/celery.py b/posthog/celery.py
index 46b1e3b402f71..374c90db7e4ec 100644
--- a/posthog/celery.py
+++ b/posthog/celery.py
@@ -515,10 +515,10 @@ def pg_row_count():
 
 
 CLICKHOUSE_TABLES = [
-    "events",
+    "sharded_events",
     "person",
     "person_distinct_id2",
-    "session_replay_events",
+    "sharded_session_replay_events",
     "log_entries",
 ]
 if not is_cloud():
@@ -540,9 +540,8 @@ def clickhouse_lag():
     )
     for table in CLICKHOUSE_TABLES:
         try:
-            QUERY = (
-                """select max(_timestamp) observed_ts, now() now_ts, now() - max(_timestamp) as lag from {table};"""
-            )
+            QUERY = """SELECT max(_timestamp) observed_ts, now() now_ts, now() - max(_timestamp) as lag
+                FROM {table}"""
             query = QUERY.format(table=table)
             lag = sync_execute(query)[0][2]
             statsd.gauge(
@@ -688,9 +687,8 @@ def clickhouse_row_count():
     )
     for table in CLICKHOUSE_TABLES:
         try:
-            QUERY = (
-                """select count(1) freq from {table} where _timestamp >= toStartOfDay(date_sub(DAY, 2, now()));"""
-            )
+            QUERY = """SELECT sum(rows) rows from system.parts
+                WHERE table = '{table}' and active;"""
             query = QUERY.format(table=table)
             rows = sync_execute(query)[0][0]
             row_count_gauge.labels(table_name=table).set(rows)
@@ -745,10 +743,11 @@ def clickhouse_part_count():
     from posthog.client import sync_execute
 
     QUERY = """
-        select table, count(1) freq
-        from system.parts
-        group by table
-        order by freq desc;
+        SELECT table, count(1) freq
+        FROM system.parts
+        WHERE active
+        GROUP BY table
+        ORDER BY freq DESC;
     """
     rows = sync_execute(QUERY)