Skip to content

Commit

Permalink
ridiculously large cohort query limit
Browse files Browse the repository at this point in the history
  • Loading branch information
mariusandra committed Dec 5, 2023
1 parent 5916346 commit 838de65
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 35 deletions.
28 changes: 28 additions & 0 deletions posthog/hogql/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from uuid import UUID
from pydantic import ConfigDict, BaseModel

from posthog.hogql.errors import HogQLException

ConstantDataType: TypeAlias = Literal[
"int",
"float",
Expand Down Expand Up @@ -31,11 +33,37 @@
# Default number of rows returned when a regular query specifies no LIMIT.
DEFAULT_RETURNED_ROWS = 100
# Max limit for all SELECT queries, and the default for CSV exports.
MAX_SELECT_RETURNED_ROWS = 10_000  # sync with CSV_EXPORT_LIMIT
# Max limit for all cohort calculations
MAX_SELECT_COHORT_CALCULATION_LIMIT = 1_000_000_000  # 1b persons


class LimitContext(Enum):
    """The situation a query runs in; it selects which row limits apply."""

    QUERY = "query"
    EXPORT = "export"
    COHORT_CALCULATION = "cohort_calculation"


def get_max_limit_for_context(limit_context: "LimitContext | None") -> int:
    """Return the largest LIMIT a query may use in the given context.

    ``None`` is treated as ``LimitContext.QUERY`` so that callers holding an
    optional limit context do not have to resolve it themselves.

    Raises:
        HogQLException: if ``limit_context`` is neither ``None`` nor a known
            ``LimitContext`` member.
    """
    if limit_context is None:
        limit_context = LimitContext.QUERY

    # The constants are looked up at call time (not cached in a table built at
    # import time), so a changed module-level value is picked up immediately.
    if limit_context in (LimitContext.EXPORT, LimitContext.QUERY):
        return MAX_SELECT_RETURNED_ROWS  # 10k
    if limit_context == LimitContext.COHORT_CALCULATION:
        return MAX_SELECT_COHORT_CALCULATION_LIMIT  # 1b
    raise HogQLException(f"Unexpected LimitContext value: {limit_context}")


def get_default_limit_for_context(limit_context: "LimitContext | None") -> int:
    """Limit used if no limit is provided.

    ``None`` is treated as ``LimitContext.QUERY`` so that callers holding an
    optional limit context do not have to resolve it themselves.

    Raises:
        HogQLException: if ``limit_context`` is neither ``None`` nor a known
            ``LimitContext`` member.
    """
    if limit_context is None:
        limit_context = LimitContext.QUERY

    if limit_context == LimitContext.EXPORT:
        return MAX_SELECT_RETURNED_ROWS  # 10k
    if limit_context == LimitContext.QUERY:
        return DEFAULT_RETURNED_ROWS  # 100
    if limit_context == LimitContext.COHORT_CALCULATION:
        return MAX_SELECT_COHORT_CALCULATION_LIMIT  # 1b
    raise HogQLException(f"Unexpected LimitContext value: {limit_context}")


# Settings applied at the SELECT level
Expand Down
16 changes: 4 additions & 12 deletions posthog/hogql/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from posthog.clickhouse.client.connection import Workload
from posthog.errors import ExposedCHQueryError
from posthog.hogql import ast
from posthog.hogql.constants import HogQLGlobalSettings, LimitContext
from posthog.hogql.constants import HogQLGlobalSettings, LimitContext, get_default_limit_for_context
from posthog.hogql.errors import HogQLException
from posthog.hogql.hogql import HogQLContext
from posthog.hogql.modifiers import create_default_modifiers_for_team
Expand Down Expand Up @@ -34,7 +34,7 @@ def execute_hogql_query(
workload: Workload = Workload.ONLINE,
settings: Optional[HogQLGlobalSettings] = None,
modifiers: Optional[HogQLQueryModifiers] = None,
limit_context: Optional[LimitContext] = None,
limit_context: Optional[LimitContext] = LimitContext.QUERY,
timings: Optional[HogQLTimings] = None,
explain: Optional[bool] = False,
) -> HogQLQueryResponse:
Expand Down Expand Up @@ -68,20 +68,12 @@ def execute_hogql_query(
select_query = replace_placeholders(select_query, placeholders)

with timings.measure("max_limit"):
from posthog.hogql.constants import (
DEFAULT_RETURNED_ROWS,
MAX_SELECT_RETURNED_ROWS,
)

select_queries = (
select_query.select_queries if isinstance(select_query, ast.SelectUnionQuery) else [select_query]
)
for one_query in select_queries:
if one_query.limit is None:
# One more "max" of MAX_SELECT_RETURNED_ROWS (10k) is applied in the query printer.
one_query.limit = ast.Constant(
value=MAX_SELECT_RETURNED_ROWS if limit_context == LimitContext.EXPORT else DEFAULT_RETURNED_ROWS
)
one_query.limit = ast.Constant(value=get_default_limit_for_context(limit_context))

# Get printed HogQL query, and returned columns. Using a cloned query.
with timings.measure("hogql"):
Expand Down Expand Up @@ -122,7 +114,7 @@ def execute_hogql_query(
)

settings = settings or HogQLGlobalSettings()
if limit_context == LimitContext.EXPORT:
if limit_context == LimitContext.EXPORT or limit_context == LimitContext.COHORT_CALCULATION:
settings.max_execution_time = EXPORT_CONTEXT_MAX_EXECUTION_TIME

# Print the ClickHouse SQL query
Expand Down
21 changes: 5 additions & 16 deletions posthog/hogql_queries/events_query_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
from posthog.api.utils import get_pk_or_uuid
from posthog.clickhouse.client.connection import Workload
from posthog.hogql import ast
from posthog.hogql.constants import get_max_limit_for_context, get_default_limit_for_context
from posthog.hogql.parser import parse_expr, parse_order_expr
from posthog.hogql.property import action_to_expr, has_aggregation, property_to_expr
from posthog.hogql.query import execute_hogql_query, LimitContext
from posthog.hogql.query import execute_hogql_query
from posthog.hogql.timings import HogQLTimings
from posthog.hogql_queries.query_runner import QueryRunner
from posthog.models import Action, Person
Expand Down Expand Up @@ -255,22 +256,10 @@ def select_input_raw(self) -> List[str]:
return ["*"] if len(self.query.select) == 0 else self.query.select

def limit(self) -> int:
    """Return the row limit for this query, including one look-ahead row.

    The limit requested on ``self.query`` (or the default for
    ``self.limit_context`` when none was requested) is capped at the maximum
    allowed for ``self.limit_context``.
    """
    max_rows = get_max_limit_for_context(self.limit_context)
    default_rows = get_default_limit_for_context(self.limit_context)
    requested_rows = default_rows if self.query.limit is None else self.query.limit
    # adding +1 to the limit to check if there's a "next page" after the requested results
    return min(max_rows, requested_rows) + 1

def _is_stale(self, cached_result_package):
return True
Expand Down
9 changes: 4 additions & 5 deletions posthog/hogql_queries/persons_query_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import List, cast, Literal

from posthog.hogql import ast
from posthog.hogql.constants import DEFAULT_RETURNED_ROWS, MAX_SELECT_RETURNED_ROWS
from posthog.hogql.constants import get_max_limit_for_context, get_default_limit_for_context
from posthog.hogql.parser import parse_expr, parse_order_expr
from posthog.hogql.property import property_to_expr, has_aggregation
from posthog.hogql.query import execute_hogql_query
Expand Down Expand Up @@ -103,10 +103,9 @@ def input_columns(self) -> List[str]:
return self.query.select or ["person", "id", "created_at", "person.$delete"]

def query_limit(self) -> int:
    """Return the row limit for this persons query.

    The limit requested on ``self.query`` (or the default for
    ``self.limit_context`` when none was requested) is capped at the maximum
    allowed for ``self.limit_context``.
    """
    max_rows = get_max_limit_for_context(self.limit_context)
    default_rows = get_default_limit_for_context(self.limit_context)
    requested_rows = default_rows if self.query.limit is None else self.query.limit
    return min(max_rows, requested_rows)

def to_query(self) -> ast.SelectQuery:
with self.timings.measure("columns"):
Expand Down
2 changes: 1 addition & 1 deletion posthog/hogql_queries/query_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def calculate(self) -> BaseModel:
raise NotImplementedError()

def run(self, refresh_requested: Optional[bool] = None) -> CachedQueryResponse:
cache_key = self._cache_key() + ("_export" if self.limit_context == LimitContext.EXPORT else "")
cache_key = f"{self._cache_key()}_{self.limit_context or LimitContext.QUERY}"
tag_queries(cache_key=cache_key)

if not refresh_requested:
Expand Down
5 changes: 4 additions & 1 deletion posthog/models/cohort/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from posthog.client import sync_execute
from posthog.constants import PropertyOperatorType
from posthog.hogql.constants import LimitContext
from posthog.hogql.hogql import HogQLContext
from posthog.hogql.printer import print_ast
from posthog.models import Action, Filter, Team
Expand Down Expand Up @@ -75,7 +76,9 @@ def print_cohort_hogql_query(cohort: Cohort, hogql_context: HogQLContext) -> str

persons_query = cast(Dict, cohort.query)
persons_query["select"] = ["id as actor_id"]
query = get_query_runner(persons_query, team=cast(Team, cohort.team)).to_query()
query = get_query_runner(
persons_query, team=cast(Team, cohort.team), limit_context=LimitContext.COHORT_CALCULATION
).to_query()
hogql_context.enable_select_queries = True
return print_ast(query, context=hogql_context, dialect="clickhouse")

Expand Down

0 comments on commit 838de65

Please sign in to comment.