Skip to content

Commit

Permalink
fix(queries): export limit (PostHog#18046)
Browse files Browse the repository at this point in the history
  • Loading branch information
mariusandra authored and Justicea83 committed Oct 25, 2023
1 parent 4550303 commit e77037b
Show file tree
Hide file tree
Showing 20 changed files with 149 additions and 69 deletions.
1 change: 0 additions & 1 deletion frontend/src/models/cohortsModel.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ export const cohortsModel = kea<cohortsModelType>({
export_format: ExporterFormat.CSV,
export_context: {
path: `/api/cohort/${id}/persons`,
max_limit: 10000,
},
}
if (columns && columns.length > 0) {
Expand Down
40 changes: 23 additions & 17 deletions frontend/src/queries/nodes/DataTable/DataTableExport.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ import { IconExport } from 'lib/lemon-ui/icons'
import { triggerExport } from 'lib/components/ExportButton/exporter'
import { ExporterFormat } from '~/types'
import { DataNode, DataTableNode, NodeKind } from '~/queries/schema'
import { defaultDataTableColumns, extractExpressionComment } from '~/queries/nodes/DataTable/utils'
import {
defaultDataTableColumns,
extractExpressionComment,
removeExpressionComment,
} from '~/queries/nodes/DataTable/utils'
import { isEventsQuery, isHogQLQuery, isPersonsNode } from '~/queries/utils'
import { getPersonsEndpoint } from '~/queries/query'
import { ExportWithConfirmation } from '~/queries/nodes/DataTable/ExportWithConfirmation'
Expand All @@ -18,26 +22,28 @@ const EXPORT_MAX_LIMIT = 10000

function startDownload(query: DataTableNode, onlySelectedColumns: boolean): void {
const exportContext = isPersonsNode(query.source)
? { path: getPersonsEndpoint(query.source), max_limit: EXPORT_MAX_LIMIT }
: { source: query.source, max_limit: EXPORT_MAX_LIMIT }
? { path: getPersonsEndpoint(query.source) }
: { source: query.source }
if (!exportContext) {
throw new Error('Unsupported node type')
}

const columnMapping = {
url: ['properties.$current_url', 'properties.$screen_name'],
time: 'timestamp',
event: 'event',
source: 'properties.$lib',
person: isPersonsNode(query.source)
? ['distinct_ids.0', 'properties.email']
: ['person.distinct_ids.0', 'person.properties.email'],
}

if (onlySelectedColumns) {
exportContext['columns'] = (query.columns ?? defaultDataTableColumns(query.source.kind))
?.flatMap((c) => columnMapping[c] || c)
.filter((c) => c !== 'person.$delete')
exportContext['columns'] = (
(isEventsQuery(query.source) ? query.source.select : null) ??
query.columns ??
defaultDataTableColumns(query.source.kind)
)?.filter((c) => c !== 'person.$delete')

if (isEventsQuery(query.source)) {
exportContext['columns'] = exportContext['columns'].map((c: string) =>
removeExpressionComment(c) === 'person' ? 'person.properties.email' : c
)
} else if (isPersonsNode(query.source)) {
exportContext['columns'] = exportContext['columns'].map((c: string) =>
removeExpressionComment(c) === 'person' ? 'properties.email' : c
)
}
}
triggerExport({
export_format: ExporterFormat.CSV,
Expand Down Expand Up @@ -185,7 +191,7 @@ export function DataTableExport({ query }: DataTableExportProps): JSX.Element |
(isEventsQuery(source) || isPersonsNode(source) ? source.properties?.length || 0 : 0) +
(isEventsQuery(source) && source.event ? 1 : 0) +
(isPersonsNode(source) && source.search ? 1 : 0)
const canExportAllColumns = isEventsQuery(source) || isPersonsNode(source)
const canExportAllColumns = (isEventsQuery(source) && source.select.includes('*')) || isPersonsNode(source)
const showExportClipboardButtons = isPersonsNode(source) || isEventsQuery(source) || isHogQLQuery(source)

return (
Expand Down
1 change: 0 additions & 1 deletion frontend/src/queries/query.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ describe('query', () => {
'timestamp',
],
},
max_limit: 10000,
})
})

Expand Down
3 changes: 0 additions & 3 deletions frontend/src/queries/query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,6 @@ import { currentSessionId } from 'lib/internalMetrics'
import { featureFlagLogic } from 'lib/logic/featureFlagLogic'
import { FEATURE_FLAGS } from 'lib/constants'

const EXPORT_MAX_LIMIT = 10000

//get export context for a given query
export function queryExportContext<N extends DataNode = DataNode>(
query: N,
Expand All @@ -47,7 +45,6 @@ export function queryExportContext<N extends DataNode = DataNode>(
} else if (isEventsQuery(query) || isPersonsQuery(query)) {
return {
source: query,
max_limit: EXPORT_MAX_LIMIT,
}
} else if (isHogQLQuery(query)) {
return { source: query }
Expand Down
1 change: 0 additions & 1 deletion frontend/src/scenes/persons/personsLogic.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,6 @@ export const personsLogic = kea<personsLogicType>({
path: cohort
? api.cohorts.determineListUrl(cohort, listFilters)
: api.persons.determineListUrl(listFilters),
max_limit: 10000,
},
},
],
Expand Down
1 change: 0 additions & 1 deletion frontend/src/scenes/retention/RetentionModal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ export function RetentionModal(): JSX.Element | null {
export_format: ExporterFormat.CSV,
export_context: {
path: row?.people_url,
max_limit: 10000,
},
})
}
Expand Down
2 changes: 0 additions & 2 deletions frontend/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2911,13 +2911,11 @@ export type OnlineExportContext = {
query?: any
body?: any
filename?: string
max_limit?: number
}

export type QueryExportContext = {
source: Record<string, any>
filename?: string
max_limit?: number
}

export type ExportContext = OnlineExportContext | LocalExportContext | QueryExportContext
Expand Down
8 changes: 4 additions & 4 deletions posthog/api/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ def _unwrap_pydantic_dict(response: Any) -> Dict:


def process_query(
team: Team, query_json: Dict, default_limit: Optional[int] = None, request: Optional[Request] = None
team: Team, query_json: Dict, in_export_context: Optional[bool] = False, request: Optional[Request] = None
) -> Dict:
# query_json has been parsed by QuerySchemaParser
# it _should_ be impossible to end up in here with a "bad" query
Expand All @@ -221,10 +221,10 @@ def process_query(

if query_kind in QUERY_WITH_RUNNER:
refresh_requested = refresh_requested_by_client(request) if request else False
query_runner = get_query_runner(query_json, team)
query_runner = get_query_runner(query_json, team, in_export_context=in_export_context)
return _unwrap_pydantic_dict(query_runner.run(refresh_requested=refresh_requested))
elif query_kind in QUERY_WITH_RUNNER_NO_CACHE:
query_runner = get_query_runner(query_json, team)
query_runner = get_query_runner(query_json, team, in_export_context=in_export_context)
return _unwrap_pydantic_dict(query_runner.calculate())
elif query_kind == "HogQLQuery":
hogql_query = HogQLQuery.model_validate(query_json)
Expand All @@ -240,7 +240,7 @@ def process_query(
filters=hogql_query.filters,
modifiers=hogql_query.modifiers,
placeholders=values,
default_limit=default_limit,
in_export_context=in_export_context,
explain=hogql_query.explain,
)
return _unwrap_pydantic_dict(hogql_response)
Expand Down
1 change: 0 additions & 1 deletion posthog/api/test/test_exports.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,6 @@ def requests_side_effect(*args, **kwargs):
f"/api/projects/{self.team.pk}/exports/",
{
"export_context": {
"max_limit": 10000,
"path": path,
},
"export_format": "text/csv",
Expand Down
44 changes: 42 additions & 2 deletions posthog/api/test/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,8 @@ def test_full_hogql_query(self):
)

@patch("posthog.hogql.constants.DEFAULT_RETURNED_ROWS", 10)
def test_full_hogql_query_limit(self, DEFAULT_RETURNED_ROWS=10):
@patch("posthog.hogql.constants.MAX_SELECT_RETURNED_ROWS", 15)
def test_full_hogql_query_limit(self, MAX_SELECT_RETURNED_ROWS=15, DEFAULT_RETURNED_ROWS=10):
random_uuid = str(UUIDT())
with freeze_time("2020-01-10 12:00:00"):
for _ in range(20):
Expand All @@ -439,7 +440,28 @@ def test_full_hogql_query_limit(self, DEFAULT_RETURNED_ROWS=10):
self.assertEqual(len(response.get("results", [])), 10)

@patch("posthog.hogql.constants.DEFAULT_RETURNED_ROWS", 10)
def test_full_events_query_limit(self, DEFAULT_RETURNED_ROWS=10):
@patch("posthog.hogql.constants.MAX_SELECT_RETURNED_ROWS", 15)
def test_full_hogql_query_limit_exported(self, MAX_SELECT_RETURNED_ROWS=15, DEFAULT_RETURNED_ROWS=10):
random_uuid = str(UUIDT())
with freeze_time("2020-01-10 12:00:00"):
for _ in range(20):
_create_event(team=self.team, event="sign up", distinct_id=random_uuid, properties={"key": "test_val1"})
flush_persons_and_events()

with freeze_time("2020-01-10 12:14:00"):
response = process_query(
team=self.team,
query_json={
"kind": "HogQLQuery",
"query": f"select event from events where distinct_id='{random_uuid}'",
},
in_export_context=True, # This is the only difference
)
self.assertEqual(len(response.get("results", [])), 15)

@patch("posthog.hogql.constants.DEFAULT_RETURNED_ROWS", 10)
@patch("posthog.hogql.constants.MAX_SELECT_RETURNED_ROWS", 15)
def test_full_events_query_limit(self, MAX_SELECT_RETURNED_ROWS=15, DEFAULT_RETURNED_ROWS=10):
random_uuid = str(UUIDT())
with freeze_time("2020-01-10 12:00:00"):
for _ in range(20):
Expand All @@ -454,6 +476,24 @@ def test_full_events_query_limit(self, DEFAULT_RETURNED_ROWS=10):

self.assertEqual(len(response.get("results", [])), 10)

@patch("posthog.hogql.constants.DEFAULT_RETURNED_ROWS", 10)
@patch("posthog.hogql.constants.MAX_SELECT_RETURNED_ROWS", 15)
def test_full_events_query_limit_exported(self, MAX_SELECT_RETURNED_ROWS=15, DEFAULT_RETURNED_ROWS=10):
random_uuid = str(UUIDT())
with freeze_time("2020-01-10 12:00:00"):
for _ in range(20):
_create_event(team=self.team, event="sign up", distinct_id=random_uuid, properties={"key": "test_val1"})
flush_persons_and_events()

with freeze_time("2020-01-10 12:14:00"):
response = process_query(
team=self.team,
query_json={"kind": "EventsQuery", "select": ["event"], "where": [f"distinct_id = '{random_uuid}'"]},
in_export_context=True,
)

self.assertEqual(len(response.get("results", [])), 15)

def test_property_definition_annotation_does_not_break_things(self):
PropertyDefinition.objects.create(team=self.team, name="$browser", property_type=PropertyType.String)

Expand Down
2 changes: 1 addition & 1 deletion posthog/hogql/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# Limit applied to SELECT statements without LIMIT clause when queried via the API
DEFAULT_RETURNED_ROWS = 100
# Max limit for all SELECT queries, and the default for CSV exports.
MAX_SELECT_RETURNED_ROWS = 10000
MAX_SELECT_RETURNED_ROWS = 10000 # sync with CSV_EXPORT_LIMIT


# Settings applied at the SELECT level
Expand Down
10 changes: 6 additions & 4 deletions posthog/hogql/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def execute_hogql_query(
workload: Workload = Workload.ONLINE,
settings: Optional[HogQLGlobalSettings] = None,
modifiers: Optional[HogQLQueryModifiers] = None,
default_limit: Optional[int] = None,
in_export_context: Optional[bool] = False,
timings: Optional[HogQLTimings] = None,
explain: Optional[bool] = False,
) -> HogQLQueryResponse:
Expand Down Expand Up @@ -61,15 +61,17 @@ def execute_hogql_query(
select_query = replace_placeholders(select_query, placeholders)

with timings.measure("max_limit"):
from posthog.hogql.constants import DEFAULT_RETURNED_ROWS
from posthog.hogql.constants import DEFAULT_RETURNED_ROWS, MAX_SELECT_RETURNED_ROWS

select_queries = (
select_query.select_queries if isinstance(select_query, ast.SelectUnionQuery) else [select_query]
)
for one_query in select_queries:
if one_query.limit is None:
# One more "max" of MAX_SELECT_RETURNED_ROWS (100k) is applied in the query printer.
one_query.limit = ast.Constant(value=default_limit or DEFAULT_RETURNED_ROWS)
# One more "max" of MAX_SELECT_RETURNED_ROWS (10k) is applied in the query printer.
one_query.limit = ast.Constant(
value=MAX_SELECT_RETURNED_ROWS if in_export_context else DEFAULT_RETURNED_ROWS
)

# Get printed HogQL query, and returned columns. Using a cloned query.
with timings.measure("hogql"):
Expand Down
10 changes: 6 additions & 4 deletions posthog/hogql_queries/events_query_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,13 @@ def __init__(
query: EventsQuery | Dict[str, Any],
team: Team,
timings: Optional[HogQLTimings] = None,
default_limit: Optional[int] = None,
in_export_context: Optional[bool] = False,
):
super().__init__(query, team, timings)
super().__init__(query, team, timings, in_export_context)
if isinstance(query, EventsQuery):
self.query = query
else:
self.query = EventsQuery.model_validate(query)
self.default_limit = default_limit

def to_query(self) -> ast.SelectQuery:
# Note: This code is inefficient and problematic, see https://github.com/PostHog/posthog/issues/13485 for details.
Expand Down Expand Up @@ -193,6 +192,7 @@ def calculate(self) -> EventsQueryResponse:
workload=Workload.ONLINE,
query_type="EventsQuery",
timings=self.timings,
in_export_context=self.in_export_context,
)

# Convert star field from tuple to dict in each result
Expand Down Expand Up @@ -267,7 +267,9 @@ def limit(self) -> int:
return (
min(
MAX_SELECT_RETURNED_ROWS,
self.default_limit or DEFAULT_RETURNED_ROWS if self.query.limit is None else self.query.limit,
(MAX_SELECT_RETURNED_ROWS if self.in_export_context else DEFAULT_RETURNED_ROWS)
if self.query.limit is None
else self.query.limit,
)
+ 1
)
Expand Down
10 changes: 8 additions & 2 deletions posthog/hogql_queries/insights/lifecycle_query_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ class LifecycleQueryRunner(QueryRunner):
query: LifecycleQuery
query_type = LifecycleQuery

def __init__(self, query: LifecycleQuery | Dict[str, Any], team: Team, timings: Optional[HogQLTimings] = None):
super().__init__(query, team, timings)
def __init__(
    self,
    query: LifecycleQuery | Dict[str, Any],
    team: Team,
    timings: Optional[HogQLTimings] = None,
    in_export_context: Optional[bool] = False,
):
    # Accepts either a validated LifecycleQuery or a raw dict (the base class
    # handles model validation). All arguments, including the new
    # in_export_context flag, are forwarded to the QueryRunner base class.
    super().__init__(query, team, timings, in_export_context)

def to_query(self) -> ast.SelectQuery | ast.SelectUnionQuery:
placeholders = {
Expand Down
10 changes: 8 additions & 2 deletions posthog/hogql_queries/insights/trends/trends_query_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,14 @@ class TrendsQueryRunner(QueryRunner):
query_type = TrendsQuery
series: List[SeriesWithExtras]

def __init__(self, query: TrendsQuery | Dict[str, Any], team: Team, timings: Optional[HogQLTimings] = None):
super().__init__(query, team, timings)
def __init__(
    self,
    query: TrendsQuery | Dict[str, Any],
    team: Team,
    timings: Optional[HogQLTimings] = None,
    # Optional[bool] = False, not Optional[int] = None: matches the signature of
    # every other QueryRunner subclass (Events/Lifecycle/Persons) and of
    # process_query, which pass this flag through as a boolean.
    in_export_context: Optional[bool] = False,
):
    # Forward everything to the QueryRunner base class, then precompute the
    # per-series metadata used by the trends calculation.
    super().__init__(query, team, timings, in_export_context)
    self.series = self.setup_series()

def _is_stale(self, cached_result_package):
Expand Down
10 changes: 8 additions & 2 deletions posthog/hogql_queries/persons_query_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,14 @@ class PersonsQueryRunner(QueryRunner):
query: PersonsQuery
query_type = PersonsQuery

def __init__(self, query: PersonsQuery | Dict[str, Any], team: Team, timings: Optional[HogQLTimings] = None):
super().__init__(query, team, timings)
def __init__(
self,
query: PersonsQuery | Dict[str, Any],
team: Team,
timings: Optional[HogQLTimings] = None,
in_export_context: Optional[bool] = False,
):
super().__init__(query=query, team=team, timings=timings, in_export_context=in_export_context)
if isinstance(query, PersonsQuery):
self.query = query
else:
Expand Down
Loading

0 comments on commit e77037b

Please sign in to comment.