Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(hogql): placeholders and cleanup #14116

Merged
merged 13 commits into from
Feb 8, 2023
72 changes: 58 additions & 14 deletions posthog/hogql/ast.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from enum import Enum
from typing import Any, List, Optional
from typing import Any, List, Literal, cast

from pydantic import BaseModel, Extra

Expand All @@ -10,14 +10,20 @@ class AST(BaseModel):
class Config:
extra = Extra.forbid

def children(self) -> List[AST]:
raise NotImplementedError("AST.children() not implemented")


class Expr(AST):
pass


class Column(AST):
class Alias(Expr):
alias: str
expr: Expr
alias: Optional[str] = None

def children(self) -> List[AST]:
return cast(List[AST], [self.expr])


class BinaryOperationType(str, Enum):
Expand All @@ -33,18 +39,28 @@ class BinaryOperation(Expr):
right: Expr
op: BinaryOperationType

def children(self) -> List[AST]:
return cast(List[AST], [self.left, self.right])


class And(Expr):
class Config:
extra = Extra.forbid

class BooleanOperationType(str, Enum):
And = "and"
Or = "or"
exprs: List[Expr]

def children(self) -> List[AST]:
return cast(List[AST], self.exprs)

class BooleanOperation(Expr):

class Or(Expr):
class Config:
extra = Extra.forbid

op: BooleanOperationType
values: List[Expr]
exprs: List[Expr]

def children(self) -> List[AST]:
return cast(List[AST], self.exprs)


class CompareOperationType(str, Enum):
Expand All @@ -58,30 +74,58 @@ class CompareOperationType(str, Enum):
ILike = "ilike"
NotLike = "not like"
NotILike = "not ilike"
In = "in"
NotIn = "not in"


class CompareOperation(Expr):
left: Expr
right: Expr
op: CompareOperationType

def children(self) -> List[AST]:
return cast(List[AST], [self.left, self.right])


class Not(Expr):
expr: Expr

def children(self) -> List[AST]:
return cast(List[AST], [self.expr])


class NotOperation(Expr):
class OrderExpr(Expr):
expr: Expr
order: Literal["ASC", "DESC"] = "ASC"

def children(self) -> List[AST]:
return cast(List[AST], [self.expr])


class Constant(Expr):
value: Any


class FieldAccess(Expr):
field: str
def children(self) -> List[AST]:
return cast(List[AST], [])


class FieldAccessChain(Expr):
class Field(Expr):
chain: List[str]

def children(self) -> List[AST]:
return cast(List[AST], [])


class Placeholder(Expr):
field: str

def children(self) -> List[AST]:
return cast(List[AST], [])


class Call(Expr):
name: str
args: List[Expr]

def children(self) -> List[AST]:
return cast(List[AST], self.args)
126 changes: 126 additions & 0 deletions posthog/hogql/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# fields you can select from in the events query
EVENT_FIELDS = [
"id",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't exist on the schema

"uuid",
"event",
"timestamp",
"properties",
"elements_chain",
"created_at",
"distinct_id",
"team_id",
]
# "person.*" fields you can select from in the events query
EVENT_PERSON_FIELDS = ["id", "created_at", "properties"]

# HogQL -> ClickHouse allowed transformations
CLICKHOUSE_FUNCTIONS = {
# arithmetic
"abs": "abs",
"max2": "max2",
"min2": "min2",
# type conversions
"toInt": "toInt64OrNull",
"toFloat": "toFloat64OrNull",
"toDecimal": "toDecimal64OrNull",
"toDate": "toDateOrNull",
"toDateTime": "parseDateTimeBestEffort",
"toIntervalSecond": "toIntervalSecond",
"toIntervalMinute": "toIntervalMinute",
"toIntervalHour": "toIntervalHour",
"toIntervalDay": "toIntervalDay",
"toIntervalWeek": "toIntervalWeek",
"toIntervalMonth": "toIntervalMonth",
"toIntervalQuarter": "toIntervalQuarter",
"toIntervalYear": "toIntervalYear",
"toString": "toString",
# date functions
"now": "now",
"NOW": "now",
"toMonday": "toMonday",
"toStartOfYear": "toStartOfYear",
"toStartOfQuarter": "toStartOfQuarter",
"toStartOfMonth": "toStartOfMonth",
"toStartOfWeek": "toStartOfWeek",
"toStartOfDay": "toStartOfDay",
"toStartOfHour": "toStartOfHour",
"toStartOfMinute": "toStartOfMinute",
"toStartOfSecond": "toStartOfSecond",
"toStartOfFiveMinutes": "toStartOfFiveMinutes",
"toStartOfTenMinutes": "toStartOfTenMinutes",
"toStartOfFifteenMinutes": "toStartOfFifteenMinutes",
"toTimezone": "toTimezone",
"age": "age",
"dateDiff": "dateDiff",
"dateTrunc": "dateTrunc",
"formatDateTime": "formatDateTime",
# string functions
"length": "lengthUTF8",
"empty": "empty",
"notEmpty": "notEmpty",
"leftPad": "leftPad",
"rightPad": "rightPad",
"lower": "lower",
"upper": "upper",
"repeat": "repeat",
"format": "format",
"concat": "concat",
"coalesce": "coalesce",
"substring": "substringUTF8",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm if the route we're taking is having these little bits of magic then somewhere they should be made pretty explicit. As a suggestion if we want to keep this as is I'd at least put the "magic mappings" at the top and explain in a comment why we do so (even though I see why reading this!)

Just want to be very explicit about where we're diverging the dialects

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we should document and clean this up. I'm not sure which "magic shortcuts" we want to take. I'm very very happy to revisit this list and make a real decision on each case. The current list was composed in a 20min runthrough of the clickhouse docs, explicitly without having full sql statements in mind. Things have changed enough.

"appendTrailingCharIfAbsent": "appendTrailingCharIfAbsent",
"endsWith": "endsWith",
"startsWith": "startsWith",
"trim": "trimBoth",
"trimLeft": "trimLeft",
"trimRight": "trimRight",
"extractTextFromHTML": "extractTextFromHTML",
"like": "like",
"ilike": "ilike",
"notLike": "notLike",
"replace": "replace",
"replaceOne": "replaceOne",
# array functions
"tuple": "tuple",
# conditional
"ifElse": "if",
"multiIf": "multiIf",
# rounding
"round": "round",
"floor": "floor",
"ceil": "ceil",
"trunc": "trunc",
}
# Permitted HogQL aggregations
HOGQL_AGGREGATIONS = {
"count": 0,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I take it these are the number of arguments each function takes? Wasn't immediately obvious to me

Copy link
Collaborator Author

@mariusandra mariusandra Feb 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, and this will need to be changed: count can take 0..1 arguments

"countIf": 1,
"countDistinct": 1,
"countDistinctIf": 2,
"min": 1,
"minIf": 2,
"max": 1,
"maxIf": 2,
"sum": 1,
"sumIf": 2,
"avg": 1,
"avgIf": 2,
"any": 1,
"anyIf": 2,
}
# Keywords passed to ClickHouse without transformation
KEYWORDS = ["true", "false", "null"]

# Allow-listed fields returned when you select "*" from events. Person and group fields will be nested later.
SELECT_STAR_FROM_EVENTS_FIELDS = [
"uuid",
"event",
"properties",
"timestamp",
"team_id",
"distinct_id",
"elements_chain",
"created_at",
"person.id",
"person.created_at",
"person.properties",
]
23 changes: 23 additions & 0 deletions posthog/hogql/context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from dataclasses import dataclass, field
from typing import Dict, List, Literal, Optional


@dataclass
class HogQLFieldAccess:
input: List[str]
type: Optional[Literal["event", "event.properties", "person", "person.properties"]]
field: Optional[str]
sql: str
Comment on lines +5 to +10
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This HogQLFieldAccess is copied over from the printer as it was. I'd like to get rid of it in a followup. Probably when working on aliases.

HogQLFieldAccess gathers fields as they are printed. While that might be useful, we often discard this printed string just to get this metadata. The cleaner approach is to just visit the tree, check all Call or Field nodes and use that information directly. No need to print anything in this case. This PR already contains such utils. That refactor is for later.



@dataclass
class HogQLContext:
"""Context given to a HogQL expression printer"""

# If set, will save string constants to this dict. Inlines strings into the query if None.
values: Dict = field(default_factory=dict)
# List of field and property accesses found in the expression
field_access_logs: List[HogQLFieldAccess] = field(default_factory=list)
# Did the last calls to translate_hogql since setting these to False contain any of the following
found_aggregation: bool = False
using_person_on_events: bool = True
1 change: 1 addition & 0 deletions posthog/hogql/grammar/HogQLLexer.g4
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ HEXADECIMAL_LITERAL: '0' X HEX_DIGIT+;
// It's important that quote-symbol is a single character.
STRING_LITERAL: QUOTE_SINGLE ( ~([\\']) | ESCAPE_CHAR | (QUOTE_SINGLE QUOTE_SINGLE) )* QUOTE_SINGLE;
PLACEHOLDER: LBRACE ( ~([\\}]) | ESCAPE_CHAR | (LBRACE LBRACE) )* RBRACE;
// Alphabet and allowed symbols
Expand Down
5 changes: 4 additions & 1 deletion posthog/hogql/grammar/HogQLLexer.interp

Large diffs are not rendered by default.

Loading