diff --git a/.github/actions/run-backend-tests/action.yml b/.github/actions/run-backend-tests/action.yml index ddcea26116be1..47340a1c8cb52 100644 --- a/.github/actions/run-backend-tests/action.yml +++ b/.github/actions/run-backend-tests/action.yml @@ -79,7 +79,7 @@ runs: if: steps.hogql-parser-diff.outputs.changed == 'true' shell: bash # This is not cached currently, as it's important to build the current HEAD version of hogql-parser if it has - # changed (as requirements.txt only has the published version) + # changed (requirements.txt has the already-published version) run: | sudo apt-get install libboost-all-dev unzip cmake curl uuid pkg-config curl https://www.antlr.org/download/antlr4-cpp-runtime-4.13.0-source.zip --output antlr4-source.zip diff --git a/.github/workflows/build-hogql-parser.yml b/.github/workflows/build-hogql-parser.yml index 23f253dd488da..97cc24fa59abf 100644 --- a/.github/workflows/build-hogql-parser.yml +++ b/.github/workflows/build-hogql-parser.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v4 with: - fetch-depth: 0 + fetch-depth: 0 # Fetching all for comparison since last push (not just last commit) - name: Check if hogql_parser/ has changed id: changed-files-yaml diff --git a/.github/workflows/ci-backend.yml b/.github/workflows/ci-backend.yml index 6fa5b276ae367..198d41c25c117 100644 --- a/.github/workflows/ci-backend.yml +++ b/.github/workflows/ci-backend.yml @@ -157,8 +157,8 @@ jobs: antlr | grep "Version" npm run grammar:build && git diff --exit-code env: - # Installing a version of ANTLR compatible with what's Homebrew as of October 2023 (version 4.13), - # as apt-get is quite out of date. The same version as here is in hogql_parser/pyproject.toml + # Installing a version of ANTLR compatible with what's in Homebrew as of October 2023 (version 4.13), + # as apt-get is quite out of date. The same version must be set in hogql_parser/pyproject.toml ANTLR_VERSION: '4.13.0' check-migrations: diff --git a/.vscode/launch.json b/.vscode/launch.json index 1624d13d056dc..9c2023d82e322 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -96,6 +96,31 @@ "console": "integratedTerminal", "python": "${workspaceFolder}/env/bin/python", "cwd": "${workspaceFolder}" + }, + { + "name": "Pytest: Current File", + "type": "python", + "request": "launch", + "module": "pytest", + "args": ["${file}", "-vvv"], + "console": "integratedTerminal", + "justMyCode": true + }, + { + "name": "(lldb) Attach", + "type": "cppdbg", + "request": "attach", + "program": "/Users/twixes/.pyenv/versions/3.10.10/envs/posthog-3.10/bin/python", + "MIMode": "lldb" + }, + { + "name": "Python C++ Debugger: Current File", + "type": "pythoncpp", + "request": "launch", + "pythonConfig": "custom", + "pythonLaunchName": "Pytest: Current File", + "cppConfig": "custom", + "cppAttachName": "(lldb) Attach" } ], "compounds": [ diff --git a/hogql_parser/CONTRIBUTING.md b/hogql_parser/CONTRIBUTING.md new file mode 100644 index 0000000000000..b26dc989d36e9 --- /dev/null +++ b/hogql_parser/CONTRIBUTING.md @@ -0,0 +1,50 @@ +# Developing `hogql-parser` + +## Mandatory reading + +If you're new to Python C/C++ extensions, there are some things you must have in your mind. + +### [Objects, Types and Reference Counts in CPython](https://docs.python.org/3/c-api/intro.html#objects-types-and-reference-counts) + + Key takeaways: + + 1. `Py_INCREF()` and `Py_DECREF()` need to be used accurately, or there'll be memory leaks (or, less likely, segfaults). + 1. `Py_None`, `Py_True`, and `Py_False` are singletons, but they still need to be incref'd/decref'd - the best way to do create a new reference to them is wrapping them in `Py_NewRef()`. + 1. Pretty much only `PyList_SET_ITEM()` _steals_ references (i.e. assumes ownership of objects passed into it), if you pass an object into any other function and no longer need it after that - remember to `Py_DECREF` it! + +### [Building Values in CPython](https://docs.python.org/3/c-api/arg.html#building-values) + + Key takeaways: + + 1. Use `Py_BuildValue()` for building tuples, dicts, and lists of static size. Use type-specific functions (e.g. `PyUnicode_FromString()` or `PyList_New()`) otherwise. + 1. `str`-building with `s` involves `strlen`, while `s#` doesn't - it's better to use the latter with C++ strings. + 1. `object`-passing with `O` increments the object's refcount, while doing it with `N` doesn't - we should use `N` pretty much exclusively, because the parse tree converter is about creating new objects (not borrowing). + +## Conventions + +1. Use `snake_case`. ANTLR is `camelCase`-heavy because of its Java heritage, but both the C++ stdlib and CPython are snaky. +2. Use the `auto` type for ANTLR and ANTLR-derived types, since they can be pretty verbose. Otherwise specify the type explictly. +3. Stay out of Python land as long as possible. E.g. avoid using `PyObject*`s` for bools or strings. + Do use Python for parsing numbers though - that way we don't need to consider integer overflow. +4. If any child rule results in an AST node, so must the parent rule - once in Python land, always in Python land. + E.g. it doesn't make sense to create a `vector`, that should just be a `PyObject*` of Python type `list`. + +## How to develop locally on macOS + +1. Install libraries: + + ```bash + brew install boost antlr + ``` + +1. Install `hogql_parser` by building from local sources: + + ```bash + pip install ./hogql_parser + ``` + +1. If you now run tests, the locally-built version of `hogql_parser` will be used: + + ```bash + pytest posthog/hogql/ + ``` diff --git a/hogql_parser/README.md b/hogql_parser/README.md index ed70674a95cd8..c8436358ffc9a 100644 --- a/hogql_parser/README.md +++ b/hogql_parser/README.md @@ -1,21 +1,3 @@ # HogQL Parser -## Developing locally on macOS - -1. Install libraries: - - ```bash - brew install boost antlr - ``` - -1. Install `hogql_parser` from local sources: - - ```bash - pip install ./hogql_parser - ``` - -1. If you now run tests, the locally-built version of `hogql_parser` will be used: - - ```bash - pytest posthog/hogql/ - ``` +Blazing fast HogQL parsing. This package can only work in the context of the PostHog Django app, as it imports from `posthog.hogql`. diff --git a/hogql_parser/parser.cpp b/hogql_parser/parser.cpp index 38de26aeaa1f7..ae166553effcb 100644 --- a/hogql_parser/parser.cpp +++ b/hogql_parser/parser.cpp @@ -16,23 +16,23 @@ VISIT(RULE) { \ throw HogQLNotImplementedException("Unsupported rule: " #RULE); \ } -#define HANDLE_HOGQL_EXCEPTION(TYPE) \ - (const HogQL##TYPE& e) { \ - PyObject* error_type = PyObject_GetAttrString(state->errors_module, #TYPE); \ - if (!error_type) { \ - return NULL; \ - } \ - string err_what = e.what(); \ +#define HANDLE_HOGQL_EXCEPTION(TYPE) \ + (const HogQL##TYPE& e) { \ + PyObject* error_type = PyObject_GetAttrString(state->errors_module, #TYPE); \ + if (!error_type) { \ + return NULL; \ + } \ + string err_what = e.what(); \ PyObject* py_err = PyObject_CallObject(error_type, Py_BuildValue("(s#)", err_what.data(), err_what.size())); \ - if (!py_err) { \ - Py_DECREF(error_type); \ - return NULL; \ - } \ - PyObject_SetAttrString(py_err, "start", PyLong_FromSize_t(e.start)); \ - PyObject_SetAttrString(py_err, "end", PyLong_FromSize_t(e.end)); \ - PyErr_SetObject(error_type, py_err); \ - Py_DECREF(error_type); \ - return NULL; \ + if (!py_err) { \ + Py_DECREF(error_type); \ + return NULL; \ + } \ + PyObject_SetAttrString(py_err, "start", PyLong_FromSize_t(e.start)); \ + PyObject_SetAttrString(py_err, "end", PyLong_FromSize_t(e.end)); \ + PyErr_SetObject(error_type, py_err); \ + Py_DECREF(error_type); \ + return NULL; \ } using namespace std; @@ -65,21 +65,6 @@ PyObject* X_PyList_FromStrings(const vector& items) { // PARSING AND AST CONVERSION -// Conventions: -// 1. If any child rule results in an AST node, so must the parent rule - once in Python land, always in Python land. -// E.g. it doesn't make sense to create a vector, that should just be a PyObject* (a Python list). -// 2. Stay out of Python land as long as possible. E.g. avoid using PyObjects* for bools or strings. -// Do use Python for parsing numbers though - that way we don't need to consider integer overflow. -// 3. REMEMBER TO Py_DECREF AND Py_INCREF. Otherwise there will be memory leaks or segfaults. -// 4. For Py_None, Py_True, and Py_False, just wrap them in Py_NewRef(). -// 5. In Py_BuildValue tend towards use of `N` (which does not increment the refcount) over `O` (which does). -// That's because we mostly use new values and not borrowed ones - but this is not a hard rule. -// 6. Use the `auto` type for HogQLParser:: and HogQLLexer:: values. Otherwise it's clearer for the type to be explicit. - -// To understand how Py_BuildValue, PyArg_ParseTuple, and PyArg_ParseTupleAndKeywords formats work, -// (for instance, what does `s`, `s#`, `i` or `N` mean) read this: -// https://docs.python.org/3/c-api/arg.html - class HogQLParseTreeConverter : public HogQLParserBaseVisitor { private: parser_state* state; @@ -164,6 +149,7 @@ class HogQLParseTreeConverter : public HogQLParserBaseVisitor { if (node.has_value() && node.type() == typeid(PyObject*)) { PyObject* py_node = any_cast(node); if (py_node && is_ast_node_instance(py_node)) { + // FIXME: This is leak, because the value argument is not decref'd. Fix for all PyObject_SetAttrString calls. PyObject_SetAttrString(py_node, "start", PyLong_FromSize_t(start)); PyObject_SetAttrString(py_node, "end", PyLong_FromSize_t(stop + 1)); } @@ -246,7 +232,7 @@ class HogQLParseTreeConverter : public HogQLParserBaseVisitor { } VISIT(SelectUnionStmt) { - // Using a vector of PyObjects atypically here, because this is a precursor to flattened_queries + // Using a vector of PyObjects atypically here, because this is a precursor of flattened_queries vector select_queries; auto select_stmt_with_parens_ctxs = ctx->selectStmtWithParens(); select_queries.reserve(select_stmt_with_parens_ctxs.size()); @@ -575,7 +561,7 @@ class HogQLParseTreeConverter : public HogQLParserBaseVisitor { VISIT(FrameStart) { return visit(ctx->winFrameBound()); } VISIT(FrameBetween) { - return Py_BuildValue("OO", visitAsPyObject(ctx->winFrameBound(0)), visitAsPyObject(ctx->winFrameBound(1))); + return Py_BuildValue("NN", visitAsPyObject(ctx->winFrameBound(0)), visitAsPyObject(ctx->winFrameBound(1))); } VISIT(WinFrameBound) { diff --git a/hogql_parser/setup.py b/hogql_parser/setup.py index bb7a4fce93001..75bdadfd56f4e 100644 --- a/hogql_parser/setup.py +++ b/hogql_parser/setup.py @@ -32,7 +32,7 @@ setup( name="hogql_parser", - version="0.1.4", + version="0.1.5", url="https://github.com/PostHog/posthog/tree/master/hogql_parser", author="PostHog Inc.", author_email="hey@posthog.com",