diff --git a/mypy-baseline.txt b/mypy-baseline.txt
index 4599145228cfa..58ea7ed8a90c2 100644
--- a/mypy-baseline.txt
+++ b/mypy-baseline.txt
@@ -1,4 +1,67 @@
 posthog/warehouse/models/ssh_tunnel.py:0: error: Incompatible types in assignment (expression has type "NoEncryption", variable has type "BestAvailableEncryption") [assignment]
+posthog/temporal/data_imports/pipelines/sql_database_v2/schema_types.py:0: error: Statement is unreachable [unreachable]
+posthog/temporal/data_imports/pipelines/sql_database_v2/schema_types.py:0: error: Non-overlapping equality check (left operand type: "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'json', 'decimal', 'wei', 'date', 'time'] | None", right operand type: "Literal['interval']") [comparison-overlap]
+posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, ndarray[Any, dtype[Any]]]"; expected type "str" [index]
+posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, ndarray[Any, dtype[Any]]]"; expected type "str" [index]
+posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, TColumnSchema]"; expected type "str" [index]
+posthog/temporal/data_imports/pipelines/sql_database/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Item "None" of "Incremental[Any] | None" has no attribute "row_order" [union-attr]
+posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "Literal['asc', 'desc'] | Any | None", variable has type "Literal['asc', 'desc']") [assignment]
+posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Column[Any]") [assignment]
+posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Literal['asc', 'desc']") [assignment]
+posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Item "None" of "dict[str, Any] | None" has no attribute "get" [union-attr]
+posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Argument "primary_key" to "make_hints" has incompatible type "list[str] | None"; expected "str | Sequence[str] | Callable[[Any], str | Sequence[str]]" [arg-type]
+posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Dict entry 2 has incompatible type "Literal['auto']": "None"; expected "Literal['json_response', 'header_link', 'auto', 'single_page', 'cursor', 'offset', 'page_number']": "type[BasePaginator]" [dict-item]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "AuthConfigBase") [assignment]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Argument 1 to "get_auth_class" has incompatible type "Literal['bearer', 'api_key', 'http_basic'] | None"; expected "Literal['bearer', 'api_key', 'http_basic']" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Need type annotation for "dependency_graph" [var-annotated]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "None", target has type "ResolvedParam") [assignment]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible return value type (got "tuple[TopologicalSorter[Any], dict[str, EndpointResource], dict[str, ResolvedParam]]", expected "tuple[Any, dict[str, EndpointResource], dict[str, ResolvedParam | None]]") [return-value]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("str | Endpoint | None") [operator]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type variable "StrOrLiteralStr" of "parse" of "Formatter" cannot be "str | None" [type-var]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None") [operator]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None") [operator]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" is not indexable [index]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" has no attribute "pop" [union-attr]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" is not indexable [index]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "str | None" has no attribute "format" [union-attr]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Argument 1 to "single_entity_path" has incompatible type "str | None"; expected "str" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" has no attribute "items" [union-attr]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "str") [assignment]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "str") [assignment]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Statement is unreachable [unreachable]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 0 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, Any]" [dict-item]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 1 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, Any]" [dict-item]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 0 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, ResolveParamConfig | IncrementalParamConfig | Any]" [dict-item]
+posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 1 has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "SupportsKeysAndGetItem[str, ResolveParamConfig | IncrementalParamConfig | Any]" [dict-item]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Not all union combinations were tried because there are too many unions [misc]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 2 to "source" has incompatible type "str | None"; expected "str" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 3 to "source" has incompatible type "str | None"; expected "str" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 4 to "source" has incompatible type "int | None"; expected "int" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 6 to "source" has incompatible type "Schema | None"; expected "Schema" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 7 to "source" has incompatible type "Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | None"; expected "Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 8 to "source" has incompatible type "type[BaseConfiguration] | None"; expected "type[BaseConfiguration]" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "build_resource_dependency_graph" has incompatible type "EndpointResourceBase | None"; expected "EndpointResourceBase" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Incompatible types in assignment (expression has type "list[str] | None", variable has type "list[str]") [assignment]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "setup_incremental_object" has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "dict[str, Any]" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument "base_url" to "RESTClient" has incompatible type "str | None"; expected "str" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "exclude_keys" has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "Mapping[str, Any]" [arg-type]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Incompatible default for argument "resolved_param" (default has type "ResolvedParam | None", argument has type "ResolvedParam") [assignment]
+posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
 posthog/utils.py:0: error: No overload variant of "asdict" matches argument type "type[DataclassInstance]" [call-overload]
 posthog/utils.py:0: note: Possible overload variants:
 posthog/utils.py:0: note: def asdict(obj: DataclassInstance) -> dict[str, Any]
@@ -365,9 +428,40 @@ posthog/test/test_feature_flag_analytics.py:0: error: Item "None" of "Dashboard
 posthog/test/test_feature_flag_analytics.py:0: error: Item "None" of "Dashboard | None" has no attribute "tiles" [union-attr]
 posthog/test/test_feature_flag_analytics.py:0: error: Item "None" of "Dashboard | None" has no attribute "tiles" [union-attr]
 posthog/test/test_feature_flag_analytics.py:0: error: Item "None" of "Dashboard | None" has no attribute "delete" [union-attr]
-posthog/temporal/data_imports/pipelines/sql_database_v2/schema_types.py:0: error: Statement is unreachable [unreachable]
-posthog/temporal/data_imports/pipelines/sql_database_v2/schema_types.py:0: error: Non-overlapping equality check (left operand type: "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'json', 'decimal', 'wei', 'date', 'time'] | None", right operand type: "Literal['interval']") [comparison-overlap]
-posthog/temporal/data_imports/pipelines/sql_database/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: error: No overload variant of "with_only_columns" of "Select" matches argument type "ReadOnlyColumnCollection[str, Column[Any]]" [call-overload]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: Possible overload variants:
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], /) -> Select[tuple[_T0]]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], /) -> Select[tuple[_T0, _T1]]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], /) -> Select[tuple[_T0, _T1, _T2]]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], /) -> Select[tuple[_T0, _T1, _T2, _T3]]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4]]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5]]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5, _T6] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], TypedColumnsClauseRole[_T6] | SQLCoreOperations[_T6] | type[_T6], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5, _T6]]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5, _T6, _T7] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], TypedColumnsClauseRole[_T6] | SQLCoreOperations[_T6] | type[_T6], TypedColumnsClauseRole[_T7] | SQLCoreOperations[_T7] | type[_T7], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5, _T6, _T7]]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def with_only_columns(self, *entities: TypedColumnsClauseRole[Any] | ColumnsClauseRole | SQLCoreOperations[Any] | Literal['*', 1] | type[Any] | Inspectable[_HasClauseElement[Any]] | _HasClauseElement[Any], maintain_column_froms: bool = ..., **Any) -> Select[Any]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: error: No overload variant of "resource" matches argument types "Callable[[Engine, Table, int, Literal['sqlalchemy', 'pyarrow', 'pandas', 'connectorx'], Incremental[Any] | None, Any | None, bool, Callable[[Table], None] | None, Literal['minimal', 'full', 'full_with_precision'], dict[str, Any] | None, Callable[[TypeEngine[Any]], TypeEngine[Any] | type[TypeEngine[Any]] | None] | None, list[str] | None, Callable[[Select[Any], Table], Select[Any]] | None, list[str] | None], Iterator[Any]]", "str", "list[str] | None", "list[str] | None", "dict[str, TColumnSchema]", "Collection[str]", "str" [call-overload]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: Possible overload variants:
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TResourceFunParams`-1, TDltResourceImpl: DltResource] resource(Callable[TResourceFunParams, Any], /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) -> TDltResourceImpl
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(None = ..., /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) -> Callable[[Callable[TResourceFunParams, Any]], TDltResourceImpl]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(None = ..., /, name: str | Callable[[Any], str] = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ..., standalone: Literal[True] = ...) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, TDltResourceImpl]]
+posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(list[Any] | tuple[Any] | Iterator[Any], /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) -> TDltResourceImpl
 posthog/tasks/test/test_update_survey_iteration.py:0: error: Item "None" of "FeatureFlag | None" has no attribute "filters" [union-attr]
 posthog/tasks/test/test_stop_surveys_reached_target.py:0: error: No overload variant of "__sub__" of "datetime" matches argument type "None" [operator]
 posthog/tasks/test/test_stop_surveys_reached_target.py:0: note: Possible overload variants:
@@ -505,10 +599,12 @@ posthog/warehouse/data_load/validate_schema.py:0: error: Incompatible types in a
 posthog/warehouse/data_load/validate_schema.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "str | int | Combinable") [assignment]
 posthog/warehouse/data_load/validate_schema.py:0: error: Incompatible types in assignment (expression has type "dict[str, dict[str, str | bool]] | dict[str, str]", variable has type "dict[str, dict[str, str]]") [assignment]
 posthog/warehouse/data_load/source_templates.py:0: error: Incompatible types in assignment (expression has type "str", variable has type "Type") [assignment]
-posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, ndarray[Any, dtype[Any]]]"; expected type "str" [index]
-posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, ndarray[Any, dtype[Any]]]"; expected type "str" [index]
-posthog/temporal/data_imports/pipelines/sql_database_v2/arrow_helpers.py:0: error: Invalid index type "str | None" for "dict[str, TColumnSchema]"; expected type "str" [index]
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: error: No overload variant of "get" of "dict" matches argument types "str", "tuple[()]" [call-overload]
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: Possible overload variants:
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def get(self, Type, /) -> Sequence[str] | None
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def get(self, Type, Sequence[str], /) -> Sequence[str]
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def [_T] get(self, Type, _T, /) -> Sequence[str] | _T
+posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: error: Argument "source_id" to "sync_old_schemas_with_new_schemas" has incompatible type "str"; expected "UUID" [arg-type]
 posthog/tasks/exports/test/test_csv_exporter.py:0: error: Function is missing a return type annotation [no-untyped-def]
 posthog/tasks/exports/test/test_csv_exporter.py:0: error: Function is missing a type annotation [no-untyped-def]
 posthog/tasks/exports/test/test_csv_exporter.py:0: error: Function is missing a type annotation for one or more arguments [no-untyped-def]
@@ -622,6 +718,7 @@ posthog/api/test/dashboards/test_dashboard.py:0: error: Value of type variable "
 posthog/api/test/dashboards/test_dashboard.py:0: error: Module "django.utils.timezone" does not explicitly export attribute "timedelta" [attr-defined]
 posthog/api/test/dashboards/test_dashboard.py:0: error: Module "django.utils.timezone" does not explicitly export attribute "timedelta" [attr-defined]
 posthog/api/test/dashboards/test_dashboard.py:0: error: Module "django.utils.timezone" does not explicitly export attribute "timedelta" [attr-defined]
+posthog/api/query.py:0: error: Statement is unreachable [unreachable]
 posthog/api/property_definition.py:0: error: Item "AnonymousUser" of "User | AnonymousUser" has no attribute "organization" [union-attr]
 posthog/api/property_definition.py:0: error: Item "None" of "Organization | Any | None" has no attribute "is_feature_available" [union-attr]
 posthog/api/property_definition.py:0: error: Item "ForeignObjectRel" of "Field[Any, Any] | ForeignObjectRel | GenericForeignKey" has no attribute "cached_col" [union-attr]
@@ -670,6 +767,16 @@ ee/clickhouse/views/experiments.py:0: error: Argument 4 to "ClickhouseTrendExper
 ee/clickhouse/views/experiments.py:0: error: Argument 4 to "ClickhouseFunnelExperimentResult" has incompatible type "datetime | None"; expected "datetime" [arg-type]
 ee/clickhouse/views/experiments.py:0: error: Argument 4 to "ClickhouseSecondaryExperimentResult" has incompatible type "datetime | None"; expected "datetime" [arg-type]
 ee/clickhouse/views/experiments.py:0: error: Item "None" of "User | None" has no attribute "email" [union-attr]
+posthog/warehouse/api/external_data_schema.py:0: error: Incompatible return value type (got "str | None", expected "SyncType | None") [return-value]
+posthog/warehouse/api/external_data_schema.py:0: error: Argument 1 to "get_sql_schemas_for_source_type" has incompatible type "str"; expected "Type" [arg-type]
+posthog/warehouse/api/external_data_schema.py:0: error: No overload variant of "get" of "dict" matches argument type "str" [call-overload]
+posthog/warehouse/api/external_data_schema.py:0: note: Possible overload variants:
+posthog/warehouse/api/external_data_schema.py:0: note: def get(self, Type, /) -> dict[str, list[IncrementalField]] | None
+posthog/warehouse/api/external_data_schema.py:0: note: def get(self, Type, dict[str, list[IncrementalField]], /) -> dict[str, list[IncrementalField]]
+posthog/warehouse/api/external_data_schema.py:0: note: def [_T] get(self, Type, _T, /) -> dict[str, list[IncrementalField]] | _T
+posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore]
+posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore]
 posthog/temporal/tests/batch_exports/test_run_updates.py:0: error: Unused "type: ignore" comment [unused-ignore]
 posthog/temporal/tests/batch_exports/test_run_updates.py:0: error: Unused "type: ignore" comment [unused-ignore]
 posthog/temporal/tests/batch_exports/test_run_updates.py:0: error: Unused "type: ignore" comment [unused-ignore]
@@ -678,27 +785,13 @@ posthog/temporal/tests/batch_exports/test_batch_exports.py:0: error: TypedDict k
 posthog/temporal/data_modeling/run_workflow.py:0: error: Dict entry 20 has incompatible type "str": "Literal['complex']"; expected "str": "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'binary', 'json', 'decimal', 'wei', 'date', 'time']" [dict-item]
 posthog/temporal/data_modeling/run_workflow.py:0: error: Dict entry 21 has incompatible type "str": "Literal['complex']"; expected "str": "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'binary', 'json', 'decimal', 'wei', 'date', 'time']" [dict-item]
 posthog/temporal/data_modeling/run_workflow.py:0: error: Dict entry 22 has incompatible type "str": "Literal['complex']"; expected "str": "Literal['text', 'double', 'bool', 'timestamp', 'bigint', 'binary', 'json', 'decimal', 'wei', 'date', 'time']" [dict-item]
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: error: No overload variant of "get" of "dict" matches argument types "str", "tuple[()]" [call-overload]
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: Possible overload variants:
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def get(self, Type, /) -> Sequence[str] | None
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def get(self, Type, Sequence[str], /) -> Sequence[str]
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: note: def [_T] get(self, Type, _T, /) -> Sequence[str] | _T
-posthog/temporal/data_imports/workflow_activities/sync_new_schemas.py:0: error: Argument "source_id" to "sync_old_schemas_with_new_schemas" has incompatible type "str"; expected "UUID" [arg-type]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Item "None" of "Incremental[Any] | None" has no attribute "row_order" [union-attr]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "Literal['asc', 'desc'] | Any | None", variable has type "Literal['asc', 'desc']") [assignment]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Column[Any]") [assignment]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "Literal['asc', 'desc']") [assignment]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Item "None" of "dict[str, Any] | None" has no attribute "get" [union-attr]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Argument "primary_key" to "make_hints" has incompatible type "list[str] | None"; expected "str | Sequence[str] | Callable[[Any], str | Sequence[str]]" [arg-type]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
-posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py:0: error: Unused "type: ignore" comment [unused-ignore]
 posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: "FilesystemDestinationClientConfiguration" has no attribute "delta_jobs_per_write" [attr-defined]
 posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: "type[FilesystemDestinationClientConfiguration]" has no attribute "delta_jobs_per_write" [attr-defined]
 posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "DataWarehouseCredential | Combinable | None") [assignment]
 posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "object", variable has type "str | int | Combinable") [assignment]
-posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Incompatible types in assignment (expression has type "dict[str, dict[str, str | bool]] | dict[str, str]", variable has type "dict[str, dict[str, str]]") [assignment]
+posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Right operand of "and" is never evaluated [unreachable]
+posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Statement is unreachable [unreachable]
+posthog/temporal/data_imports/pipelines/pipeline_sync.py:0: error: Name "raw_db_columns" already defined on line 0 [no-redef]
 posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type]
 posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type]
 posthog/queries/app_metrics/test/test_app_metrics.py:0: error: Argument 3 to "AppMetricsErrorDetailsQuery" has incompatible type "AppMetricsRequestSerializer"; expected "AppMetricsErrorsRequestSerializer" [arg-type]
@@ -726,23 +819,6 @@ posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0:
 posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: Need type annotation for "_execute_async_calls" (hint: "_execute_async_calls: list[<type>] = ...") [var-annotated]
 posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: Need type annotation for "_cursors" (hint: "_cursors: list[<type>] = ...") [var-annotated]
 posthog/temporal/tests/batch_exports/test_snowflake_batch_export_workflow.py:0: error: List item 0 has incompatible type "tuple[str, str, int, int, int, int, str, int]"; expected "tuple[str, str, int, int, str, str, str, str]" [list-item]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: error: No overload variant of "with_only_columns" of "Select" matches argument type "ReadOnlyColumnCollection[str, Column[Any]]" [call-overload]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: Possible overload variants:
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], /) -> Select[tuple[_T0]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], /) -> Select[tuple[_T0, _T1]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], /) -> Select[tuple[_T0, _T1, _T2]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], /) -> Select[tuple[_T0, _T1, _T2, _T3]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5, _T6] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], TypedColumnsClauseRole[_T6] | SQLCoreOperations[_T6] | type[_T6], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5, _T6]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [_T0, _T1, _T2, _T3, _T4, _T5, _T6, _T7] with_only_columns(self, TypedColumnsClauseRole[_T0] | SQLCoreOperations[_T0] | type[_T0], TypedColumnsClauseRole[_T1] | SQLCoreOperations[_T1] | type[_T1], TypedColumnsClauseRole[_T2] | SQLCoreOperations[_T2] | type[_T2], TypedColumnsClauseRole[_T3] | SQLCoreOperations[_T3] | type[_T3], TypedColumnsClauseRole[_T4] | SQLCoreOperations[_T4] | type[_T4], TypedColumnsClauseRole[_T5] | SQLCoreOperations[_T5] | type[_T5], TypedColumnsClauseRole[_T6] | SQLCoreOperations[_T6] | type[_T6], TypedColumnsClauseRole[_T7] | SQLCoreOperations[_T7] | type[_T7], /) -> Select[tuple[_T0, _T1, _T2, _T3, _T4, _T5, _T6, _T7]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def with_only_columns(self, *entities: TypedColumnsClauseRole[Any] | ColumnsClauseRole | SQLCoreOperations[Any] | Literal['*', 1] | type[Any] | Inspectable[_HasClauseElement[Any]] | _HasClauseElement[Any], maintain_column_froms: bool = ..., **Any) -> Select[Any]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: error: No overload variant of "resource" matches argument types "Callable[[Engine, Table, int, Literal['sqlalchemy', 'pyarrow', 'pandas', 'connectorx'], Incremental[Any] | None, bool, Callable[[Table], None] | None, Literal['minimal', 'full', 'full_with_precision'], dict[str, Any] | None, Callable[[TypeEngine[Any]], TypeEngine[Any] | type[TypeEngine[Any]] | None] | None, list[str] | None, Callable[[Select[Any], Table], Select[Any]] | None, list[str] | None], Iterator[Any]]", "str", "list[str] | None", "list[str] | None", "dict[str, TColumnSchema]", "Collection[str]", "str" [call-overload]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: Possible overload variants:
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TResourceFunParams`-1, TDltResourceImpl: DltResource] resource(Callable[TResourceFunParams, Any], /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) -> TDltResourceImpl
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(None = ..., /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) -> Callable[[Callable[TResourceFunParams, Any]], TDltResourceImpl]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(None = ..., /, name: str | Callable[[Any], str] = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ..., standalone: Literal[True] = ...) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, TDltResourceImpl]]
-posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py:0: note: def [TDltResourceImpl: DltResource] resource(list[Any] | tuple[Any] | Iterator[Any], /, name: str = ..., table_name: str | Callable[[Any], str] = ..., max_table_nesting: int = ..., write_disposition: Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict | Callable[[Any], Literal['skip', 'append', 'replace', 'merge'] | TWriteDispositionDict | TMergeDispositionDict | TScd2StrategyDict] = ..., columns: dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel] | Callable[[Any], dict[str, TColumnSchema] | Sequence[TColumnSchema] | BaseModel | type[BaseModel]] = ..., primary_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., merge_key: str | Sequence[str] | Callable[[Any], str | Sequence[str]] = ..., schema_contract: Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | Callable[[Any], Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict] = ..., table_format: Literal['iceberg', 'delta', 'hive'] | Callable[[Any], Literal['iceberg', 'delta', 'hive']] = ..., file_format: Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference'] | Callable[[Any], Literal['preferred', 'jsonl', 'typed-jsonl', 'insert_values', 'parquet', 'csv', 'reference']] = ..., references: Sequence[TTableReference] | Callable[[Any], Sequence[TTableReference]] = ..., selected: bool = ..., spec: type[BaseConfiguration] = ..., parallelized: bool = ..., _impl_cls: type[TDltResourceImpl] = ...) -> TDltResourceImpl
 posthog/migrations/0237_remove_timezone_from_teams.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type]
 posthog/migrations/0228_fix_tile_layouts.py:0: error: Argument 2 to "RunPython" has incompatible type "Callable[[Migration, Any], None]"; expected "_CodeCallable | None" [arg-type]
 posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not defined [name-defined]
@@ -751,78 +827,33 @@ posthog/api/plugin_log_entry.py:0: error: Name "timezone.datetime" is not define
 posthog/api/plugin_log_entry.py:0: error: Module "django.utils.timezone" does not explicitly export attribute "datetime" [attr-defined]
 posthog/temporal/tests/batch_exports/test_redshift_batch_export_workflow.py:0: error: Incompatible types in assignment (expression has type "str | int", variable has type "int") [assignment]
 posthog/api/sharing.py:0: error: Item "None" of "list[Any] | None" has no attribute "__iter__" (not iterable) [union-attr]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Dict entry 2 has incompatible type "Literal['auto']": "None"; expected "Literal['json_response', 'header_link', 'auto', 'single_page', 'cursor', 'offset', 'page_number']": "type[BasePaginator]" [dict-item]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "None", variable has type "AuthConfigBase") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Argument 1 to "get_auth_class" has incompatible type "Literal['bearer', 'api_key', 'http_basic'] | None"; expected "Literal['bearer', 'api_key', 'http_basic']" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Need type annotation for "dependency_graph" [var-annotated]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "None", target has type "ResolvedParam") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible return value type (got "tuple[TopologicalSorter[Any], dict[str, EndpointResource], dict[str, ResolvedParam]]", expected "tuple[Any, dict[str, EndpointResource], dict[str, ResolvedParam | None]]") [return-value]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("str | Endpoint | None") [operator]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type variable "StrOrLiteralStr" of "parse" of "Formatter" cannot be "str | None" [type-var]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None") [operator]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unsupported right operand type for in ("dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None") [operator]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" is not indexable [index]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" has no attribute "pop" [union-attr]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Value of type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" is not indexable [index]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "str | None" has no attribute "format" [union-attr]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Argument 1 to "single_entity_path" has incompatible type "str | None"; expected "str" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Item "None" of "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None" has no attribute "items" [union-attr]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "str") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Incompatible types in assignment (expression has type "str | None", variable has type "str") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Statement is unreachable [unreachable]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 0 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, Any]" [dict-item]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 1 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, Any]" [dict-item]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 0 has incompatible type "dict[str, Any] | None"; expected "SupportsKeysAndGetItem[str, ResolveParamConfig | IncrementalParamConfig | Any]" [dict-item]
-posthog/temporal/data_imports/pipelines/rest_source/config_setup.py:0: error: Unpacked dict entry 1 has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "SupportsKeysAndGetItem[str, ResolveParamConfig | IncrementalParamConfig | Any]" [dict-item]
+posthog/temporal/data_imports/external_data_job.py:0: error: Argument "status" to "update_external_job_status" has incompatible type "str"; expected "Status" [arg-type]
 posthog/api/test/batch_exports/conftest.py:0: error: Signature of "run" incompatible with supertype "Worker" [override]
 posthog/api/test/batch_exports/conftest.py:0: note: Superclass:
 posthog/api/test/batch_exports/conftest.py:0: note: def run(self) -> Coroutine[Any, Any, None]
 posthog/api/test/batch_exports/conftest.py:0: note: Subclass:
 posthog/api/test/batch_exports/conftest.py:0: note: def run(self, loop: Any) -> Any
 posthog/api/test/batch_exports/conftest.py:0: error: Argument "activities" to "ThreadedWorker" has incompatible type "list[function]"; expected "Sequence[Callable[..., Any]]" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Not all union combinations were tried because there are too many unions [misc]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 2 to "source" has incompatible type "str | None"; expected "str" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 3 to "source" has incompatible type "str | None"; expected "str" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 4 to "source" has incompatible type "int | None"; expected "int" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 6 to "source" has incompatible type "Schema | None"; expected "Schema" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 7 to "source" has incompatible type "Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict | None"; expected "Literal['evolve', 'discard_value', 'freeze', 'discard_row'] | TSchemaContractDict" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 8 to "source" has incompatible type "type[BaseConfiguration] | None"; expected "type[BaseConfiguration]" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "build_resource_dependency_graph" has incompatible type "EndpointResourceBase | None"; expected "EndpointResourceBase" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Incompatible types in assignment (expression has type "list[str] | None", variable has type "list[str]") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "setup_incremental_object" has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "dict[str, Any]" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument "base_url" to "RESTClient" has incompatible type "str | None"; expected "str" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Argument 1 to "exclude_keys" has incompatible type "dict[str, ResolveParamConfig | IncrementalParamConfig | Any] | None"; expected "Mapping[str, Any]" [arg-type]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Incompatible default for argument "resolved_param" (default has type "ResolvedParam | None", argument has type "ResolvedParam") [assignment]
-posthog/temporal/data_imports/pipelines/rest_source/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore]
 posthog/api/test/test_team.py:0: error: "HttpResponse" has no attribute "json" [attr-defined]
 posthog/api/test/test_team.py:0: error: "HttpResponse" has no attribute "json" [attr-defined]
+posthog/api/test/test_capture.py:0: error: Statement is unreachable [unreachable]
+posthog/api/test/test_capture.py:0: error: Incompatible return value type (got "_MonkeyPatchedWSGIResponse", expected "HttpResponse") [return-value]
+posthog/api/test/test_capture.py:0: error: Module has no attribute "utc" [attr-defined]
+posthog/api/test/test_capture.py:0: error: Unpacked dict entry 0 has incompatible type "Collection[str]"; expected "SupportsKeysAndGetItem[str, dict[Never, Never]]" [dict-item]
+posthog/api/test/test_capture.py:0: error: Unpacked dict entry 0 has incompatible type "Collection[str]"; expected "SupportsKeysAndGetItem[str, dict[Never, Never]]" [dict-item]
+posthog/api/test/test_capture.py:0: error: Unpacked dict entry 0 has incompatible type "Collection[str]"; expected "SupportsKeysAndGetItem[str, dict[Never, Never]]" [dict-item]
+posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item]
+posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item]
+posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item]
+posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item] +posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item] +posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item] posthog/test/test_middleware.py:0: error: Incompatible types in assignment (expression has type "_MonkeyPatchedWSGIResponse", variable has type "_MonkeyPatchedResponse") [assignment] -posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/zendesk/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/vitally/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/pipelines/stripe/__init__.py:0: 
error: Unused "type: ignore" comment [unused-ignore] +posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] +posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] +posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] +posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] +posthog/temporal/tests/data_imports/test_end_to_end.py:0: error: Unused "type: ignore" comment [unused-ignore] posthog/management/commands/test/test_create_batch_export_from_app.py:0: error: Incompatible return value type (got "dict[str, Collection[str]]", expected "dict[str, str]") [return-value] posthog/management/commands/test/test_create_batch_export_from_app.py:0: error: Incompatible types in assignment (expression has type "dict[str, Collection[str]]", variable has type "dict[str, str]") [assignment] posthog/management/commands/test/test_create_batch_export_from_app.py:0: error: Unpacked dict entry 1 has incompatible type "str"; expected "SupportsKeysAndGetItem[str, str]" [dict-item] @@ -864,32 +895,3 @@ posthog/api/test/batch_exports/test_update.py:0: error: Value of type "BatchExpo posthog/api/test/batch_exports/test_update.py:0: error: Value of type "BatchExport" is not indexable [index] posthog/api/test/batch_exports/test_update.py:0: error: Value of type "BatchExport" is not indexable [index] posthog/api/test/batch_exports/test_pause.py:0: error: "batch_export_delete_schedule" does not return a value (it only ever returns None) [func-returns-value] -posthog/warehouse/api/external_data_schema.py:0: error: Incompatible return value type (got "str | None", expected "SyncType | None") [return-value] -posthog/warehouse/api/external_data_schema.py:0: error: Argument 1 to "get_sql_schemas_for_source_type" has incompatible type "str"; expected "Type" [arg-type] -posthog/warehouse/api/external_data_schema.py:0: error: No overload variant of "get" of "dict" matches argument type "str" [call-overload] -posthog/warehouse/api/external_data_schema.py:0: note: Possible overload variants: -posthog/warehouse/api/external_data_schema.py:0: note: def get(self, Type, /) -> dict[str, list[IncrementalField]] | None -posthog/warehouse/api/external_data_schema.py:0: note: def get(self, Type, dict[str, list[IncrementalField]], /) -> dict[str, list[IncrementalField]] -posthog/warehouse/api/external_data_schema.py:0: note: def [_T] get(self, Type, _T, /) -> dict[str, list[IncrementalField]] | _T -posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/warehouse/api/table.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/temporal/data_imports/external_data_job.py:0: error: Argument "status" to "update_external_job_status" has incompatible type "str"; expected "Status" [arg-type] -posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] -posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected 
type "Type" [index] -posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] -posthog/temporal/tests/external_data/test_external_data_job.py:0: error: Invalid index type "str" for "dict[Type, Sequence[str]]"; expected type "Type" [index] -posthog/temporal/tests/data_imports/test_end_to_end.py:0: error: Unused "type: ignore" comment [unused-ignore] -posthog/api/query.py:0: error: Statement is unreachable [unreachable] -posthog/api/test/test_capture.py:0: error: Statement is unreachable [unreachable] -posthog/api/test/test_capture.py:0: error: Incompatible return value type (got "_MonkeyPatchedWSGIResponse", expected "HttpResponse") [return-value] -posthog/api/test/test_capture.py:0: error: Module has no attribute "utc" [attr-defined] -posthog/api/test/test_capture.py:0: error: Unpacked dict entry 0 has incompatible type "Collection[str]"; expected "SupportsKeysAndGetItem[str, dict[Never, Never]]" [dict-item] -posthog/api/test/test_capture.py:0: error: Unpacked dict entry 0 has incompatible type "Collection[str]"; expected "SupportsKeysAndGetItem[str, dict[Never, Never]]" [dict-item] -posthog/api/test/test_capture.py:0: error: Unpacked dict entry 0 has incompatible type "Collection[str]"; expected "SupportsKeysAndGetItem[str, dict[Never, Never]]" [dict-item] -posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item] -posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item] -posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item] -posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item] -posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item] -posthog/api/test/test_capture.py:0: error: Dict entry 0 has incompatible type "str": "float"; expected "str": "int" [dict-item] diff --git a/posthog/constants.py b/posthog/constants.py index a2c69e8058947..7a04658989d6c 100644 --- a/posthog/constants.py +++ b/posthog/constants.py @@ -302,7 +302,7 @@ class FlagRequestType(StrEnum): ENRICHED_DASHBOARD_INSIGHT_IDENTIFIER = "Feature Viewed" DATA_WAREHOUSE_TASK_QUEUE = "data-warehouse-task-queue" -V2_DATA_WAREHOUSE_TASK_QUEUE = "v2-data-warehouse-task-queue" +DATA_WAREHOUSE_TASK_QUEUE_V2 = "v2-data-warehouse-task-queue" BATCH_EXPORTS_TASK_QUEUE = "batch-exports-task-queue" SYNC_BATCH_EXPORTS_TASK_QUEUE = "no-sandbox-python-django" GENERAL_PURPOSE_TASK_QUEUE = "general-purpose-task-queue" diff --git a/posthog/hogql/database/s3_table.py b/posthog/hogql/database/s3_table.py index e5136bc2348cf..479969ae93bd1 100644 --- a/posthog/hogql/database/s3_table.py +++ b/posthog/hogql/database/s3_table.py @@ -1,5 +1,5 @@ import re -from typing import Optional +from typing import TYPE_CHECKING, Optional from posthog.clickhouse.client.escape import substitute_params from posthog.hogql.context import HogQLContext @@ -7,6 +7,9 @@ from posthog.hogql.errors import ExposedHogQLError from posthog.hogql.escape_sql import escape_hogql_identifier +if TYPE_CHECKING: + from posthog.warehouse.models import ExternalDataJob + def build_function_call( url: str, @@ -15,7 +18,10 @@ def build_function_call( access_secret: Optional[str] = None, structure: Optional[str] = None, context: 
Optional[HogQLContext] = None, + pipeline_version: Optional["ExternalDataJob.PipelineVersion"] = None, ) -> str: + from posthog.warehouse.models import ExternalDataJob + raw_params: dict[str, str] = {} def add_param(value: str, is_sensitive: bool = True) -> str: @@ -36,10 +42,18 @@ def return_expr(expr: str) -> str: # DeltaS3Wrapper format if format == "DeltaS3Wrapper": + query_folder = "__query_v2" if pipeline_version == ExternalDataJob.PipelineVersion.V2 else "__query" + if url.endswith("/"): - escaped_url = add_param(f"{url[:len(url) - 1]}__query/*.parquet") + if pipeline_version == ExternalDataJob.PipelineVersion.V2: + escaped_url = add_param(f"{url[:-5]}{query_folder}/*.parquet") + else: + escaped_url = add_param(f"{url[:-1]}{query_folder}/*.parquet") else: - escaped_url = add_param(f"{url}__query/*.parquet") + if pipeline_version == ExternalDataJob.PipelineVersion.V2: + escaped_url = add_param(f"{url[:-4]}{query_folder}/*.parquet") + else: + escaped_url = add_param(f"{url}{query_folder}/*.parquet") if structure: escaped_structure = add_param(structure, False) diff --git a/posthog/management/commands/start_temporal_worker.py b/posthog/management/commands/start_temporal_worker.py index 005114ac4c704..77701478f2ded 100644 --- a/posthog/management/commands/start_temporal_worker.py +++ b/posthog/management/commands/start_temporal_worker.py @@ -11,9 +11,9 @@ from posthog.constants import ( BATCH_EXPORTS_TASK_QUEUE, DATA_WAREHOUSE_TASK_QUEUE, + DATA_WAREHOUSE_TASK_QUEUE_V2, GENERAL_PURPOSE_TASK_QUEUE, SYNC_BATCH_EXPORTS_TASK_QUEUE, - V2_DATA_WAREHOUSE_TASK_QUEUE, ) from posthog.temporal.batch_exports import ( ACTIVITIES as BATCH_EXPORTS_ACTIVITIES, @@ -28,14 +28,14 @@ SYNC_BATCH_EXPORTS_TASK_QUEUE: BATCH_EXPORTS_WORKFLOWS, BATCH_EXPORTS_TASK_QUEUE: BATCH_EXPORTS_WORKFLOWS, DATA_WAREHOUSE_TASK_QUEUE: DATA_SYNC_WORKFLOWS + DATA_MODELING_WORKFLOWS, - V2_DATA_WAREHOUSE_TASK_QUEUE: DATA_SYNC_WORKFLOWS + DATA_MODELING_WORKFLOWS, + DATA_WAREHOUSE_TASK_QUEUE_V2: DATA_SYNC_WORKFLOWS + DATA_MODELING_WORKFLOWS, GENERAL_PURPOSE_TASK_QUEUE: PROXY_SERVICE_WORKFLOWS, } ACTIVITIES_DICT = { SYNC_BATCH_EXPORTS_TASK_QUEUE: BATCH_EXPORTS_ACTIVITIES, BATCH_EXPORTS_TASK_QUEUE: BATCH_EXPORTS_ACTIVITIES, DATA_WAREHOUSE_TASK_QUEUE: DATA_SYNC_ACTIVITIES + DATA_MODELING_ACTIVITIES, - V2_DATA_WAREHOUSE_TASK_QUEUE: DATA_SYNC_ACTIVITIES + DATA_MODELING_ACTIVITIES, + DATA_WAREHOUSE_TASK_QUEUE_V2: DATA_SYNC_ACTIVITIES + DATA_MODELING_ACTIVITIES, GENERAL_PURPOSE_TASK_QUEUE: PROXY_SERVICE_ACTIVITIES, } diff --git a/posthog/migrations/0533_externaldatajob_pipeline_version.py b/posthog/migrations/0533_externaldatajob_pipeline_version.py new file mode 100644 index 0000000000000..e5db9a99a6d8d --- /dev/null +++ b/posthog/migrations/0533_externaldatajob_pipeline_version.py @@ -0,0 +1,30 @@ +# Generated by Django 4.2.15 on 2024-11-23 14:49 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("posthog", "0532_taxonomy_unique_on_project"), + ] + + operations = [ + migrations.AddField( + model_name="externaldatajob", + name="pipeline_version", + field=models.CharField( + blank=True, + choices=[("v1-dlt-sync", "v1-dlt-sync"), ("v2-non-dlt", "v2-non-dlt")], + max_length=400, + null=True, + ), + ), + migrations.RunSQL( + """ + UPDATE posthog_externaldatajob + SET pipeline_version = 'v1-dlt-sync' + WHERE pipeline_version is null + """, + reverse_sql=migrations.RunSQL.noop, + ), + ] diff --git a/posthog/migrations/max_migration.txt b/posthog/migrations/max_migration.txt index 
01fc03d62a8a0..44547aebb012e 100644 --- a/posthog/migrations/max_migration.txt +++ b/posthog/migrations/max_migration.txt @@ -1 +1 @@ -0532_taxonomy_unique_on_project +0533_externaldatajob_pipeline_version diff --git a/posthog/tasks/test/test_usage_report.py b/posthog/tasks/test/test_usage_report.py index 88e2c5a7ac758..6215af18f6821 100644 --- a/posthog/tasks/test/test_usage_report.py +++ b/posthog/tasks/test/test_usage_report.py @@ -1314,11 +1314,23 @@ def test_external_data_rows_synced_response( for i in range(5): start_time = (now() - relativedelta(hours=i)).strftime("%Y-%m-%dT%H:%M:%SZ") - ExternalDataJob.objects.create(team_id=3, created_at=start_time, rows_synced=10, pipeline=source) + ExternalDataJob.objects.create( + team_id=3, + created_at=start_time, + rows_synced=10, + pipeline=source, + pipeline_version=ExternalDataJob.PipelineVersion.V1, + ) for i in range(5): start_time = (now() - relativedelta(hours=i)).strftime("%Y-%m-%dT%H:%M:%SZ") - ExternalDataJob.objects.create(team_id=4, created_at=start_time, rows_synced=10, pipeline=source) + ExternalDataJob.objects.create( + team_id=4, + created_at=start_time, + rows_synced=10, + pipeline=source, + pipeline_version=ExternalDataJob.PipelineVersion.V1, + ) period = get_previous_day(at=now() + relativedelta(days=1)) period_start, period_end = period @@ -1343,6 +1355,64 @@ def test_external_data_rows_synced_response( assert org_2_report["organization_name"] == "Org 2" assert org_2_report["rows_synced_in_period"] == 0 + @patch("posthog.tasks.usage_report.Client") + @patch("posthog.tasks.usage_report.send_report_to_billing_service") + def test_external_data_rows_synced_response_with_v2_jobs( + self, billing_task_mock: MagicMock, posthog_capture_mock: MagicMock + ) -> None: + self._setup_teams() + + source = ExternalDataSource.objects.create( + team=self.analytics_team, + source_id="source_id", + connection_id="connection_id", + status=ExternalDataSource.Status.COMPLETED, + source_type=ExternalDataSource.Type.STRIPE, + ) + + for i in range(5): + start_time = (now() - relativedelta(hours=i)).strftime("%Y-%m-%dT%H:%M:%SZ") + ExternalDataJob.objects.create( + team_id=3, + created_at=start_time, + rows_synced=10, + pipeline=source, + pipeline_version=ExternalDataJob.PipelineVersion.V1, + ) + + for i in range(5): + start_time = (now() - relativedelta(hours=i)).strftime("%Y-%m-%dT%H:%M:%SZ") + ExternalDataJob.objects.create( + team_id=4, + created_at=start_time, + rows_synced=10, + pipeline=source, + pipeline_version=ExternalDataJob.PipelineVersion.V2, + ) + + period = get_previous_day(at=now() + relativedelta(days=1)) + period_start, period_end = period + all_reports = _get_all_org_reports(period_start, period_end) + + assert len(all_reports) == 3 + + org_1_report = _get_full_org_usage_report_as_dict( + _get_full_org_usage_report(all_reports[str(self.org_1.id)], get_instance_metadata(period)) + ) + + org_2_report = _get_full_org_usage_report_as_dict( + _get_full_org_usage_report(all_reports[str(self.org_2.id)], get_instance_metadata(period)) + ) + + assert org_1_report["organization_name"] == "Org 1" + assert org_1_report["rows_synced_in_period"] == 50 + + assert org_1_report["teams"]["3"]["rows_synced_in_period"] == 50 + assert org_1_report["teams"]["4"]["rows_synced_in_period"] == 0 # V2 pipelines + + assert org_2_report["organization_name"] == "Org 2" + assert org_2_report["rows_synced_in_period"] == 0 + @freeze_time("2022-01-10T00:01:00Z") class TestHogFunctionUsageReports(ClickhouseDestroyTablesMixin, TestCase, ClickhouseTestMixin): diff 
--git a/posthog/tasks/test/test_warehouse.py b/posthog/tasks/test/test_warehouse.py index c6150ef565336..ec7bce8c7832f 100644 --- a/posthog/tasks/test/test_warehouse.py +++ b/posthog/tasks/test/test_warehouse.py @@ -36,7 +36,12 @@ def test_capture_workspace_rows_synced_by_team_month_cutoff(self, mock_get_ph_cl with freeze_time("2023-11-07T16:50:49Z"): job = ExternalDataJob.objects.create( - pipeline=source, workflow_id="fake_workflow_id", team=self.team, status="Running", rows_synced=100000 + pipeline=source, + workflow_id="fake_workflow_id", + team=self.team, + status="Running", + rows_synced=100000, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) capture_workspace_rows_synced_by_team(self.team.pk) @@ -86,12 +91,22 @@ def test_capture_workspace_rows_synced_by_team_month_cutoff_field_set(self, mock with freeze_time("2023-10-30T18:32:41Z"): ExternalDataJob.objects.create( - pipeline=source, workflow_id="fake_workflow_id", team=self.team, status="Completed", rows_synced=97747 + pipeline=source, + workflow_id="fake_workflow_id", + team=self.team, + status="Completed", + rows_synced=97747, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) with freeze_time("2023-11-07T16:50:49Z"): job2 = ExternalDataJob.objects.create( - pipeline=source, workflow_id="fake_workflow_id", team=self.team, status="Completed", rows_synced=93353 + pipeline=source, + workflow_id="fake_workflow_id", + team=self.team, + status="Completed", + rows_synced=93353, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) capture_workspace_rows_synced_by_team(self.team.pk) diff --git a/posthog/tasks/usage_report.py b/posthog/tasks/usage_report.py index ef85f8faa3210..6e99b069d754d 100644 --- a/posthog/tasks/usage_report.py +++ b/posthog/tasks/usage_report.py @@ -687,6 +687,7 @@ def get_teams_with_survey_responses_count_in_period( def get_teams_with_rows_synced_in_period(begin: datetime, end: datetime) -> list: return list( ExternalDataJob.objects.filter(created_at__gte=begin, created_at__lte=end) + .exclude(pipeline_version=ExternalDataJob.PipelineVersion.V2) .values("team_id") .annotate(total=Sum("rows_synced")) ) diff --git a/posthog/temporal/data_imports/__init__.py b/posthog/temporal/data_imports/__init__.py index c59f20b05d8cf..aab0a74ac554c 100644 --- a/posthog/temporal/data_imports/__init__.py +++ b/posthog/temporal/data_imports/__init__.py @@ -6,6 +6,7 @@ update_external_data_job_model, check_billing_limits_activity, sync_new_schemas_activity, + trigger_pipeline_v2, ) WORKFLOWS = [ExternalDataJobWorkflow] @@ -17,4 +18,5 @@ create_source_templates, check_billing_limits_activity, sync_new_schemas_activity, + trigger_pipeline_v2, ] diff --git a/posthog/temporal/data_imports/external_data_job.py b/posthog/temporal/data_imports/external_data_job.py index 916da24a1dbb2..62a1e1bc834ed 100644 --- a/posthog/temporal/data_imports/external_data_job.py +++ b/posthog/temporal/data_imports/external_data_job.py @@ -1,15 +1,22 @@ +import asyncio import dataclasses import datetime as dt import json import re +from django.conf import settings from django.db import close_old_connections import posthoganalytics from temporalio import activity, exceptions, workflow from temporalio.common import RetryPolicy +from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 + # TODO: remove dependency +from posthog.settings.base_variables import TEST from posthog.temporal.batch_exports.base import PostHogWorkflow +from posthog.temporal.common.client import sync_connect +from posthog.temporal.data_imports.util import is_posthog_team 
from posthog.temporal.data_imports.workflow_activities.check_billing_limits import ( CheckBillingLimitsActivityInputs, check_billing_limits_activity, @@ -131,6 +138,30 @@ def update_external_data_job_model(inputs: UpdateExternalDataJobStatusInputs) -> ) +@activity.defn +def trigger_pipeline_v2(inputs: ExternalDataWorkflowInputs): + logger = bind_temporal_worker_logger_sync(team_id=inputs.team_id) + logger.debug("Triggering V2 pipeline") + + temporal = sync_connect() + + asyncio.run( + temporal.start_workflow( + workflow="external-data-job", + arg=dataclasses.asdict(inputs), + id=f"{inputs.external_data_schema_id}-V2", + task_queue=str(DATA_WAREHOUSE_TASK_QUEUE_V2), + retry_policy=RetryPolicy( + maximum_interval=dt.timedelta(seconds=60), + maximum_attempts=1, + non_retryable_error_types=["NondeterminismError"], + ), + ) + ) + + logger.debug("V2 pipeline triggered") + + @dataclasses.dataclass class CreateSourceTemplateInputs: team_id: int @@ -154,6 +185,18 @@ def parse_inputs(inputs: list[str]) -> ExternalDataWorkflowInputs: async def run(self, inputs: ExternalDataWorkflowInputs): assert inputs.external_data_schema_id is not None + if ( + settings.TEMPORAL_TASK_QUEUE != DATA_WAREHOUSE_TASK_QUEUE_V2 + and not TEST + and is_posthog_team(inputs.team_id) + ): + await workflow.execute_activity( + trigger_pipeline_v2, + inputs, + start_to_close_timeout=dt.timedelta(minutes=1), + retry_policy=RetryPolicy(maximum_attempts=1), + ) + update_inputs = UpdateExternalDataJobStatusInputs( job_id=None, status=ExternalDataJob.Status.COMPLETED, diff --git a/posthog/temporal/data_imports/pipelines/chargebee/__init__.py b/posthog/temporal/data_imports/pipelines/chargebee/__init__.py index 245afb6e5d880..7a093e65f7364 100644 --- a/posthog/temporal/data_imports/pipelines/chargebee/__init__.py +++ b/posthog/temporal/data_imports/pipelines/chargebee/__init__.py @@ -218,7 +218,13 @@ def update_request(self, request: Request) -> None: @dlt.source(max_table_nesting=0) def chargebee_source( - api_key: str, site_name: str, endpoint: str, team_id: int, job_id: str, is_incremental: bool = False + api_key: str, + site_name: str, + endpoint: str, + team_id: int, + job_id: str, + db_incremental_field_last_value: Optional[Any], + is_incremental: bool = False, ): config: RESTAPIConfig = { "client": { @@ -242,7 +248,7 @@ def chargebee_source( "resources": [get_resource(endpoint, is_incremental)], } - yield from rest_api_resources(config, team_id, job_id) + yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value) def validate_credentials(api_key: str, site_name: str) -> bool: diff --git a/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py b/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py new file mode 100644 index 0000000000000..64cbbda922863 --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/pipeline/delta_table_helper.py @@ -0,0 +1,116 @@ +from collections.abc import Sequence +from conditional_cache import lru_cache +from typing import Any +import pyarrow as pa +from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_schema +from dlt.common.normalizers.naming.snake_case import NamingConvention +import deltalake as deltalake +from django.conf import settings +from posthog.settings.base_variables import TEST +from posthog.warehouse.models import ExternalDataJob + + +class DeltaTableHelper: + _resource_name: str + _job: ExternalDataJob + + def __init__(self, resource_name: str, job: ExternalDataJob) -> None: + self._resource_name = 
resource_name + self._job = job + + def _get_credentials(self): + if TEST: + return { + "aws_access_key_id": settings.AIRBYTE_BUCKET_KEY, + "aws_secret_access_key": settings.AIRBYTE_BUCKET_SECRET, + "endpoint_url": settings.OBJECT_STORAGE_ENDPOINT, + "region_name": settings.AIRBYTE_BUCKET_REGION, + "AWS_DEFAULT_REGION": settings.AIRBYTE_BUCKET_REGION, + "AWS_ALLOW_HTTP": "true", + "AWS_S3_ALLOW_UNSAFE_RENAME": "true", + } + + return { + "aws_access_key_id": settings.AIRBYTE_BUCKET_KEY, + "aws_secret_access_key": settings.AIRBYTE_BUCKET_SECRET, + "region_name": settings.AIRBYTE_BUCKET_REGION, + "AWS_DEFAULT_REGION": settings.AIRBYTE_BUCKET_REGION, + "AWS_S3_ALLOW_UNSAFE_RENAME": "true", + } + + def _get_delta_table_uri(self) -> str: + normalized_resource_name = NamingConvention().normalize_identifier(self._resource_name) + # Appended __v2 on to the end of the url so that data of the V2 pipeline isn't the same as V1 + return f"{settings.BUCKET_URL}/{self._job.folder_path()}/{normalized_resource_name}__v2" + + def _evolve_delta_schema(self, schema: pa.Schema) -> deltalake.DeltaTable: + delta_table = self.get_delta_table() + if delta_table is None: + raise Exception("Deltalake table not found") + + delta_table_schema = delta_table.schema().to_pyarrow() + + new_fields = [ + deltalake.Field.from_pyarrow(field) + for field in ensure_delta_compatible_arrow_schema(schema) + if field.name not in delta_table_schema.names + ] + if new_fields: + delta_table.alter.add_columns(new_fields) + + return delta_table + + @lru_cache(maxsize=1, condition=lambda result: result is not None) + def get_delta_table(self) -> deltalake.DeltaTable | None: + delta_uri = self._get_delta_table_uri() + storage_options = self._get_credentials() + + if deltalake.DeltaTable.is_deltatable(table_uri=delta_uri, storage_options=storage_options): + return deltalake.DeltaTable(table_uri=delta_uri, storage_options=storage_options) + + return None + + def write_to_deltalake( + self, data: pa.Table, is_incremental: bool, chunk_index: int, primary_keys: Sequence[Any] | None + ) -> deltalake.DeltaTable: + delta_table = self.get_delta_table() + + if delta_table: + delta_table = self._evolve_delta_schema(data.schema) + + if is_incremental and delta_table is not None: + if not primary_keys or len(primary_keys) == 0: + raise Exception("Primary key required for incremental syncs") + + delta_table.merge( + source=data, + source_alias="source", + target_alias="target", + predicate=" AND ".join([f"source.{c} = target.{c}" for c in primary_keys]), + ).when_matched_update_all().when_not_matched_insert_all().execute() + else: + mode = "append" + schema_mode = "merge" + if chunk_index == 0 or delta_table is None: + mode = "overwrite" + schema_mode = "overwrite" + + if delta_table is None: + storage_options = self._get_credentials() + delta_table = deltalake.DeltaTable.create( + table_uri=self._get_delta_table_uri(), schema=data.schema, storage_options=storage_options + ) + + deltalake.write_deltalake( + table_or_uri=delta_table, + data=data, + partition_by=None, + mode=mode, + schema_mode=schema_mode, + engine="rust", + ) # type: ignore + + delta_table = self.get_delta_table() + assert delta_table is not None + + return delta_table diff --git a/posthog/temporal/data_imports/pipelines/pipeline/hogql_schema.py b/posthog/temporal/data_imports/pipelines/pipeline/hogql_schema.py new file mode 100644 index 0000000000000..383a3296f0435 --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/pipeline/hogql_schema.py @@ -0,0 +1,63 @@ +import pyarrow as 
pa +import deltalake as deltalake +from posthog.hogql.database.models import ( + BooleanDatabaseField, + DatabaseField, + DateDatabaseField, + DateTimeDatabaseField, + FloatDatabaseField, + IntegerDatabaseField, + StringDatabaseField, + StringJSONDatabaseField, +) + + +class HogQLSchema: + schema: dict[str, str] + + def __init__(self): + self.schema = {} + + def add_pyarrow_table(self, table: pa.Table) -> None: + for field in table.schema: + self.add_field(field, table.column(field.name)) + + def add_field(self, field: pa.Field, column: pa.ChunkedArray) -> None: + existing_type = self.schema.get(field.name) + if existing_type is not None and existing_type != StringDatabaseField.__name__: + return + + hogql_type: type[DatabaseField] = DatabaseField + + if pa.types.is_time(field.type): + hogql_type = DateTimeDatabaseField + elif pa.types.is_timestamp(field.type): + hogql_type = DateTimeDatabaseField + elif pa.types.is_date(field.type): + hogql_type = DateDatabaseField + elif pa.types.is_decimal(field.type): + hogql_type = FloatDatabaseField + elif pa.types.is_floating(field.type): + hogql_type = FloatDatabaseField + elif pa.types.is_boolean(field.type): + hogql_type = BooleanDatabaseField + elif pa.types.is_integer(field.type): + hogql_type = IntegerDatabaseField + elif pa.types.is_binary(field.type): + raise Exception("Type 'binary' is not a supported column type") + elif pa.types.is_string(field.type): + hogql_type = StringDatabaseField + + # Checking for JSON string columns with the first non-null value in the column + for value in column: + value_str = value.as_py() + if value_str is not None: + assert isinstance(value_str, str) + if value_str.startswith("{") or value_str.startswith("["): + hogql_type = StringJSONDatabaseField + break + + self.schema[field.name] = hogql_type.__name__ + + def to_hogql_types(self) -> dict[str, str]: + return self.schema diff --git a/posthog/temporal/data_imports/pipelines/pipeline/pipeline.py b/posthog/temporal/data_imports/pipelines/pipeline/pipeline.py new file mode 100644 index 0000000000000..a69d60501601b --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/pipeline/pipeline.py @@ -0,0 +1,139 @@ +import time +from typing import Any +import pyarrow as pa +from dlt.sources import DltSource, DltResource +import deltalake as deltalake +from posthog.temporal.common.logger import FilteringBoundLogger +from posthog.temporal.data_imports.pipelines.pipeline.utils import ( + _update_incremental_state, + _get_primary_keys, + _evolve_pyarrow_schema, + _append_debug_column_to_pyarrows_table, + _update_job_row_count, +) +from posthog.temporal.data_imports.pipelines.pipeline.delta_table_helper import DeltaTableHelper +from posthog.temporal.data_imports.pipelines.pipeline.hogql_schema import HogQLSchema +from posthog.temporal.data_imports.pipelines.pipeline_sync import validate_schema_and_update_table_sync +from posthog.temporal.data_imports.util import prepare_s3_files_for_querying +from posthog.warehouse.models import DataWarehouseTable, ExternalDataJob, ExternalDataSchema + + +class PipelineNonDLT: + _resource: DltResource + _resource_name: str + _job: ExternalDataJob + _schema: ExternalDataSchema + _logger: FilteringBoundLogger + _is_incremental: bool + _delta_table_helper: DeltaTableHelper + _internal_schema = HogQLSchema() + _load_id: int + + def __init__(self, source: DltSource, logger: FilteringBoundLogger, job_id: str, is_incremental: bool) -> None: + resources = list(source.resources.items()) + assert len(resources) == 1 + resource_name, resource = 
resources[0] + + self._resource = resource + self._resource_name = resource_name + self._job = ExternalDataJob.objects.prefetch_related("schema").get(id=job_id) + self._is_incremental = is_incremental + self._logger = logger + self._load_id = time.time_ns() + + schema: ExternalDataSchema | None = self._job.schema + assert schema is not None + self._schema = schema + + self._delta_table_helper = DeltaTableHelper(resource_name, self._job) + self._internal_schema = HogQLSchema() + + def run(self): + buffer: list[Any] = [] + chunk_size = 5000 + row_count = 0 + chunk_index = 0 + + for item in self._resource: + py_table = None + + if isinstance(item, list): + if len(buffer) > 0: + buffer.extend(item) + if len(buffer) >= chunk_size: + py_table = pa.Table.from_pylist(buffer) + buffer = [] + else: + if len(item) >= chunk_size: + py_table = pa.Table.from_pylist(item) + else: + buffer.extend(item) + continue + elif isinstance(item, dict): + buffer.append(item) + if len(buffer) < chunk_size: + continue + + py_table = pa.Table.from_pylist(buffer) + buffer = [] + elif isinstance(item, pa.Table): + py_table = item + else: + raise Exception(f"Unhandled item type: {item.__class__.__name__}") + + assert py_table is not None + + self._process_pa_table(pa_table=py_table, index=chunk_index) + + row_count += py_table.num_rows + chunk_index += 1 + + if len(buffer) > 0: + py_table = pa.Table.from_pylist(buffer) + self._process_pa_table(pa_table=py_table, index=chunk_index) + row_count += py_table.num_rows + + self._post_run_operations(row_count=row_count) + + def _process_pa_table(self, pa_table: pa.Table, index: int): + delta_table = self._delta_table_helper.get_delta_table() + + pa_table = _append_debug_column_to_pyarrows_table(pa_table, self._load_id) + pa_table = _evolve_pyarrow_schema(pa_table, delta_table.schema() if delta_table is not None else None) + + table_primary_keys = _get_primary_keys(self._resource) + delta_table = self._delta_table_helper.write_to_deltalake( + pa_table, self._is_incremental, index, table_primary_keys + ) + + self._internal_schema.add_pyarrow_table(pa_table) + + _update_incremental_state(self._schema, pa_table, self._logger) + _update_job_row_count(self._job.id, pa_table.num_rows, self._logger) + + def _post_run_operations(self, row_count: int): + delta_table = self._delta_table_helper.get_delta_table() + + assert delta_table is not None + + self._logger.info("Compacting delta table") + delta_table.optimize.compact() + delta_table.vacuum(retention_hours=24, enforce_retention_duration=False, dry_run=False) + + file_uris = delta_table.file_uris() + self._logger.info(f"Preparing S3 files - total parquet files: {len(file_uris)}") + prepare_s3_files_for_querying( + self._job.folder_path(), self._resource_name, file_uris, ExternalDataJob.PipelineVersion.V2 + ) + + self._logger.debug("Validating schema and updating table") + + validate_schema_and_update_table_sync( + run_id=str(self._job.id), + team_id=self._job.team_id, + schema_id=self._schema.id, + table_schema={}, + table_schema_dict=self._internal_schema.to_hogql_types(), + row_count=row_count, + table_format=DataWarehouseTable.TableFormat.DeltaS3Wrapper, + ) diff --git a/posthog/temporal/data_imports/pipelines/pipeline/utils.py b/posthog/temporal/data_imports/pipelines/pipeline/utils.py new file mode 100644 index 0000000000000..fadb6ec02a868 --- /dev/null +++ b/posthog/temporal/data_imports/pipelines/pipeline/utils.py @@ -0,0 +1,105 @@ +import json +from collections.abc import Sequence +from typing import Any +import pyarrow as pa 
+from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_schema +from dlt.sources import DltResource +import deltalake as deltalake +from django.db.models import F +from posthog.temporal.common.logger import FilteringBoundLogger +from posthog.warehouse.models import ExternalDataJob, ExternalDataSchema + + +def _get_primary_keys(resource: DltResource) -> list[Any] | None: + primary_keys = resource._hints.get("primary_key") + + if primary_keys is None: + return None + + if isinstance(primary_keys, list): + return primary_keys + + if isinstance(primary_keys, Sequence): + return list(primary_keys) + + raise Exception(f"primary_keys of type {primary_keys.__class__.__name__} are not supported") + + +def _evolve_pyarrow_schema(table: pa.Table, delta_schema: deltalake.Schema | None) -> pa.Table: + py_table_field_names = table.schema.names + + # Change pa.structs to JSON string + for column_name in table.column_names: + column = table.column(column_name) + if pa.types.is_struct(column.type) or pa.types.is_list(column.type): + json_column = pa.array([json.dumps(row.as_py()) if row.as_py() is not None else None for row in column]) + table = table.set_column(table.schema.get_field_index(column_name), column_name, json_column) + + if delta_schema: + for field in delta_schema.to_pyarrow(): + if field.name not in py_table_field_names: + if field.nullable: + new_column_data = pa.array([None] * table.num_rows, type=field.type) + else: + new_column_data = pa.array( + [_get_default_value_from_pyarrow_type(field.type)] * table.num_rows, type=field.type + ) + table = table.append_column(field, new_column_data) + + # Change types based on what deltalake tables support + return table.cast(ensure_delta_compatible_arrow_schema(table.schema)) + + +def _append_debug_column_to_pyarrows_table(table: pa.Table, load_id: int) -> pa.Table: + debug_info = f'{{"load_id": {load_id}}}' + + column = pa.array([debug_info] * table.num_rows, type=pa.string()) + return table.append_column("_ph_debug", column) + + +def _get_default_value_from_pyarrow_type(pyarrow_type: pa.DataType): + """ + Returns a default value for the given PyArrow type. + """ + if pa.types.is_integer(pyarrow_type): + return 0 + elif pa.types.is_floating(pyarrow_type): + return 0.0 + elif pa.types.is_string(pyarrow_type): + return "" + elif pa.types.is_boolean(pyarrow_type): + return False + elif pa.types.is_binary(pyarrow_type): + return b"" + elif pa.types.is_timestamp(pyarrow_type): + return pa.scalar(0, type=pyarrow_type).as_py() + elif pa.types.is_date(pyarrow_type): + return pa.scalar(0, type=pyarrow_type).as_py() + elif pa.types.is_time(pyarrow_type): + return pa.scalar(0, type=pyarrow_type).as_py() + else: + raise ValueError(f"No default value defined for type: {pyarrow_type}") + + +def _update_incremental_state(schema: ExternalDataSchema | None, table: pa.Table, logger: FilteringBoundLogger) -> None: + if schema is None or schema.sync_type != ExternalDataSchema.SyncType.INCREMENTAL: + return + + incremental_field_name: str | None = schema.sync_type_config.get("incremental_field") + if incremental_field_name is None: + return + + column = table[incremental_field_name] + numpy_arr = column.combine_chunks().to_pandas().to_numpy() + + # TODO(@Gilbert09): support different operations here (e.g. 
min) + last_value = numpy_arr.max() + + logger.debug(f"Updating incremental_field_last_value_v2 with {last_value}") + + schema.update_incremental_field_last_value(last_value) + + +def _update_job_row_count(job_id: str, count: int, logger: FilteringBoundLogger) -> None: + logger.debug(f"Updating rows_synced with +{count}") + ExternalDataJob.objects.filter(id=job_id).update(rows_synced=F("rows_synced") + count) diff --git a/posthog/temporal/data_imports/pipelines/pipeline_sync.py b/posthog/temporal/data_imports/pipelines/pipeline_sync.py index 8d2cbd6cac2ee..3fca1a7a49c82 100644 --- a/posthog/temporal/data_imports/pipelines/pipeline_sync.py +++ b/posthog/temporal/data_imports/pipelines/pipeline_sync.py @@ -455,6 +455,7 @@ def validate_schema_and_update_table_sync( table_schema: TSchemaTables, row_count: int, table_format: DataWarehouseTable.TableFormat, + table_schema_dict: Optional[dict[str, str]] = None, ) -> None: """ @@ -479,6 +480,18 @@ def validate_schema_and_update_table_sync( "pipeline", Prefetch("schema", queryset=ExternalDataSchema.objects.prefetch_related("source")) ).get(pk=run_id) + using_v2_pipeline = job.pipeline_version == ExternalDataJob.PipelineVersion.V2 + pipeline_version = ( + ExternalDataJob.PipelineVersion.V1 + if job.pipeline_version is None + else ExternalDataJob.PipelineVersion(job.pipeline_version) + ) + + # Temp so we dont create a bunch of orphaned Table objects + if using_v2_pipeline: + logger.debug("Using V2 pipeline - dont create table object or get columns") + return + credential = get_or_create_datawarehouse_credential( team_id=team_id, access_key=settings.AIRBYTE_BUCKET_KEY, @@ -528,41 +541,63 @@ def validate_schema_and_update_table_sync( assert isinstance(table_created, DataWarehouseTable) and table_created is not None # Temp fix #2 for Delta tables without table_format - try: - table_created.get_columns() - except Exception as e: - if table_format == DataWarehouseTable.TableFormat.DeltaS3Wrapper: - logger.exception("get_columns exception with DeltaS3Wrapper format - trying Delta format", exc_info=e) - - table_created.format = DataWarehouseTable.TableFormat.Delta + if not using_v2_pipeline: + try: table_created.get_columns() - table_created.save() + except Exception as e: + if table_format == DataWarehouseTable.TableFormat.DeltaS3Wrapper: + logger.exception( + "get_columns exception with DeltaS3Wrapper format - trying Delta format", exc_info=e + ) - logger.info("Delta format worked - updating table to use Delta") - else: - raise - - for schema in table_schema.values(): - if schema.get("resource") == _schema_name: - schema_columns = schema.get("columns") or {} - raw_db_columns: dict[str, dict[str, str]] = table_created.get_columns() - db_columns = {key: column.get("clickhouse", "") for key, column in raw_db_columns.items()} - - columns = {} - for column_name, db_column_type in db_columns.items(): - dlt_column = schema_columns.get(column_name) - if dlt_column is not None: - dlt_data_type = dlt_column.get("data_type") - hogql_type = dlt_to_hogql_type(dlt_data_type) - else: - hogql_type = dlt_to_hogql_type(None) + table_created.format = DataWarehouseTable.TableFormat.Delta + table_created.get_columns() + table_created.save() + + logger.info("Delta format worked - updating table to use Delta") + else: + raise + + # If using new non-DLT pipeline + if using_v2_pipeline and table_schema_dict is not None: + raw_db_columns: dict[str, dict[str, str]] = table_created.get_columns(pipeline_version=pipeline_version) + db_columns = {key: column.get("clickhouse", "") for 
key, column in raw_db_columns.items()} + + columns = {} + for column_name, db_column_type in db_columns.items(): + hogql_type = table_schema_dict.get(column_name) + + if hogql_type is None: + raise Exception(f"HogQL type not found for column: {column_name}") + + columns[column_name] = { + "clickhouse": db_column_type, + "hogql": hogql_type, + } + table_created.columns = columns + else: + # If using DLT pipeline + for schema in table_schema.values(): + if schema.get("resource") == _schema_name: + schema_columns = schema.get("columns") or {} + raw_db_columns: dict[str, dict[str, str]] = table_created.get_columns() + db_columns = {key: column.get("clickhouse", "") for key, column in raw_db_columns.items()} + + columns = {} + for column_name, db_column_type in db_columns.items(): + dlt_column = schema_columns.get(column_name) + if dlt_column is not None: + dlt_data_type = dlt_column.get("data_type") + hogql_type = dlt_to_hogql_type(dlt_data_type) + else: + hogql_type = dlt_to_hogql_type(None) - columns[column_name] = { - "clickhouse": db_column_type, - "hogql": hogql_type, - } - table_created.columns = columns - break + columns[column_name] = { + "clickhouse": db_column_type, + "hogql": hogql_type, + } + table_created.columns = columns + break table_created.save() @@ -573,7 +608,7 @@ def validate_schema_and_update_table_sync( .get(id=_schema_id, team_id=team_id) ) - if schema_model: + if not using_v2_pipeline and schema_model: schema_model.table = table_created schema_model.save() diff --git a/posthog/temporal/data_imports/pipelines/rest_source/__init__.py b/posthog/temporal/data_imports/pipelines/rest_source/__init__.py index 4fd019ce76753..9a8599882c652 100644 --- a/posthog/temporal/data_imports/pipelines/rest_source/__init__.py +++ b/posthog/temporal/data_imports/pipelines/rest_source/__init__.py @@ -46,6 +46,7 @@ def rest_api_source( config: RESTAPIConfig, team_id: int, job_id: str, + db_incremental_field_last_value: Optional[Any] = None, name: Optional[str] = None, section: Optional[str] = None, max_table_nesting: Optional[int] = None, @@ -108,10 +109,12 @@ def rest_api_source( spec, ) - return decorated(config, team_id, job_id) + return decorated(config, team_id, job_id, db_incremental_field_last_value) -def rest_api_resources(config: RESTAPIConfig, team_id: int, job_id: str) -> list[DltResource]: +def rest_api_resources( + config: RESTAPIConfig, team_id: int, job_id: str, db_incremental_field_last_value: Optional[Any] +) -> list[DltResource]: """Creates a list of resources from a REST API configuration. 
Args: @@ -193,6 +196,7 @@ def rest_api_resources(config: RESTAPIConfig, team_id: int, job_id: str) -> list resolved_param_map, team_id=team_id, job_id=job_id, + db_incremental_field_last_value=db_incremental_field_last_value, ) return list(resources.values()) @@ -205,6 +209,7 @@ def create_resources( resolved_param_map: dict[str, Optional[ResolvedParam]], team_id: int, job_id: str, + db_incremental_field_last_value: Optional[Any] = None, ) -> dict[str, DltResource]: resources = {} @@ -264,6 +269,7 @@ async def paginate_resource( incremental_object, incremental_param, incremental_cursor_transform, + db_incremental_field_last_value, ) yield client.paginate( @@ -317,6 +323,7 @@ async def paginate_dependent_resource( incremental_object, incremental_param, incremental_cursor_transform, + db_incremental_field_last_value, ) for item in items: @@ -358,6 +365,7 @@ def _set_incremental_params( incremental_object: Incremental[Any], incremental_param: Optional[IncrementalParam], transform: Optional[Callable[..., Any]], + db_incremental_field_last_value: Optional[Any] = None, ) -> dict[str, Any]: def identity_func(x: Any) -> Any: return x @@ -368,7 +376,13 @@ def identity_func(x: Any) -> Any: if incremental_param is None: return params - params[incremental_param.start] = transform(incremental_object.last_value) + last_value = ( + db_incremental_field_last_value + if db_incremental_field_last_value is not None + else incremental_object.last_value + ) + + params[incremental_param.start] = transform(last_value) if incremental_param.end: params[incremental_param.end] = transform(incremental_object.end_value) return params diff --git a/posthog/temporal/data_imports/pipelines/salesforce/__init__.py b/posthog/temporal/data_imports/pipelines/salesforce/__init__.py index ec2dc7647b606..129c8d1550be4 100644 --- a/posthog/temporal/data_imports/pipelines/salesforce/__init__.py +++ b/posthog/temporal/data_imports/pipelines/salesforce/__init__.py @@ -6,7 +6,6 @@ from posthog.temporal.data_imports.pipelines.rest_source import RESTAPIConfig, rest_api_resources from posthog.temporal.data_imports.pipelines.rest_source.typing import EndpointResource from posthog.temporal.data_imports.pipelines.salesforce.auth import SalseforceAuth -import pendulum import re @@ -352,6 +351,7 @@ def salesforce_source( endpoint: str, team_id: int, job_id: str, + db_incremental_field_last_value: Optional[Any], is_incremental: bool = False, ): config: RESTAPIConfig = { @@ -366,4 +366,4 @@ def salesforce_source( "resources": [get_resource(endpoint, is_incremental)], } - yield from rest_api_resources(config, team_id, job_id) + yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value) diff --git a/posthog/temporal/data_imports/pipelines/sql_database/__init__.py b/posthog/temporal/data_imports/pipelines/sql_database/__init__.py index 04aa7c9678c0b..ae81f9fa61fe6 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database/__init__.py +++ b/posthog/temporal/data_imports/pipelines/sql_database/__init__.py @@ -51,6 +51,7 @@ def sql_source_for_type( sslmode: str, schema: str, table_names: list[str], + db_incremental_field_last_value: Optional[Any], using_ssl: Optional[bool] = True, team_id: Optional[int] = None, incremental_field: Optional[str] = None, @@ -99,12 +100,13 @@ def sql_source_for_type( raise Exception("Unsupported source_type") db_source = sql_database( - credentials, + credentials=credentials, schema=schema, table_names=table_names, incremental=incremental, team_id=team_id, connect_args=connect_args, + 
db_incremental_field_last_value=db_incremental_field_last_value, ) return db_source @@ -121,6 +123,7 @@ def snowflake_source( warehouse: str, schema: str, table_names: list[str], + db_incremental_field_last_value: Optional[Any], role: Optional[str] = None, incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, @@ -172,7 +175,13 @@ def snowflake_source( }, ) - db_source = sql_database(credentials, schema=schema, table_names=table_names, incremental=incremental) + db_source = sql_database( + credentials=credentials, + schema=schema, + table_names=table_names, + incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, + ) return db_source @@ -186,6 +195,7 @@ def bigquery_source( token_uri: str, table_name: str, bq_destination_table_id: str, + db_incremental_field_last_value: Optional[Any], incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, ) -> DltSource: @@ -210,7 +220,13 @@ def bigquery_source( credentials_info=credentials_info, ) - return sql_database(engine, schema=None, table_names=[table_name], incremental=incremental) + return sql_database( + credentials=engine, + schema=None, + table_names=[table_name], + incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, + ) # Temp while DLT doesn't support `interval` columns @@ -231,6 +247,7 @@ def internal_remove(doc: dict) -> dict: @dlt.source(max_table_nesting=0) def sql_database( + db_incremental_field_last_value: Optional[Any], credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value, schema: Optional[str] = dlt.config.value, metadata: Optional[MetaData] = None, @@ -290,6 +307,7 @@ def sql_database( table=table, incremental=incremental, connect_args=connect_args, + db_incremental_field_last_value=db_incremental_field_last_value, ) ) diff --git a/posthog/temporal/data_imports/pipelines/sql_database/helpers.py b/posthog/temporal/data_imports/pipelines/sql_database/helpers.py index 50577b6b04d17..0400a60b32fd5 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database/helpers.py +++ b/posthog/temporal/data_imports/pipelines/sql_database/helpers.py @@ -27,6 +27,7 @@ def __init__( chunk_size: int = 1000, incremental: Optional[dlt.sources.incremental[Any]] = None, connect_args: Optional[list[str]] = None, + db_incremental_field_last_value: Optional[Any] = None, ) -> None: self.engine = engine self.table = table @@ -43,7 +44,11 @@ def __init__( raise KeyError( f"Cursor column '{incremental.cursor_path}' does not exist in table '{table.name}'" ) from e - self.last_value = incremental.last_value + self.last_value = ( + db_incremental_field_last_value + if db_incremental_field_last_value is not None + else incremental.last_value + ) else: self.cursor_column = None self.last_value = None @@ -90,6 +95,7 @@ def table_rows( chunk_size: int = DEFAULT_CHUNK_SIZE, incremental: Optional[dlt.sources.incremental[Any]] = None, connect_args: Optional[list[str]] = None, + db_incremental_field_last_value: Optional[Any] = None, ) -> Iterator[TDataItem]: """ A DLT source which loads data from an SQL database using SQLAlchemy. 
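Note: the hunks above, and the matching sql_database_v2 hunks below, all thread a new db_incremental_field_last_value argument from the source factories (stripe_source, chargebee_source, sql_source_for_type, and so on) down into TableLoader, which prefers that value over DLT's own incremental state when deciding where an incremental sync should resume. A minimal sketch of that precedence rule, using a stand-in stub for dlt.sources.incremental (the stub class and helper name here are illustrative only, not part of the patch):

from typing import Any, Optional


class IncrementalStub:
    """Stand-in for dlt.sources.incremental; only last_value matters for this sketch."""

    def __init__(self, last_value: Any = None) -> None:
        self.last_value = last_value


def resolve_last_value(
    db_incremental_field_last_value: Optional[Any],
    incremental: Optional[IncrementalStub],
) -> Optional[Any]:
    # Mirrors the TableLoader change: a value supplied by the caller
    # (presumably read back from the ExternalDataSchema row) wins; otherwise
    # fall back to whatever DLT tracked in its own pipeline state.
    if incremental is None:
        return None
    if db_incremental_field_last_value is not None:
        return db_incremental_field_last_value
    return incremental.last_value


assert resolve_last_value(1700000000, IncrementalStub(last_value=None)) == 1700000000
assert resolve_last_value(None, IncrementalStub(last_value=42)) == 42

Presumably this is what allows the new non-DLT V2 pipeline, which writes the cursor back to the schema via _update_incremental_state, to resume incremental syncs without depending on DLT's pipeline state.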
@@ -106,7 +112,14 @@ def table_rows( """ yield dlt.mark.materialize_table_schema() # type: ignore - loader = TableLoader(engine, table, incremental=incremental, chunk_size=chunk_size, connect_args=connect_args) + loader = TableLoader( + engine, + table, + incremental=incremental, + chunk_size=chunk_size, + connect_args=connect_args, + db_incremental_field_last_value=db_incremental_field_last_value, + ) yield from loader.load_rows() engine.dispose() diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py index bcab4c3e19282..a3fc1c6b2838b 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/__init__.py @@ -67,6 +67,7 @@ def sql_source_for_type( sslmode: str, schema: str, table_names: list[str], + db_incremental_field_last_value: Optional[Any], using_ssl: Optional[bool] = True, team_id: Optional[int] = None, incremental_field: Optional[str] = None, @@ -115,10 +116,11 @@ def sql_source_for_type( raise Exception("Unsupported source_type") db_source = sql_database( - credentials, + credentials=credentials, schema=schema, table_names=table_names, incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, team_id=team_id, connect_args=connect_args, ) @@ -137,6 +139,7 @@ def snowflake_source( warehouse: str, schema: str, table_names: list[str], + db_incremental_field_last_value: Optional[Any], role: Optional[str] = None, incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, @@ -188,7 +191,13 @@ def snowflake_source( }, ) - db_source = sql_database(credentials, schema=schema, table_names=table_names, incremental=incremental) + db_source = sql_database( + credentials=credentials, + schema=schema, + table_names=table_names, + incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, + ) return db_source @@ -202,6 +211,7 @@ def bigquery_source( token_uri: str, table_name: str, bq_destination_table_id: str, + db_incremental_field_last_value: Optional[Any], incremental_field: Optional[str] = None, incremental_field_type: Optional[IncrementalFieldType] = None, ) -> DltSource: @@ -226,11 +236,18 @@ def bigquery_source( credentials_info=credentials_info, ) - return sql_database(engine, schema=None, table_names=[table_name], incremental=incremental) + return sql_database( + credentials=engine, + schema=None, + table_names=[table_name], + incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, + ) @dlt.source(max_table_nesting=0) def sql_database( + db_incremental_field_last_value: Optional[Any], credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value, schema: Optional[str] = dlt.config.value, metadata: Optional[MetaData] = None, @@ -317,6 +334,7 @@ def sql_database( backend_kwargs=backend_kwargs, type_adapter_callback=type_adapter_callback, incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, team_id=team_id, connect_args=connect_args, ) @@ -341,6 +359,7 @@ def internal_remove(table: pa.Table) -> pa.Table: @dlt.resource(name=lambda args: args["table"], standalone=True, spec=SqlTableResourceConfiguration) def sql_table( + db_incremental_field_last_value: Optional[Any], credentials: Union[ConnectionStringCredentials, Engine, str] = dlt.secrets.value, table: str = dlt.config.value, schema: Optional[str] = 
dlt.config.value, @@ -438,6 +457,7 @@ def query_adapter_callback(query: SelectAny, table: Table): chunk_size=chunk_size, backend=backend, incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, reflection_level=reflection_level, defer_table_reflect=defer_table_reflect, table_adapter_callback=table_adapter_callback, diff --git a/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py b/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py index 46f59929beb47..acd64c97aae99 100644 --- a/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py +++ b/posthog/temporal/data_imports/pipelines/sql_database_v2/helpers.py @@ -46,6 +46,7 @@ def __init__( columns: TTableSchemaColumns, chunk_size: int = 1000, incremental: Optional[dlt.sources.incremental[Any]] = None, + db_incremental_field_last_value: Optional[Any] = None, query_adapter_callback: Optional[TQueryAdapter] = None, connect_args: Optional[list[str]] = None, ) -> None: @@ -64,7 +65,11 @@ def __init__( raise KeyError( f"Cursor column '{incremental.cursor_path}' does not exist in table '{table.name}'" ) from e - self.last_value = incremental.last_value + self.last_value = ( + db_incremental_field_last_value + if db_incremental_field_last_value is not None + else incremental.last_value + ) self.end_value = incremental.end_value self.row_order: TSortOrder = self.incremental.row_order else: @@ -183,6 +188,7 @@ def table_rows( chunk_size: int, backend: TableBackend, incremental: Optional[dlt.sources.incremental[Any]] = None, + db_incremental_field_last_value: Optional[Any] = None, defer_table_reflect: bool = False, table_adapter_callback: Optional[Callable[[Table], None]] = None, reflection_level: ReflectionLevel = "minimal", @@ -226,6 +232,7 @@ def table_rows( table, columns, incremental=incremental, + db_incremental_field_last_value=db_incremental_field_last_value, chunk_size=chunk_size, query_adapter_callback=query_adapter_callback, connect_args=connect_args, diff --git a/posthog/temporal/data_imports/pipelines/stripe/__init__.py b/posthog/temporal/data_imports/pipelines/stripe/__init__.py index 5b386aa10adba..da9af92c191dc 100644 --- a/posthog/temporal/data_imports/pipelines/stripe/__init__.py +++ b/posthog/temporal/data_imports/pipelines/stripe/__init__.py @@ -325,7 +325,13 @@ def update_request(self, request: Request) -> None: @dlt.source(max_table_nesting=0) def stripe_source( - api_key: str, account_id: Optional[str], endpoint: str, team_id: int, job_id: str, is_incremental: bool = False + api_key: str, + account_id: Optional[str], + endpoint: str, + team_id: int, + job_id: str, + db_incremental_field_last_value: Optional[Any], + is_incremental: bool = False, ): config: RESTAPIConfig = { "client": { @@ -355,7 +361,7 @@ def stripe_source( "resources": [get_resource(endpoint, is_incremental)], } - yield from rest_api_resources(config, team_id, job_id) + yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value) def validate_credentials(api_key: str) -> bool: diff --git a/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py b/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py index fcd84903b7249..5b765e35cea14 100644 --- a/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py +++ b/posthog/temporal/data_imports/pipelines/test/test_pipeline_sync.py @@ -66,6 +66,7 @@ def _create_pipeline(self, schema_name: str, incremental: bool): status=ExternalDataJob.Status.RUNNING, rows_synced=0, 
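In the v2 helpers the resolved last value sits alongside `end_value` and `row_order` from the incremental config, and together they shape the paged SELECT. The real query construction lives in PostHog's `TableLoader`; the sketch below is only an approximation of the idea in plain SQLAlchemy, with `make_query` and the example table being illustrative names:

```python
from typing import Any, Optional

import sqlalchemy as sa


def make_query(
    table: sa.Table,
    cursor_column: Optional[sa.Column],
    last_value: Optional[Any],
    end_value: Optional[Any] = None,
    row_order: Optional[str] = None,  # "asc" | "desc"
):
    """Incremental SELECT: lower-bound on the cursor, optional upper bound and ordering."""
    query = sa.select(table)
    if cursor_column is None:
        return query
    if last_value is not None:
        query = query.where(cursor_column >= last_value)
    if end_value is not None:
        query = query.where(cursor_column < end_value)
    if row_order == "asc":
        query = query.order_by(cursor_column.asc())
    elif row_order == "desc":
        query = query.order_by(cursor_column.desc())
    return query


metadata = sa.MetaData()
events = sa.Table("events", metadata, sa.Column("id", sa.Integer), sa.Column("value", sa.String))
print(make_query(events, events.c.id, last_value=500, row_order="asc"))
```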
workflow_id=str(uuid.uuid4()), + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) pipeline = DataImportPipelineSync( @@ -84,6 +85,7 @@ def _create_pipeline(self, schema_name: str, incremental: bool): is_incremental=incremental, team_id=self.team.pk, job_id=str(job.pk), + db_incremental_field_last_value=0, ), logger=structlog.get_logger(), incremental=incremental, diff --git a/posthog/temporal/data_imports/pipelines/vitally/__init__.py b/posthog/temporal/data_imports/pipelines/vitally/__init__.py index b01f783ea30d9..3f070c48653f2 100644 --- a/posthog/temporal/data_imports/pipelines/vitally/__init__.py +++ b/posthog/temporal/data_imports/pipelines/vitally/__init__.py @@ -324,6 +324,7 @@ def vitally_source( endpoint: str, team_id: int, job_id: str, + db_incremental_field_last_value: Optional[Any], is_incremental: bool = False, ): config: RESTAPIConfig = { @@ -348,7 +349,7 @@ def vitally_source( "resources": [get_resource(endpoint, is_incremental)], } - yield from rest_api_resources(config, team_id, job_id) + yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value) def validate_credentials(secret_token: str, region: str, subdomain: Optional[str]) -> bool: diff --git a/posthog/temporal/data_imports/pipelines/zendesk/__init__.py b/posthog/temporal/data_imports/pipelines/zendesk/__init__.py index 36d842e4d3889..55b6be994f006 100644 --- a/posthog/temporal/data_imports/pipelines/zendesk/__init__.py +++ b/posthog/temporal/data_imports/pipelines/zendesk/__init__.py @@ -289,6 +289,7 @@ def zendesk_source( endpoint: str, team_id: int, job_id: str, + db_incremental_field_last_value: Optional[Any], is_incremental: bool = False, ): config: RESTAPIConfig = { @@ -312,7 +313,7 @@ def zendesk_source( "resources": [get_resource(endpoint, is_incremental)], } - yield from rest_api_resources(config, team_id, job_id) + yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value) def validate_credentials(subdomain: str, api_key: str, email_address: str) -> bool: diff --git a/posthog/temporal/data_imports/util.py b/posthog/temporal/data_imports/util.py index cc8a4892b0aaa..4a133ef336b42 100644 --- a/posthog/temporal/data_imports/util.py +++ b/posthog/temporal/data_imports/util.py @@ -1,18 +1,33 @@ +from typing import Optional from posthog.settings.utils import get_from_env from posthog.utils import str_to_bool +from posthog.warehouse.models import ExternalDataJob from posthog.warehouse.s3 import get_s3_client from django.conf import settings from dlt.common.normalizers.naming.snake_case import NamingConvention -def prepare_s3_files_for_querying(folder_path: str, table_name: str, file_uris: list[str]): +def prepare_s3_files_for_querying( + folder_path: str, + table_name: str, + file_uris: list[str], + pipeline_version: Optional[ExternalDataJob.PipelineVersion] = None, +): s3 = get_s3_client() normalized_table_name = NamingConvention().normalize_identifier(table_name) s3_folder_for_job = f"{settings.BUCKET_URL}/{folder_path}" - s3_folder_for_schema = f"{s3_folder_for_job}/{normalized_table_name}" - s3_folder_for_querying = f"{s3_folder_for_job}/{normalized_table_name}__query" + + if pipeline_version == ExternalDataJob.PipelineVersion.V2: + s3_folder_for_schema = f"{s3_folder_for_job}/{normalized_table_name}__v2" + else: + s3_folder_for_schema = f"{s3_folder_for_job}/{normalized_table_name}" + + if pipeline_version == ExternalDataJob.PipelineVersion.V2: + s3_folder_for_querying = f"{s3_folder_for_job}/{normalized_table_name}__query_v2" + else: + 
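The REST-based sources (Stripe, Vitally, Zendesk) gain the same parameter so that every source factory exposes a uniform signature; they simply forward it to `rest_api_resources`, which can then seed the starting cursor for incremental endpoints from the database value. A schematic, hypothetical version of that forwarding chain (the real `rest_api_resources` does considerably more):

```python
from typing import Any, Optional


def rest_api_resources(config: dict, team_id: int, job_id: str, db_incremental_field_last_value: Optional[Any]):
    """Hypothetical stand-in: yield one resource per endpoint, seeding its cursor from the DB value."""
    for endpoint in config["resources"]:
        start = db_incremental_field_last_value if db_incremental_field_last_value is not None else endpoint.get("initial_value")
        yield {"endpoint": endpoint["name"], "start_cursor": start}


def stripe_like_source(api_key: str, endpoint: str, team_id: int, job_id: str,
                       db_incremental_field_last_value: Optional[Any], is_incremental: bool = False):
    config = {"resources": [{"name": endpoint, "initial_value": "1970-01-01"}]}
    yield from rest_api_resources(config, team_id, job_id, db_incremental_field_last_value)


print(list(stripe_like_source("sk_test", "BalanceTransaction", 1, "job-1", "2024-01-01")))
```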
s3_folder_for_querying = f"{s3_folder_for_job}/{normalized_table_name}__query" if s3.exists(s3_folder_for_querying): s3.delete(s3_folder_for_querying, recursive=True) diff --git a/posthog/temporal/data_imports/workflow_activities/create_job_model.py b/posthog/temporal/data_imports/workflow_activities/create_job_model.py index b62f0c9cc2063..b404c610c1cad 100644 --- a/posthog/temporal/data_imports/workflow_activities/create_job_model.py +++ b/posthog/temporal/data_imports/workflow_activities/create_job_model.py @@ -1,11 +1,13 @@ import dataclasses import uuid +from django.conf import settings from django.db import close_old_connections from temporalio import activity # TODO: remove dependency +from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 from posthog.warehouse.models import ExternalDataJob, ExternalDataSource from posthog.warehouse.models.external_data_schema import ( ExternalDataSchema, @@ -20,6 +22,13 @@ class CreateExternalDataJobModelActivityInputs: source_id: uuid.UUID +def get_pipeline_version() -> str: + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: + return ExternalDataJob.PipelineVersion.V2 + + return ExternalDataJob.PipelineVersion.V1 + + @activity.defn def create_external_data_job_model_activity( inputs: CreateExternalDataJobModelActivityInputs, @@ -37,6 +46,7 @@ def create_external_data_job_model_activity( rows_synced=0, workflow_id=activity.info().workflow_id, workflow_run_id=activity.info().workflow_run_id, + pipeline_version=get_pipeline_version(), ) schema = ExternalDataSchema.objects.get(team_id=inputs.team_id, id=inputs.schema_id) diff --git a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py index 81a4a943d2c4c..135d6b8d5fb89 100644 --- a/posthog/temporal/data_imports/workflow_activities/import_data_sync.py +++ b/posthog/temporal/data_imports/workflow_activities/import_data_sync.py @@ -1,17 +1,21 @@ import dataclasses import uuid from datetime import datetime +from dateutil import parser from typing import Any +from django.conf import settings from django.db import close_old_connections from django.db.models import Prefetch, F from temporalio import activity +from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 from posthog.models.integration import Integration from posthog.temporal.common.heartbeat_sync import HeartbeaterSync from posthog.temporal.data_imports.pipelines.bigquery import delete_all_temp_destination_tables, delete_table +from posthog.temporal.data_imports.pipelines.pipeline.pipeline import PipelineNonDLT from posthog.temporal.data_imports.pipelines.pipeline_sync import DataImportPipelineSync, PipelineInputs from posthog.temporal.data_imports.util import is_posthog_team from posthog.warehouse.models import ( @@ -22,6 +26,7 @@ from structlog.typing import FilteringBoundLogger from posthog.warehouse.models.external_data_schema import ExternalDataSchema from posthog.warehouse.models.ssh_tunnel import SSHTunnel +from posthog.warehouse.types import IncrementalFieldType @dataclasses.dataclass @@ -32,6 +37,20 @@ class ImportDataActivityInputs: run_id: str +def process_incremental_last_value(value: Any | None, field_type: IncrementalFieldType | None) -> Any | None: + if value is None or field_type is None: + return None + + if field_type == IncrementalFieldType.Integer or field_type == IncrementalFieldType.Numeric: + return value + + if field_type == IncrementalFieldType.DateTime or field_type == IncrementalFieldType.Timestamp: + return 
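Two small pieces of plumbing decide which pipeline a job belongs to: the worker's task queue maps to a `PipelineVersion`, and the S3 layout gains `__v2` / `__query_v2` suffixes so the two pipelines never overwrite each other's files. A condensed sketch, with the queue names and settings values assumed rather than taken from the codebase:

```python
from typing import Optional

# Assumed constants/settings, mirroring the names used in the diff.
DATA_WAREHOUSE_TASK_QUEUE = "data-warehouse-task-queue"
DATA_WAREHOUSE_TASK_QUEUE_V2 = "data-warehouse-task-queue-v2"
TEMPORAL_TASK_QUEUE = DATA_WAREHOUSE_TASK_QUEUE_V2

V1 = "v1-dlt-sync"
V2 = "v2-non-dlt"


def get_pipeline_version() -> str:
    """Workers on the V2 task queue produce V2 jobs; everything else stays V1."""
    return V2 if TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2 else V1


def schema_folders(folder_path: str, table_name: str, pipeline_version: Optional[str]) -> tuple[str, str]:
    """Return (write folder, query folder), keeping V1 and V2 outputs side by side."""
    base = f"{folder_path}/{table_name}"
    if pipeline_version == V2:
        return f"{base}__v2", f"{base}__query_v2"
    return base, f"{base}__query"


print(get_pipeline_version())                        # "v2-non-dlt" with the settings above
print(schema_folders("team_1/job_1", "customers", V2))
```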
parser.parse(value) + + if field_type == IncrementalFieldType.Date: + return parser.parse(value).date() + + @activity.defn def import_data_activity_sync(inputs: ImportDataActivityInputs): logger = bind_temporal_worker_logger_sync(team_id=inputs.team_id) @@ -64,6 +83,24 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): endpoints = [schema.name] + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: + # Get the V2 last value, if it's not set yet (e.g. the first run), then fallback to the V1 value + processed_incremental_last_value = process_incremental_last_value( + schema.sync_type_config.get("incremental_field_last_value_v2"), + schema.sync_type_config.get("incremental_field_type"), + ) + + if processed_incremental_last_value is None: + processed_incremental_last_value = process_incremental_last_value( + schema.sync_type_config.get("incremental_field_last_value"), + schema.sync_type_config.get("incremental_field_type"), + ) + else: + processed_incremental_last_value = process_incremental_last_value( + schema.sync_type_config.get("incremental_field_last_value"), + schema.sync_type_config.get("incremental_field_type"), + ) + source = None if model.pipeline.source_type == ExternalDataSource.Type.STRIPE: from posthog.temporal.data_imports.pipelines.stripe import stripe_source @@ -80,6 +117,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): team_id=inputs.team_id, job_id=inputs.run_id, is_incremental=schema.is_incremental, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -178,6 +216,9 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): incremental_field_type=schema.sync_type_config.get("incremental_field_type") if schema.is_incremental else None, + db_incremental_field_last_value=processed_incremental_last_value + if schema.is_incremental + else None, team_id=inputs.team_id, using_ssl=using_ssl, ) @@ -205,6 +246,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): incremental_field_type=schema.sync_type_config.get("incremental_field_type") if schema.is_incremental else None, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, team_id=inputs.team_id, using_ssl=using_ssl, ) @@ -255,6 +297,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): incremental_field_type=schema.sync_type_config.get("incremental_field_type") if schema.is_incremental else None, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -299,6 +342,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): team_id=inputs.team_id, job_id=inputs.run_id, is_incremental=schema.is_incremental, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -321,6 +365,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): team_id=inputs.team_id, job_id=inputs.run_id, is_incremental=schema.is_incremental, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -342,6 +387,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): team_id=inputs.team_id, job_id=inputs.run_id, is_incremental=schema.is_incremental, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -398,6 +444,7 @@ def import_data_activity_sync(inputs: 
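`process_incremental_last_value` turns the JSON-serialised cursor stored on the schema back into a typed Python value, and the activity reads the `_v2` key first so the shadow pipeline keeps its own cursor while still bootstrapping from the V1 cursor on its first run. A sketch of that read path; the lower-case field-type strings are illustrative stand-ins for `IncrementalFieldType`:

```python
from typing import Any, Optional

from dateutil import parser


def process_incremental_last_value(value: Optional[Any], field_type: Optional[str]) -> Optional[Any]:
    """Deserialize a stored cursor value according to the schema's incremental field type."""
    if value is None or field_type is None:
        return None
    if field_type in ("integer", "numeric"):
        return value
    if field_type in ("datetime", "timestamp"):
        return parser.parse(value)
    if field_type == "date":
        return parser.parse(value).date()
    return None


def last_value_for_v2(sync_type_config: dict[str, Any]) -> Optional[Any]:
    """Prefer the V2 cursor; fall back to the V1 cursor (e.g. on the V2 pipeline's first run)."""
    field_type = sync_type_config.get("incremental_field_type")
    v2 = process_incremental_last_value(sync_type_config.get("incremental_field_last_value_v2"), field_type)
    if v2 is not None:
        return v2
    return process_incremental_last_value(sync_type_config.get("incremental_field_last_value"), field_type)


print(last_value_for_v2({"incremental_field_type": "date", "incremental_field_last_value": "2024-05-01"}))
```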
ImportDataActivityInputs): incremental_field_type=schema.sync_type_config.get("incremental_field_type") if schema.is_incremental else None, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) _run( @@ -433,6 +480,7 @@ def import_data_activity_sync(inputs: ImportDataActivityInputs): team_id=inputs.team_id, job_id=inputs.run_id, is_incremental=schema.is_incremental, + db_incremental_field_last_value=processed_incremental_last_value if schema.is_incremental else None, ) return _run( @@ -455,12 +503,18 @@ def _run( schema: ExternalDataSchema, reset_pipeline: bool, ): - table_row_counts = DataImportPipelineSync(job_inputs, source, logger, reset_pipeline, schema.is_incremental).run() - total_rows_synced = sum(table_row_counts.values()) + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: + PipelineNonDLT(source, logger, job_inputs.run_id, schema.is_incremental).run() + else: + table_row_counts = DataImportPipelineSync( + job_inputs, source, logger, reset_pipeline, schema.is_incremental + ).run() + total_rows_synced = sum(table_row_counts.values()) + + ExternalDataJob.objects.filter(id=inputs.run_id, team_id=inputs.team_id).update( + rows_synced=F("rows_synced") + total_rows_synced + ) - ExternalDataJob.objects.filter(id=inputs.run_id, team_id=inputs.team_id).update( - rows_synced=F("rows_synced") + total_rows_synced - ) source = ExternalDataSource.objects.get(id=inputs.source_id) source.job_inputs.pop("reset_pipeline", None) source.save() diff --git a/posthog/temporal/tests/batch_exports/test_import_data.py b/posthog/temporal/tests/batch_exports/test_import_data.py index c201be4470a14..abf9bb56b094e 100644 --- a/posthog/temporal/tests/batch_exports/test_import_data.py +++ b/posthog/temporal/tests/batch_exports/test_import_data.py @@ -48,6 +48,7 @@ def _setup(team: Team, job_inputs: dict[Any, Any]) -> ImportDataActivityInputs: status=ExternalDataJob.Status.RUNNING, rows_synced=0, workflow_id="some_workflow_id", + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) return ImportDataActivityInputs(team_id=team.pk, schema_id=schema.pk, source_id=source.pk, run_id=str(job.pk)) @@ -86,6 +87,7 @@ def test_postgres_source_without_ssh_tunnel(activity_environment, team, **kwargs table_names=["table_1"], incremental_field=None, incremental_field_type=None, + db_incremental_field_last_value=None, team_id=team.id, using_ssl=True, ) @@ -127,6 +129,7 @@ def test_postgres_source_with_ssh_tunnel_disabled(activity_environment, team, ** table_names=["table_1"], incremental_field=None, incremental_field_type=None, + db_incremental_field_last_value=None, team_id=team.id, using_ssl=True, ) @@ -186,6 +189,7 @@ def __exit__(self, exc_type, exc_value, exc_traceback): table_names=["table_1"], incremental_field=None, incremental_field_type=None, + db_incremental_field_last_value=None, team_id=team.id, using_ssl=True, ) diff --git a/posthog/temporal/tests/data_imports/test_end_to_end.py b/posthog/temporal/tests/data_imports/test_end_to_end.py index fce2047cd1c28..06c198ec5b2d5 100644 --- a/posthog/temporal/tests/data_imports/test_end_to_end.py +++ b/posthog/temporal/tests/data_imports/test_end_to_end.py @@ -19,7 +19,7 @@ from temporalio.testing import WorkflowEnvironment from temporalio.worker import UnsandboxedWorkflowRunner, Worker -from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE +from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE, DATA_WAREHOUSE_TASK_QUEUE_V2 from posthog.hogql.modifiers import create_default_modifiers_for_team from 
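`_run` is where the two pipelines finally diverge: on the V2 queue the job goes through the new `PipelineNonDLT`, while the V1 path keeps the existing DLT sync and its `rows_synced` bookkeeping. A reduced sketch with both pipeline classes stubbed out:

```python
from typing import Any


class PipelineNonDLT:
    """Stub: the real class runs the new non-dlt pipeline."""

    def __init__(self, source: Any, logger: Any, run_id: str, is_incremental: bool) -> None:
        self.run_id = run_id

    def run(self) -> None:
        print(f"V2 pipeline for run {self.run_id} (row counts tracked elsewhere)")


class DataImportPipelineSync:
    """Stub: the real class runs the existing dlt-based sync and reports row counts."""

    def __init__(self, job_inputs: Any, source: Any, logger: Any, reset_pipeline: bool, is_incremental: bool) -> None:
        pass

    def run(self) -> dict[str, int]:
        return {"customers": 42}


def run_job(on_v2_queue: bool, job_inputs: Any, source: Any, logger: Any, run_id: str,
            reset_pipeline: bool, is_incremental: bool) -> int:
    """Dispatch to the V2 pipeline or the legacy DLT pipeline; only the V1 path updates rows_synced here."""
    if on_v2_queue:
        PipelineNonDLT(source, logger, run_id, is_incremental).run()
        return 0
    table_row_counts = DataImportPipelineSync(job_inputs, source, logger, reset_pipeline, is_incremental).run()
    total_rows_synced = sum(table_row_counts.values())
    # In the real activity this becomes an F("rows_synced") + total update on ExternalDataJob.
    return total_rows_synced


print(run_job(False, None, None, None, "run-1", False, True))
```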
posthog.hogql.query import execute_hogql_query from posthog.hogql_queries.insights.funnels.funnel import Funnel @@ -99,6 +99,19 @@ async def minio_client(): yield minio_client +def pytest_generate_tests(metafunc): + if "task_queue" in metafunc.fixturenames: + metafunc.parametrize("task_queue", [DATA_WAREHOUSE_TASK_QUEUE, DATA_WAREHOUSE_TASK_QUEUE_V2], indirect=True) + + +@pytest.fixture(autouse=True) +def task_queue(request): + queue = getattr(request, "param", None) + + with override_settings(TEMPORAL_TASK_QUEUE=queue): + yield + + async def _run( team: Team, schema_name: str, @@ -142,18 +155,23 @@ async def _run( assert run.status == ExternalDataJob.Status.COMPLETED await sync_to_async(schema.refresh_from_db)() - assert schema.last_synced_at == run.created_at - res = await sync_to_async(execute_hogql_query)(f"SELECT * FROM {table_name}", team) - assert len(res.results) == 1 + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + assert schema.last_synced_at == run.created_at + else: + assert schema.last_synced_at is None - for name, field in external_tables.get(table_name, {}).items(): - if field.hidden: - continue - assert name in (res.columns or []) + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)(f"SELECT * FROM {table_name}", team) + assert len(res.results) == 1 - await sync_to_async(source.refresh_from_db)() - assert source.job_inputs.get("reset_pipeline", None) is None + for name, field in external_tables.get(table_name, {}).items(): + if field.hidden: + continue + assert name in (res.columns or []) + + await sync_to_async(source.refresh_from_db)() + assert source.job_inputs.get("reset_pipeline", None) is None return workflow_id, inputs @@ -203,11 +221,12 @@ def mock_to_object_store_rs_credentials(class_self): ), mock.patch.object(AwsCredentials, "to_session_credentials", mock_to_session_credentials), mock.patch.object(AwsCredentials, "to_object_store_rs_credentials", mock_to_object_store_rs_credentials), + mock.patch("posthog.temporal.data_imports.external_data_job.trigger_pipeline_v2"), ): async with await WorkflowEnvironment.start_time_skipping() as activity_environment: async with Worker( activity_environment.client, - task_queue=DATA_WAREHOUSE_TASK_QUEUE, + task_queue=settings.TEMPORAL_TASK_QUEUE, workflows=[ExternalDataJobWorkflow], activities=ACTIVITIES, # type: ignore workflow_runner=UnsandboxedWorkflowRunner(), @@ -218,7 +237,7 @@ def mock_to_object_store_rs_credentials(class_self): ExternalDataJobWorkflow.run, inputs, id=workflow_id, - task_queue=DATA_WAREHOUSE_TASK_QUEUE, + task_queue=settings.TEMPORAL_TASK_QUEUE, retry_policy=RetryPolicy(maximum_attempts=1), ) @@ -525,12 +544,13 @@ async def test_postgres_binary_columns(team, postgres_config, postgres_connectio mock_data_response=[], ) - res = await sync_to_async(execute_hogql_query)(f"SELECT * FROM postgres_binary_col_test", team) - columns = res.columns + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)(f"SELECT * FROM postgres_binary_col_test", team) + columns = res.columns - assert columns is not None - assert len(columns) == 1 - assert columns[0] == "id" + assert columns is not None + assert len(columns) == 1 + assert columns[0] == "id" @pytest.mark.django_db(transaction=True) @@ -558,9 +578,14 @@ def get_jobs(): latest_job = jobs[0] folder_path = await sync_to_async(latest_job.folder_path)() - s3_objects = await minio_client.list_objects_v2( - Bucket=BUCKET_NAME, 
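The end-to-end tests are parametrised over both task queues via `pytest_generate_tests` plus an indirect, autouse fixture, so every existing test also exercises the V2 worker simply by overriding `TEMPORAL_TASK_QUEUE`. A standalone sketch of the same pattern, using a toy settings object instead of Django's:

```python
import contextlib

import pytest

QUEUE_V1 = "data-warehouse"
QUEUE_V2 = "data-warehouse-v2"


class _Settings:
    TEMPORAL_TASK_QUEUE = QUEUE_V1


settings = _Settings()


@contextlib.contextmanager
def override_settings(**kwargs):
    """Tiny stand-in for django.test.override_settings."""
    old = {key: getattr(settings, key) for key in kwargs}
    for key, value in kwargs.items():
        setattr(settings, key, value)
    try:
        yield
    finally:
        for key, value in old.items():
            setattr(settings, key, value)


def pytest_generate_tests(metafunc):
    # Every test that sees the fixture runs once per queue.
    if "task_queue" in metafunc.fixturenames:
        metafunc.parametrize("task_queue", [QUEUE_V1, QUEUE_V2], indirect=True)


@pytest.fixture(autouse=True)
def task_queue(request):
    queue = getattr(request, "param", None)
    with override_settings(TEMPORAL_TASK_QUEUE=queue):
        yield queue


def test_runs_on_both_queues(task_queue):
    assert settings.TEMPORAL_TASK_QUEUE in (QUEUE_V1, QUEUE_V2)
```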
Prefix=f"{folder_path}/balance_transaction__query/" - ) + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + s3_objects = await minio_client.list_objects_v2( + Bucket=BUCKET_NAME, Prefix=f"{folder_path}/balance_transaction__query/" + ) + else: + s3_objects = await minio_client.list_objects_v2( + Bucket=BUCKET_NAME, Prefix=f"{folder_path}/balance_transaction__query_v2/" + ) assert len(s3_objects["Contents"]) != 0 @@ -587,23 +612,24 @@ async def test_funnels_lazy_joins_ordering(team, stripe_customer): field_name="stripe_customer", ) - query = FunnelsQuery( - series=[EventsNode(), EventsNode()], - breakdownFilter=BreakdownFilter( - breakdown_type=BreakdownType.DATA_WAREHOUSE_PERSON_PROPERTY, breakdown="stripe_customer.email" - ), - ) - funnel_class = Funnel(context=FunnelQueryContext(query=query, team=team)) - - query_ast = funnel_class.get_query() - await sync_to_async(execute_hogql_query)( - query_type="FunnelsQuery", - query=query_ast, - team=team, - modifiers=create_default_modifiers_for_team( - team, HogQLQueryModifiers(personsOnEventsMode=PersonsOnEventsMode.PERSON_ID_OVERRIDE_PROPERTIES_JOINED) - ), - ) + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + query = FunnelsQuery( + series=[EventsNode(), EventsNode()], + breakdownFilter=BreakdownFilter( + breakdown_type=BreakdownType.DATA_WAREHOUSE_PERSON_PROPERTY, breakdown="stripe_customer.email" + ), + ) + funnel_class = Funnel(context=FunnelQueryContext(query=query, team=team)) + + query_ast = funnel_class.get_query() + await sync_to_async(execute_hogql_query)( + query_type="FunnelsQuery", + query=query_ast, + team=team, + modifiers=create_default_modifiers_for_team( + team, HogQLQueryModifiers(personsOnEventsMode=PersonsOnEventsMode.PERSON_ID_OVERRIDE_PROPERTIES_JOINED) + ), + ) @pytest.mark.django_db(transaction=True) @@ -636,12 +662,13 @@ async def test_postgres_schema_evolution(team, postgres_config, postgres_connect sync_type_config={"incremental_field": "id", "incremental_field_type": "integer"}, ) - res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) - columns = res.columns + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) + columns = res.columns - assert columns is not None - assert len(columns) == 1 - assert any(x == "id" for x in columns) + assert columns is not None + assert len(columns) == 1 + assert any(x == "id" for x in columns) # Evole schema await postgres_connection.execute( @@ -655,18 +682,20 @@ async def test_postgres_schema_evolution(team, postgres_config, postgres_connect # Execute the same schema again - load await _execute_run(str(uuid.uuid4()), inputs, []) - res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) - columns = res.columns + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) + columns = res.columns - assert columns is not None - assert len(columns) == 2 - assert any(x == "id" for x in columns) - assert any(x == "new_col" for x in columns) + assert columns is not None + assert len(columns) == 2 + assert any(x == "id" for x in columns) + assert any(x == "new_col" for x in columns) @pytest.mark.django_db(transaction=True) @pytest.mark.asyncio async def test_sql_database_missing_incremental_values(team, postgres_config, postgres_connection): + await postgres_connection.execute("CREATE SCHEMA IF NOT 
EXISTS {schema}".format(schema=postgres_config["schema"])) await postgres_connection.execute( "CREATE TABLE IF NOT EXISTS {schema}.test_table (id integer)".format(schema=postgres_config["schema"]) ) @@ -697,15 +726,16 @@ async def test_sql_database_missing_incremental_values(team, postgres_config, po sync_type_config={"incremental_field": "id", "incremental_field_type": "integer"}, ) - res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) - columns = res.columns + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) + columns = res.columns - assert columns is not None - assert len(columns) == 1 - assert any(x == "id" for x in columns) + assert columns is not None + assert len(columns) == 1 + assert any(x == "id" for x in columns) - # Exclude rows that don't have the incremental cursor key set - assert len(res.results) == 1 + # Exclude rows that don't have the incremental cursor key set + assert len(res.results) == 1 @pytest.mark.django_db(transaction=True) @@ -739,15 +769,16 @@ async def test_sql_database_incremental_initial_value(team, postgres_config, pos sync_type_config={"incremental_field": "id", "incremental_field_type": "integer"}, ) - res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) - columns = res.columns + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + res = await sync_to_async(execute_hogql_query)("SELECT * FROM postgres_test_table", team) + columns = res.columns - assert columns is not None - assert len(columns) == 1 - assert any(x == "id" for x in columns) + assert columns is not None + assert len(columns) == 1 + assert any(x == "id" for x in columns) - # Include rows that have the same incremental value as the `initial_value` - assert len(res.results) == 1 + # Include rows that have the same incremental value as the `initial_value` + assert len(res.results) == 1 @pytest.mark.django_db(transaction=True) @@ -1007,7 +1038,8 @@ async def test_delta_table_deleted(team, stripe_balance_transaction): sync_type=ExternalDataSchema.SyncType.FULL_REFRESH, ) - with mock.patch.object(DeltaTable, "delete") as mock_delta_table_delete: - await _execute_run(str(uuid.uuid4()), inputs, stripe_balance_transaction["data"]) + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE: + with mock.patch.object(DeltaTable, "delete") as mock_delta_table_delete: + await _execute_run(str(uuid.uuid4()), inputs, stripe_balance_transaction["data"]) - mock_delta_table_delete.assert_called_once() + mock_delta_table_delete.assert_called_once() diff --git a/posthog/temporal/tests/external_data/test_external_data_job.py b/posthog/temporal/tests/external_data/test_external_data_job.py index f931c97f93943..103513662daeb 100644 --- a/posthog/temporal/tests/external_data/test_external_data_job.py +++ b/posthog/temporal/tests/external_data/test_external_data_job.py @@ -149,6 +149,7 @@ def _create_external_data_job( rows_synced=0, workflow_id=workflow_id, workflow_run_id=workflow_run_id, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) return job @@ -391,6 +392,7 @@ def setup_job_1(): status=ExternalDataJob.Status.RUNNING, rows_synced=0, schema=customer_schema, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) new_job = ExternalDataJob.objects.get(id=new_job.id) @@ -423,6 +425,7 @@ def setup_job_2(): status=ExternalDataJob.Status.RUNNING, rows_synced=0, schema=charge_schema, + 
pipeline_version=ExternalDataJob.PipelineVersion.V1, ) new_job = ExternalDataJob.objects.get(id=new_job.id) @@ -565,6 +568,7 @@ def setup_job_1(): status=ExternalDataJob.Status.RUNNING, rows_synced=0, schema=customer_schema, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) new_job = ( @@ -764,6 +768,7 @@ async def setup_job_1(): status=ExternalDataJob.Status.RUNNING, rows_synced=0, schema=posthog_test_schema, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) new_job = await sync_to_async( diff --git a/posthog/warehouse/api/external_data_source.py b/posthog/warehouse/api/external_data_source.py index 448c06533bf19..28b1ebda1bf2e 100644 --- a/posthog/warehouse/api/external_data_source.py +++ b/posthog/warehouse/api/external_data_source.py @@ -1229,7 +1229,11 @@ def jobs(self, request: Request, *arg: Any, **kwargs: Any): after = request.query_params.get("after", None) before = request.query_params.get("before", None) - jobs = instance.jobs.prefetch_related("schema").order_by("-created_at") + jobs = ( + instance.jobs.exclude(pipeline_version=ExternalDataJob.PipelineVersion.V2) + .prefetch_related("schema") + .order_by("-created_at") + ) if after: after_date = parser.parse(after) diff --git a/posthog/warehouse/api/test/test_external_data_source.py b/posthog/warehouse/api/test/test_external_data_source.py index f638700822af8..3fede72455ebd 100644 --- a/posthog/warehouse/api/test/test_external_data_source.py +++ b/posthog/warehouse/api/test/test_external_data_source.py @@ -704,6 +704,7 @@ def test_source_jobs(self): status=ExternalDataJob.Status.COMPLETED, rows_synced=100, workflow_run_id="test_run_id", + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) response = self.client.get( @@ -720,6 +721,28 @@ def test_source_jobs(self): assert data[0]["schema"]["id"] == str(schema.pk) assert data[0]["workflow_run_id"] is not None + def test_source_jobs_v2_job(self): + source = self._create_external_data_source() + schema = self._create_external_data_schema(source.pk) + ExternalDataJob.objects.create( + team=self.team, + pipeline=source, + schema=schema, + status=ExternalDataJob.Status.COMPLETED, + rows_synced=100, + workflow_run_id="test_run_id", + pipeline_version=ExternalDataJob.PipelineVersion.V2, + ) + + response = self.client.get( + f"/api/projects/{self.team.pk}/external_data_sources/{source.pk}/jobs", + ) + + data = response.json() + + assert response.status_code, status.HTTP_200_OK + assert len(data) == 0 + def test_source_jobs_pagination(self): source = self._create_external_data_source() schema = self._create_external_data_schema(source.pk) @@ -731,6 +754,7 @@ def test_source_jobs_pagination(self): status=ExternalDataJob.Status.COMPLETED, rows_synced=100, workflow_run_id="test_run_id", + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) response = self.client.get( @@ -752,6 +776,7 @@ def test_source_jobs_pagination(self): status=ExternalDataJob.Status.COMPLETED, rows_synced=100, workflow_run_id="test_run_id", + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) response = self.client.get( @@ -773,6 +798,7 @@ def test_source_jobs_pagination(self): status=ExternalDataJob.Status.COMPLETED, rows_synced=100, workflow_run_id="test_run_id", + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) response = self.client.get( diff --git a/posthog/warehouse/api/test/test_log_entry.py b/posthog/warehouse/api/test/test_log_entry.py index c7ed98c572f72..14564015c230d 100644 --- a/posthog/warehouse/api/test/test_log_entry.py +++ b/posthog/warehouse/api/test/test_log_entry.py @@ 
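On the API side, V2 jobs are treated as shadow runs: the `jobs` endpoint now excludes them, and the new `test_source_jobs_v2_job` expects an empty listing for a V2-only job (as written, its `assert response.status_code, status.HTTP_200_OK` line only asserts truthiness, since the second expression becomes the assert message, so an equality check appears to be intended). A toy queryset showing the exclusion semantics:

```python
from typing import Any


class JobQuerySet:
    """Toy stand-in for the ORM queryset used by the jobs endpoint."""

    def __init__(self, jobs: list[dict[str, Any]]):
        self._jobs = jobs

    def exclude(self, **kwargs) -> "JobQuerySet":
        # Keep a job if it differs from the excluded values on any field (Django exclude semantics).
        return JobQuerySet([j for j in self._jobs if any(j.get(k) != v for k, v in kwargs.items())])

    def order_by(self, key: str) -> list[dict[str, Any]]:
        reverse = key.startswith("-")
        return sorted(self._jobs, key=lambda j: j[key.lstrip("-")], reverse=reverse)


jobs = JobQuerySet([
    {"id": 1, "pipeline_version": "v1-dlt-sync", "created_at": 1},
    {"id": 2, "pipeline_version": "v2-non-dlt", "created_at": 2},  # shadow run, hidden from the API
])
visible = jobs.exclude(pipeline_version="v2-non-dlt").order_by("-created_at")
assert [j["id"] for j in visible] == [1]
```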
-91,7 +91,13 @@ def external_data_resources(client, organization, team): # No status but should be completed because a data warehouse table already exists ) job = ExternalDataJob.objects.create( - pipeline=source, schema=schema, workflow_id="fake_workflow_id", team=team, status="Running", rows_synced=100000 + pipeline=source, + schema=schema, + workflow_id="fake_workflow_id", + team=team, + status="Running", + rows_synced=100000, + pipeline_version=ExternalDataJob.PipelineVersion.V1, ) return { diff --git a/posthog/warehouse/models/external_data_job.py b/posthog/warehouse/models/external_data_job.py index ae7b642494966..d9949e00d4423 100644 --- a/posthog/warehouse/models/external_data_job.py +++ b/posthog/warehouse/models/external_data_job.py @@ -15,6 +15,10 @@ class Status(models.TextChoices): COMPLETED = "Completed", "Completed" CANCELLED = "Cancelled", "Cancelled" + class PipelineVersion(models.TextChoices): + V1 = "v1-dlt-sync", "v1-dlt-sync" + V2 = "v2-non-dlt", "v2-non-dlt" + team = models.ForeignKey(Team, on_delete=models.CASCADE) pipeline = models.ForeignKey("posthog.ExternalDataSource", related_name="jobs", on_delete=models.CASCADE) schema = models.ForeignKey("posthog.ExternalDataSchema", on_delete=models.CASCADE, null=True, blank=True) @@ -25,6 +29,8 @@ class Status(models.TextChoices): workflow_id = models.CharField(max_length=400, null=True, blank=True) workflow_run_id = models.CharField(max_length=400, null=True, blank=True) + pipeline_version = models.CharField(max_length=400, choices=PipelineVersion.choices, null=True, blank=True) + __repr__ = sane_repr("id") def folder_path(self) -> str: @@ -35,9 +41,17 @@ def folder_path(self) -> str: def url_pattern_by_schema(self, schema: str) -> str: if TEST: - return f"http://{settings.AIRBYTE_BUCKET_DOMAIN}/{settings.BUCKET}/{self.folder_path()}/{schema.lower()}/" + if self.pipeline_version == ExternalDataJob.PipelineVersion.V1: + return ( + f"http://{settings.AIRBYTE_BUCKET_DOMAIN}/{settings.BUCKET}/{self.folder_path()}/{schema.lower()}/" + ) + else: + return f"http://{settings.AIRBYTE_BUCKET_DOMAIN}/{settings.BUCKET}/{self.folder_path()}/{schema.lower()}__v2/" + + if self.pipeline_version == ExternalDataJob.PipelineVersion.V1: + return f"https://{settings.AIRBYTE_BUCKET_DOMAIN}/dlt/{self.folder_path()}/{schema.lower()}/" - return f"https://{settings.AIRBYTE_BUCKET_DOMAIN}/dlt/{self.folder_path()}/{schema.lower()}/" + return f"https://{settings.AIRBYTE_BUCKET_DOMAIN}/dlt/{self.folder_path()}/{schema.lower()}__v2/" @database_sync_to_async diff --git a/posthog/warehouse/models/external_data_schema.py b/posthog/warehouse/models/external_data_schema.py index 3cb3fcfbce33c..ba07884346912 100644 --- a/posthog/warehouse/models/external_data_schema.py +++ b/posthog/warehouse/models/external_data_schema.py @@ -8,6 +8,7 @@ import numpy import snowflake.connector from django.conf import settings +from posthog.constants import DATA_WAREHOUSE_TASK_QUEUE_V2 from posthog.models.team import Team from posthog.models.utils import CreatedMetaFields, DeletedMetaFields, UUIDModel, UpdatedMetaFields, sane_repr import uuid @@ -51,6 +52,8 @@ class SyncFrequency(models.TextChoices): status = models.CharField(max_length=400, null=True, blank=True) last_synced_at = models.DateTimeField(null=True, blank=True) sync_type = models.CharField(max_length=128, choices=SyncType.choices, null=True, blank=True) + + # { "incremental_field": string, "incremental_field_type": string, "incremental_field_last_value": any, "incremental_field_last_value_v2": any } 
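The `ExternalDataJob` model now records which pipeline produced a job, and the schema URL pattern is derived from that version, so V1 and V2 tables resolve to different S3 prefixes. A simplified, non-Django sketch of the version-aware pattern:

```python
from enum import Enum


class PipelineVersion(str, Enum):
    V1 = "v1-dlt-sync"
    V2 = "v2-non-dlt"


def url_pattern_by_schema(bucket_domain: str, folder_path: str, schema: str,
                          pipeline_version: PipelineVersion) -> str:
    """Version-aware S3 prefix: V2 tables live under a `<schema>__v2/` folder."""
    suffix = "" if pipeline_version == PipelineVersion.V1 else "__v2"
    return f"https://{bucket_domain}/dlt/{folder_path}/{schema.lower()}{suffix}/"


print(url_pattern_by_schema("bucket.example.com", "team_1/job_1", "Customers", PipelineVersion.V1))
print(url_pattern_by_schema("bucket.example.com", "team_1/job_1", "Customers", PipelineVersion.V2))
```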
sync_type_config = models.JSONField( default=dict, blank=True, @@ -70,11 +73,6 @@ def folder_path(self) -> str: def is_incremental(self): return self.sync_type == self.SyncType.INCREMENTAL - def soft_delete(self): - self.deleted = True - self.deleted_at = datetime.now() - self.save() - def update_incremental_field_last_value(self, last_value: Any) -> None: incremental_field_type = self.sync_type_config.get("incremental_field_type") @@ -93,7 +91,17 @@ def update_incremental_field_last_value(self, last_value: Any) -> None: else: last_value_json = str(last_value_py) - self.sync_type_config["incremental_field_last_value"] = last_value_json + if settings.TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2: + key = "incremental_field_last_value_v2" + else: + key = "incremental_field_last_value" + + self.sync_type_config[key] = last_value_json + self.save() + + def soft_delete(self): + self.deleted = True + self.deleted_at = datetime.now() self.save() diff --git a/posthog/warehouse/models/external_table_definitions.py b/posthog/warehouse/models/external_table_definitions.py index 00704ec6c3994..4294cc6003836 100644 --- a/posthog/warehouse/models/external_table_definitions.py +++ b/posthog/warehouse/models/external_table_definitions.py @@ -16,6 +16,7 @@ "*": { "__dlt_id": StringDatabaseField(name="_dlt_id", hidden=True), "__dlt_load_id": StringDatabaseField(name="_dlt_load_id", hidden=True), + "__ph_debug": StringJSONDatabaseField(name="_ph_debug", hidden=True), }, "stripe_account": { "id": StringDatabaseField(name="id"), diff --git a/posthog/warehouse/models/table.py b/posthog/warehouse/models/table.py index f5bdb94b246eb..0f960d2648c8d 100644 --- a/posthog/warehouse/models/table.py +++ b/posthog/warehouse/models/table.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Optional, TypeAlias +from typing import TYPE_CHECKING, Optional, TypeAlias from django.db import models from posthog.client import sync_execute @@ -29,6 +29,9 @@ from .external_table_definitions import external_tables from posthog.hogql.context import HogQLContext +if TYPE_CHECKING: + from posthog.warehouse.models import ExternalDataJob + SERIALIZED_FIELD_TO_CLICKHOUSE_MAPPING: dict[DatabaseSerializedFieldType, str] = { DatabaseSerializedFieldType.INTEGER: "Int64", DatabaseSerializedFieldType.FLOAT: "Float64", @@ -138,7 +141,11 @@ def validate_column_type(self, column_key) -> bool: except: return False - def get_columns(self, safe_expose_ch_error=True) -> DataWarehouseTableColumns: + def get_columns( + self, + pipeline_version: Optional["ExternalDataJob.PipelineVersion"] = None, + safe_expose_ch_error: bool = True, + ) -> DataWarehouseTableColumns: try: placeholder_context = HogQLContext(team_id=self.team.pk) s3_table_func = build_function_call( @@ -147,6 +154,7 @@ def get_columns(self, safe_expose_ch_error=True) -> DataWarehouseTableColumns: access_key=self.credential.access_key, access_secret=self.credential.access_secret, context=placeholder_context, + pipeline_version=pipeline_version, ) result = sync_execute( diff --git a/requirements.in b/requirements.in index 16ee79fb66e03..faefd16d9294d 100644 --- a/requirements.in +++ b/requirements.in @@ -14,6 +14,7 @@ celery==5.3.4 celery-redbeat==2.1.1 clickhouse-driver==0.2.7 clickhouse-pool==0.5.3 +conditional-cache==1.2 cryptography==39.0.2 dj-database-url==0.5.0 Django~=4.2.15 diff --git a/requirements.txt b/requirements.txt index cd9ac20220391..639c98066ccd4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -95,6 +95,8 @@ charset-normalizer==2.1.0 # via # 
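Finally, the schema keeps one cursor per pipeline: a worker on the V2 queue writes `incremental_field_last_value_v2` and leaves the V1 key untouched, which is what makes the earlier V2-then-V1 fallback safe. A reduced sketch, with the queue constant and setting values assumed:

```python
from typing import Any

DATA_WAREHOUSE_TASK_QUEUE_V2 = "data-warehouse-task-queue-v2"
TEMPORAL_TASK_QUEUE = DATA_WAREHOUSE_TASK_QUEUE_V2  # assumed worker setting


def update_incremental_field_last_value(sync_type_config: dict[str, Any], last_value: Any) -> None:
    """Write the cursor under a queue-specific key so the V1 and V2 pipelines never clobber each other."""
    key = (
        "incremental_field_last_value_v2"
        if TEMPORAL_TASK_QUEUE == DATA_WAREHOUSE_TASK_QUEUE_V2
        else "incremental_field_last_value"
    )
    sync_type_config[key] = str(last_value)


config = {"incremental_field_last_value": "100"}
update_incremental_field_last_value(config, 250)
assert config == {"incremental_field_last_value": "100", "incremental_field_last_value_v2": "250"}
```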
requests # snowflake-connector-python +circular-dict==1.9 + # via conditional-cache click==8.1.7 # via # celery @@ -115,6 +117,8 @@ clickhouse-driver==0.2.7 # sentry-sdk clickhouse-pool==0.5.3 # via -r requirements.in +conditional-cache==1.2 + # via -r requirements.in cryptography==39.0.2 # via # -r requirements.in