From f1793c9576b35f2ffb97bb59b35069a0eb4c2867 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 10 Dec 2024 10:55:38 +0100 Subject: [PATCH] Release 0.2.0 (#479) This is a draft PR for the 0.2.0 release, all it really contains is a changelog with a curated list of user-facing changes and some doc changes for those features. It also starts to build docker images for PG14. --------- Co-authored-by: Jonathan Dance (JD) --- .github/workflows/docker.yaml | 2 +- CHANGELOG.md | 49 ++++++++++++++++++++++++++++++++- README.md | 4 +-- docker-bake.hcl | 8 ++++++ docs/README.md | 3 ++- docs/compilation.md | 2 +- docs/extensions.md | 11 ++++++-- docs/functions.md | 51 ++++++++++++++++++++++++++++++++--- docs/secrets.md | 37 +++++++++++++++++++++++-- docs/settings.md | 14 ++++++++++ docs/transactions.md | 5 ++++ docs/types.md | 2 +- 12 files changed, 173 insertions(+), 15 deletions(-) create mode 100644 docs/transactions.md diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 67d8f7e2..698ff0a6 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -18,7 +18,7 @@ jobs: runs-on: ubuntu-24.04 strategy: matrix: - postgres: ["15", "16", "17"] + postgres: ["14", "15", "16", "17"] steps: - name: Login to Docker Hub diff --git a/CHANGELOG.md b/CHANGELOG.md index 69afd520..ee99f051 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,50 @@ -# 0.1.0 +# 0.2.0 (2024-12-10) + +## Added + +- Support for reading Delta Lake storage using the `duckdb.delta_scan(...)` function. ([#403]) +- Support for reading JSON using the `duckdb.read_json(...)` function. ([#405]) +- Support for multi-statement transactions. ([#433]) +- Support reading from Azure Blob storage. ([#478]) +- Support many more array types, such as `float` , `numeric` and `uuid` arrays. ([#282]) +- Support for PostgreSQL 14. ([#397]) +- Manage cached files using the `duckdb.cache_info()` and `duckdb.cache_delete()` functions. 
([#434]) +- Add `scope` column to `duckdb.secrets` table. ([#461]) +- Allow configuring the default MotherDuck database using the `duckdb.motherduck_default_database` setting. ([#470]) +- Automatically install and load known DuckDB extensions when queries use them. So, `duckdb.install_extension()` is usually not necessary anymore. ([#484]) + +## Changed + +- Improve performance of heap reading. ([#366]) +- Bump DuckDB version to 1.1.3. ([#400]) + +## Fixed + +- Throw a clear error when reading partitioned tables (reading from partitioned tables is not supported yet). ([#412]) +- Fixed crash when using `CREATE SCHEMA AUTHORIZATION`. ([#423]) +- Fix queries inserting into DuckDB tables with `DEFAULT` values. ([#448]) +- Fixed assertion failure involving recursive CTEs. ([#436]) +- Only allow setting `duckdb.motherduck_postgres_database` in `postgresql.conf`. ([#476]) +- Much better separation between C and C++ code, to avoid memory leaks and crashes (many PRs). + +[#403]: https://github.com/duckdb/pg_duckdb/pull/403 +[#405]: https://github.com/duckdb/pg_duckdb/pull/405 +[#433]: https://github.com/duckdb/pg_duckdb/pull/433 +[#478]: https://github.com/duckdb/pg_duckdb/pull/478 +[#282]: https://github.com/duckdb/pg_duckdb/pull/282 +[#397]: https://github.com/duckdb/pg_duckdb/pull/397 +[#434]: https://github.com/duckdb/pg_duckdb/pull/434 +[#461]: https://github.com/duckdb/pg_duckdb/pull/461 +[#470]: https://github.com/duckdb/pg_duckdb/pull/470 +[#366]: https://github.com/duckdb/pg_duckdb/pull/366 +[#400]: https://github.com/duckdb/pg_duckdb/pull/400 +[#412]: https://github.com/duckdb/pg_duckdb/pull/412 +[#423]: https://github.com/duckdb/pg_duckdb/pull/423 +[#448]: https://github.com/duckdb/pg_duckdb/pull/448 +[#436]: https://github.com/duckdb/pg_duckdb/pull/436 +[#476]: https://github.com/duckdb/pg_duckdb/pull/476 +[#484]: https://github.com/duckdb/pg_duckdb/pull/484 + +# 0.1.0 (2024-10-24) Initial release. 
diff --git a/README.md b/README.md index 6c9d394c..87719b5a 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ Note: due to the use of `shared_preload_libraries`, pgxman's container support i To build pg_duckdb, you need: -* Postgres 15-17 +* Postgres 14-17 * Ubuntu 22.04-24.04 or MacOS * Standard set of build tools for building Postgres extensions * [Build tools that are required to build DuckDB](https://duckdb.org/docs/dev/building/build_instructions) @@ -164,7 +164,7 @@ INSERT INTO duckdb.secrets VALUES ('Azure', ''); ``` -Note: writes to Azure are not yet supported, [here][duckdb/duckdb_azure#44] is the current discussion for more information. +Note: writes to Azure are not yet supported, please see [the current discussion](duckdb/duckdb_azure#44) for more information. ### Connect with MotherDuck diff --git a/docker-bake.hcl b/docker-bake.hcl index d4e6ac22..d86a3256 100644 --- a/docker-bake.hcl +++ b/docker-bake.hcl @@ -34,6 +34,14 @@ target "pg_duckdb" { target = "output" } +target "pg_duckdb_14" { + inherits = ["pg_duckdb"] + + args = { + POSTGRES_VERSION = "14" + } +} + target "pg_duckdb_15" { inherits = ["pg_duckdb"] diff --git a/docs/README.md b/docs/README.md index 1e19a2f9..abccefcb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -3,10 +3,11 @@ Documentation is a work in progress. Search for `TODO` to see areas that need further contribution. 
* [Functions](functions.md) -* [Motherduck](motherduck.md) +* [MotherDuck](motherduck.md) * [Data Lake Secrets Management](secrets.md) * [Settings](settings.md) * [Extensions](extensions.md) * [Types](types.md) +* [Transactions](transactions.md) * [Compiling from source](compilation.md) * [Changelog](../CHANGELOG.md) diff --git a/docs/compilation.md b/docs/compilation.md index fef78e96..9b15c903 100644 --- a/docs/compilation.md +++ b/docs/compilation.md @@ -2,7 +2,7 @@ To build pg_duckdb, you need: -* Postgres 15-17 +* Postgres 14-17 * Ubuntu 22.04-24.04 or MacOS * Standard set of build tools for building Postgres extensions * [Build tools that are required to build DuckDB](https://duckdb.org/docs/dev/building/build_instructions) diff --git a/docs/extensions.md b/docs/extensions.md index 86ebb213..2475e05e 100644 --- a/docs/extensions.md +++ b/docs/extensions.md @@ -8,12 +8,15 @@ The following extensions are installed by default: Supported extensions for installation are: * iceberg +* delta Installing other extensions may work, but is at your own risk. ## Installing an extension -Installing an extension requires superuser. +By default known extensions are allowed to be automatically installed and loaded when a DuckDB query depends on them. This behaviour can be configured using the [`duckdb.autoinstall_known_extensions`](settings.md#duckdbautoinstall_known_extensions) and [`duckdb.autoload_known_extensions`](settings.md#duckdbautoload_known_extensions) settings. + +It's also possible to manually install an extension. This can be useful when this autoinstall/autoload behaviour is disabled, or when DuckDB fails to realise an extension is necessary to execute the query. Installing an extension requires superuser. ```sql SELECT duckdb.install_extension('extname'); @@ -40,4 +43,8 @@ There is currently no practical difference between a disabled and uninstalled ex ### `iceberg` -Iceberg support adds functions to read iceberg tables and metadata. 
For a list of iceberg functions, see [pg_duckdb Functions](functions.md). +Iceberg support adds functions to read Iceberg tables and metadata. For a list of iceberg functions, see [pg_duckdb Functions](functions.md). + +### `delta` + +Delta support adds the ability to read Delta Lake files via [delta_scan](functions.md#delta_scan). diff --git a/docs/functions.md b/docs/functions.md index 8e9e65b0..f803a330 100644 --- a/docs/functions.md +++ b/docs/functions.md @@ -16,11 +16,18 @@ Note: `ALTER EXTENSION pg_duckdb WITH SCHEMA schema` is not currently supported. | [`iceberg_snapshots`](#iceberg_snapshots) | Read Iceberg snapshot information | | [`delta_scan`](#delta_scan) | Read a Delta dataset | -## DuckDB Administration Functions +## Cache Management Functions | Name | Description | | :--- | :---------- | | [`duckdb.cache`](#cache) | Caches a Parquet or CSV file to disk | +| [`duckdb.cache_info`](#cache_info) | Returns metadata about cached files | +| [`duckdb.cache_delete`](#cache_delete) | Deletes a file from the cache | + +## DuckDB Administration Functions + +| Name | Description | +| :--- | :---------- | | [`duckdb.install_extension`](#install_extension) | Installs a DuckDB extension | | [`duckdb.raw_query`](#raw_query) | Runs a query directly against DuckDB (meant for debugging)| | [`duckdb.recycle_ddb`](#recycle_ddb) | Force a reset the DuckDB instance in the current connection (meant for debugging) | @@ -228,12 +235,48 @@ Further information: | :--- | :--- | :---------- | | path | text | The path, either to a remote httpfs location or a local location (if enabled) of the delta dataset to read. | -##### Optional Parameters +#### `duckdb.cache(path TEXT, type TEXT) -> bool` +Caches a parquet or CSV file to local disk. The file is downloaded synchronously during the execution of the function. Once cached, the cached file is used automatically whenever that URL is used in other httpfs calls, provided that the remote data has not changed. 
Data is stored based on the eTag of the remote file. -#### `duckdb.cache(path TEXT, /* optional parameters */) -> bool` +Note that cache management is not automated. Cached data must be deleted manually. -TODO +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| path | text | The path to a remote httpfs location to cache. | +| type | text | File type, either `parquet` or `csv` | + + +#### `duckdb.cache_info() -> (remote_path text, cache_key text, cache_file_size BIGINT, cache_file_timestamp TIMESTAMPTZ)` + +Inspects which remote files are currently cached in DuckDB. The returned data is as follows: + +| Name | Type | Description | +| :--- | :--- | :---------- | +| remote_path | text | The original path to the remote httpfs location that was cached | +| cache_key | text | The cache key (eTag) used to store the file | +| cache_file_size | bigint | File size in bytes | +| cache_file_timestamp | timestamptz | Creation time of the cached file | + +#### `duckdb.cache_delete(cache_key text) -> bool` + +Deletes a file from the DuckDB cache using the unique `cache_key` of the file. + +Example: To delete any copies of a particular remote file: + +```sql +SELECT duckdb.cache_delete(cache_key) +FROM duckdb.cache_info() +WHERE remote_path = '...'; +``` + +##### Required Arguments + +| Name | Type | Description | +| :--- | :--- | :---------- | +| cache_key | text | The cache key (eTag) to delete | #### `duckdb.install_extension(extension_name TEXT) -> bool` diff --git a/docs/secrets.md b/docs/secrets.md index cf4e2c83..cfb7808d 100644 --- a/docs/secrets.md +++ b/docs/secrets.md @@ -1,6 +1,6 @@ # Secrets -DuckDB secrets can be configured in the `duckdb.secrets` table: +DuckDB secrets can be configured in the `duckdb.secrets` table. 
For example: ```sql -- Session Token is Optional @@ -9,4 +9,37 @@ INSERT INTO duckdb.secrets VALUES ('S3', 'access_key_id', 'secret_access_key', 'session_token', 'us-east-1'); ``` -TODO: document `duckdb.secrets` in full detail +## Columns + +| Name | Type | Required | Description | +| :--- | :--- | :------- | :---------- | +| name | text | no | automatically generated UUID (primary key) | +| type | text | yes | One of `S3` for Amazon S3, `GCS` for Google Cloud Storage, `R2` for Cloudflare R2, or `Azure` for Azure Blob Storage. | +| key_id | text | S3, GCS, R2 | the "ID" portion of the secret | +| secret | text | S3, GCS, R2 | the "password" portion of the secret | +| session_token | text | no | the AWS S3 session token if required for your credential | +| region | text | S3 only | for AWS S3, this specifies the region of your bucket | +| endpoint | text | no | if using an S3-compatible service other than AWS, this specifies the endpoint of the service | +| r2_account_id | text | R2 only | if using Cloudflare R2, the account ID for the credential | +| use_ssl | boolean | no | `true` by default; `false` is principally for use with custom minio configurations | +| scope | text | no | The URL prefix which applies to this credential. This is used to [select between multiple credentials][scope] for the same service. | +| connection_string | text | Azure only | Connection string for Azure | + +[scope]: https://duckdb.org/docs/configuration/secrets_manager.html#creating-multiple-secrets-for-the-same-service-type + +## How it works + +Secrets are stored in a Postgres heap table. Each time a DuckDB instance is created by pg_duckdb, and when a secret is modified, the secrets are loaded into the DuckDB secrets manager as non-persistent secrets. + +## Caveats + +* Only the listed types of secrets above are currently supported. As of DuckDB 1.1.3, MySQL, Huggingface, and PostgreSQL secrets are not supported. +* Other authentication providers are not yet supported, e.g. `CHAIN`. 
+ +## Further reading + +* [DuckDB Secrets Manager](https://duckdb.org/docs/configuration/secrets_manager.html) +* [S3 API Support](https://duckdb.org/docs/extensions/httpfs/s3api.html) +* [Google Cloud Storage Import](https://duckdb.org/docs/guides/network_cloud_storage/gcs_import.html) +* [Cloudflare R2 Import](https://duckdb.org/docs/guides/network_cloud_storage/cloudflare_r2_import.html) +* [Azure Extension](https://duckdb.org/docs/extensions/azure) diff --git a/docs/settings.md b/docs/settings.md index 07ebc1f4..a73f1709 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -67,6 +67,20 @@ Default: `"LocalFileSystem"` Access: Superuser-only +### `duckdb.autoinstall_known_extensions` + +Whether known extensions are allowed to be automatically installed when a DuckDB query depends on them. + +Default: `true` + +Access: Superuser-only + +### `duckdb.autoload_known_extensions` + +Whether known extensions are allowed to be automatically loaded when a DuckDB query depends on them. + +Default: `true` + ### `duckdb.enable_external_access` (experimental) Allow the DuckDB to access external access (e.g., HTTP, S3, etc.). This setting is not tested very well yet and disabling it may break unintended `pg_duckdb` functionality. diff --git a/docs/transactions.md b/docs/transactions.md new file mode 100644 index 00000000..98a6e979 --- /dev/null +++ b/docs/transactions.md @@ -0,0 +1,5 @@ +# Transactions in pg_duckdb + +Multi-statement transactions are supported in pg_duckdb. There is one important restriction on this though, which is currently necessary to ensure the expected ACID guarantees: You cannot write to both a Postgres table and a DuckDB table in the same transaction. + +Sadly, this restriction also means that running DDL (e.g. `CREATE TABLE ... USING duckdb`) is currently not supported in transactions. This is due to the fact that this requires writing to metadata tables in both Postgres and DuckDB. 
diff --git a/docs/types.md b/docs/types.md index 03382ab4..92e6fef8 100644 --- a/docs/types.md +++ b/docs/types.md @@ -11,7 +11,7 @@ Able to read many [data types](https://www.postgresql.org/docs/current/datatype. - `boolean` - `uuid` - `json` -- `arrays` for some of the above types +- `arrays` for all of the above types ## Known limitations