From a26fc13988952c0271a73677607b37530d71cc1c Mon Sep 17 00:00:00 2001 From: mrdrivingduck Date: Sun, 19 Nov 2023 12:52:52 +0800 Subject: [PATCH 1/3] ci: use latest image --- .github/workflows/regression-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/regression-test.yml b/.github/workflows/regression-test.yml index 07e0046d492..1620e1ac35e 100644 --- a/.github/workflows/regression-test.yml +++ b/.github/workflows/regression-test.yml @@ -44,7 +44,7 @@ jobs: -t \ --name polardb_${{ matrix.container_image }} \ -v `pwd`:/home/postgres/PolarDB-for-PostgreSQL \ - mrdrivingduck/polardb_pg_devel:${{ matrix.container_image }} \ + polardb/polardb_pg_devel:${{ matrix.container_image }} \ bash && \ docker start polardb_${{ matrix.container_image }} From 6c35ac495c37df7214ad8a448ef61c76a968ba98 Mon Sep 17 00:00:00 2001 From: mrdrivingduck Date: Thu, 28 Dec 2023 14:15:14 +0800 Subject: [PATCH 2/3] docs: image tag update --- README-CN.md | 10 +++++----- README.md | 10 +++++----- docs/deploying/fs-pfs.md | 4 ++-- docs/operation/ro-online-promote.md | 4 ++-- docs/operation/scale-out.md | 8 ++++---- docs/operation/tpch-test.md | 4 ++-- docs/zh/deploying/fs-pfs.md | 4 ++-- docs/zh/operation/ro-online-promote.md | 4 ++-- docs/zh/operation/scale-out.md | 8 ++++---- docs/zh/operation/tpch-test.md | 4 ++-- 10 files changed, 30 insertions(+), 30 deletions(-) diff --git a/README-CN.md b/README-CN.md index f60dcf2538c..ffdce5c1b29 100644 --- a/README-CN.md +++ b/README-CN.md @@ -1,6 +1,6 @@
-[![logo](docs/.vuepress/public/images/polardb.png)](https://developer.aliyun.com/topic/polardb-for-pg) +[![logo](docs/.vuepress/public/images/polardb.png)](https://www.polardbpg.com/home) # PolarDB for PostgreSQL @@ -8,7 +8,7 @@ #### [English](README.md) | 简体中文 -[![official](https://img.shields.io/badge/官方网站-blueviolet?style=for-the-badge&logo=alibabacloud)](https://developer.aliyun.com/topic/polardb-for-pg) +[![official](https://img.shields.io/badge/官方网站-blueviolet?style=for-the-badge&logo=alibabacloud)](https://www.polardbpg.com/home) [![cirrus-ci-stable](https://img.shields.io/cirrus/github/ApsaraDB/PolarDB-for-PostgreSQL/POLARDB_11_STABLE?style=for-the-badge&logo=cirrusci)](https://cirrus-ci.com/github/ApsaraDB/PolarDB-for-PostgreSQL/POLARDB_11_STABLE) [![cirrus-ci-dev](https://img.shields.io/cirrus/github/ApsaraDB/PolarDB-for-PostgreSQL/POLARDB_11_DEV?style=for-the-badge&logo=cirrusci)](https://cirrus-ci.com/github/ApsaraDB/PolarDB-for-PostgreSQL/POLARDB_11_DEV) @@ -58,11 +58,11 @@ PolarDB 采用了基于 Shared-Storage 的存储计算分离架构。数据库 ```bash # 拉取单节点 PolarDB 镜像 -docker pull polardb/polardb_pg_local_instance:single +docker pull polardb/polardb_pg_local_instance # 创建运行并进入容器 -docker run -it --cap-add=SYS_PTRACE --privileged=true --name polardb_pg_single polardb/polardb_pg_local_instance:single bash +docker run -it --rm polardb/polardb_pg_local_instance psql # 测试实例可用性 -psql -h 127.0.0.1 -c 'select version();' +postgres=# SELECT version(); version -------------------------------- PostgreSQL 11.9 (POLARDB 11.9) diff --git a/README.md b/README.md index 32ea75da503..8de04444637 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@
-[![logo](docs/.vuepress/public/images/polardb.png)](https://developer.aliyun.com/topic/polardb-for-pg) +[![logo](docs/.vuepress/public/images/polardb.png)](https://www.polardbpg.com/home) # PolarDB for PostgreSQL @@ -8,7 +8,7 @@ #### English | [简体中文](README-CN.md) -[![official](https://img.shields.io/badge/official%20site-blueviolet?style=for-the-badge&logo=alibabacloud)](https://developer.aliyun.com/topic/polardb-for-pg) +[![official](https://img.shields.io/badge/official%20site-blueviolet?style=for-the-badge&logo=alibabacloud)](https://www.polardbpg.com/home) [![cirrus-ci-stable](https://img.shields.io/cirrus/github/ApsaraDB/PolarDB-for-PostgreSQL/POLARDB_11_STABLE?style=for-the-badge&logo=cirrusci)](https://cirrus-ci.com/github/ApsaraDB/PolarDB-for-PostgreSQL/POLARDB_11_STABLE) [![cirrus-ci-dev](https://img.shields.io/cirrus/github/ApsaraDB/PolarDB-for-PostgreSQL/POLARDB_11_DEV?style=for-the-badge&logo=cirrusci)](https://cirrus-ci.com/github/ApsaraDB/PolarDB-for-PostgreSQL/POLARDB_11_DEV) @@ -58,11 +58,11 @@ If you have Docker installed already,then you can pull the instance image of P ```bash # pull the instance image from DockerHub -docker pull polardb/polardb_pg_local_instance:single +docker pull polardb/polardb_pg_local_instance # create, run and enter the container -docker run -it --cap-add=SYS_PTRACE --privileged=true --name polardb_pg_single polardb/polardb_pg_local_instance:single bash +docker run -it --rm polardb/polardb_pg_local_instance psql # check -psql -h 127.0.0.1 -c 'select version();' +postgres=# SELECT version(); version -------------------------------- PostgreSQL 11.9 (POLARDB 11.9) diff --git a/docs/deploying/fs-pfs.md b/docs/deploying/fs-pfs.md index be7f4241afc..354d9d1f001 100644 --- a/docs/deploying/fs-pfs.md +++ b/docs/deploying/fs-pfs.md @@ -17,13 +17,13 @@ PolarDB File System,简称 PFS 或 PolarFS,是由阿里云自主研发的高 推荐使用 [DockerHub](https://hub.docker.com/u/polardb) 上的 PolarDB for PostgreSQL 
[可执行文件镜像](https://hub.docker.com/r/polardb/polardb_pg_binary/tags),目前支持 `linux/amd64` 和 `linux/arm64` 两种架构,其中已经包含了编译完毕的 PFS 工具,无需手动编译安装。通过以下命令进入容器即可: ```shell:no-line-numbers -docker pull polardb/polardb_pg_binary:pfs +docker pull polardb/polardb_pg_binary docker run -it \ --cap-add=SYS_PTRACE \ --privileged=true \ --name polardb_pg \ --shm-size=512m \ - polardb/polardb_pg_binary:pfs \ + polardb/polardb_pg_binary \ bash ``` diff --git a/docs/operation/ro-online-promote.md b/docs/operation/ro-online-promote.md index ea8d23b273f..c74dd48bf50 100644 --- a/docs/operation/ro-online-promote.md +++ b/docs/operation/ro-online-promote.md @@ -19,13 +19,13 @@ PolarDB for PostgreSQL 是一款存储与计算分离的云原生数据库,所 为方便起见,本示例使用基于本地磁盘的实例来进行演示。拉取如下镜像并启动容器,可以得到一个基于本地磁盘的 HTAP 实例: ```shell:no-line-numbers -docker pull polardb/polardb_pg_local_instance:htap +docker pull polardb/polardb_pg_local_instance docker run -it \ --cap-add=SYS_PTRACE \ --privileged=true \ --name polardb_pg_htap \ --shm-size=512m \ - polardb/polardb_pg_local_instance:htap \ + polardb/polardb_pg_local_instance \ bash ``` diff --git a/docs/operation/scale-out.md b/docs/operation/scale-out.md index ba95dfba7c2..6e97259903f 100644 --- a/docs/operation/scale-out.md +++ b/docs/operation/scale-out.md @@ -17,13 +17,13 @@ PolarDB for PostgreSQL 是一款存储与计算分离的数据库,所有计算 首先,在已经搭建完毕的共享存储集群上,初始化并启动第一个计算节点,即读写节点,该节点可以对共享存储进行读写。我们在下面的镜像中提供了已经编译完毕的 PolarDB for PostgreSQL 内核和周边工具的可执行文件: ```shell:no-line-numbers -$ docker pull polardb/polardb_pg_binary:pfs +$ docker pull polardb/polardb_pg_binary $ docker run -it \ --cap-add=SYS_PTRACE \ --privileged=true \ --name polardb_pg \ --shm-size=512m \ - polardb/polardb_pg_binary:pfs \ + polardb/polardb_pg_binary \ bash $ ls ~/tmp_basedir_polardb_pg_1100_bld/bin/ @@ -130,13 +130,13 @@ $HOME/tmp_basedir_polardb_pg_1100_bld/bin/psql \ 类似地,在用于部署新计算节点的机器上,拉取镜像并启动带有可执行文件的容器: ```shell:no-line-numbers -docker pull polardb/polardb_pg_binary:pfs +docker pull polardb/polardb_pg_binary docker run -it \ 
--cap-add=SYS_PTRACE \ --privileged=true \ --name polardb_pg \ --shm-size=512m \ - polardb/polardb_pg_binary:pfs \ + polardb/polardb_pg_binary \ bash ``` diff --git a/docs/operation/tpch-test.md b/docs/operation/tpch-test.md index 9001c082bdf..220559da726 100644 --- a/docs/operation/tpch-test.md +++ b/docs/operation/tpch-test.md @@ -23,13 +23,13 @@ minute: 20 使用 Docker 快速拉起一个基于本地存储的 PolarDB for PostgreSQL 集群: ```shell:no-line-numbers -docker pull polardb/polardb_pg_local_instance:htap +docker pull polardb/polardb_pg_local_instance docker run -it \ --cap-add=SYS_PTRACE \ --privileged=true \ --name polardb_pg_htap \ --shm-size=512m \ - polardb/polardb_pg_local_instance:htap \ + polardb/polardb_pg_local_instance \ bash ``` diff --git a/docs/zh/deploying/fs-pfs.md b/docs/zh/deploying/fs-pfs.md index be7f4241afc..354d9d1f001 100644 --- a/docs/zh/deploying/fs-pfs.md +++ b/docs/zh/deploying/fs-pfs.md @@ -17,13 +17,13 @@ PolarDB File System,简称 PFS 或 PolarFS,是由阿里云自主研发的高 推荐使用 [DockerHub](https://hub.docker.com/u/polardb) 上的 PolarDB for PostgreSQL [可执行文件镜像](https://hub.docker.com/r/polardb/polardb_pg_binary/tags),目前支持 `linux/amd64` 和 `linux/arm64` 两种架构,其中已经包含了编译完毕的 PFS 工具,无需手动编译安装。通过以下命令进入容器即可: ```shell:no-line-numbers -docker pull polardb/polardb_pg_binary:pfs +docker pull polardb/polardb_pg_binary docker run -it \ --cap-add=SYS_PTRACE \ --privileged=true \ --name polardb_pg \ --shm-size=512m \ - polardb/polardb_pg_binary:pfs \ + polardb/polardb_pg_binary \ bash ``` diff --git a/docs/zh/operation/ro-online-promote.md b/docs/zh/operation/ro-online-promote.md index ea8d23b273f..c74dd48bf50 100644 --- a/docs/zh/operation/ro-online-promote.md +++ b/docs/zh/operation/ro-online-promote.md @@ -19,13 +19,13 @@ PolarDB for PostgreSQL 是一款存储与计算分离的云原生数据库,所 为方便起见,本示例使用基于本地磁盘的实例来进行演示。拉取如下镜像并启动容器,可以得到一个基于本地磁盘的 HTAP 实例: ```shell:no-line-numbers -docker pull polardb/polardb_pg_local_instance:htap +docker pull polardb/polardb_pg_local_instance docker run -it \ --cap-add=SYS_PTRACE \ 
--privileged=true \ --name polardb_pg_htap \ --shm-size=512m \ - polardb/polardb_pg_local_instance:htap \ + polardb/polardb_pg_local_instance \ bash ``` diff --git a/docs/zh/operation/scale-out.md b/docs/zh/operation/scale-out.md index ba95dfba7c2..6e97259903f 100644 --- a/docs/zh/operation/scale-out.md +++ b/docs/zh/operation/scale-out.md @@ -17,13 +17,13 @@ PolarDB for PostgreSQL 是一款存储与计算分离的数据库,所有计算 首先,在已经搭建完毕的共享存储集群上,初始化并启动第一个计算节点,即读写节点,该节点可以对共享存储进行读写。我们在下面的镜像中提供了已经编译完毕的 PolarDB for PostgreSQL 内核和周边工具的可执行文件: ```shell:no-line-numbers -$ docker pull polardb/polardb_pg_binary:pfs +$ docker pull polardb/polardb_pg_binary $ docker run -it \ --cap-add=SYS_PTRACE \ --privileged=true \ --name polardb_pg \ --shm-size=512m \ - polardb/polardb_pg_binary:pfs \ + polardb/polardb_pg_binary \ bash $ ls ~/tmp_basedir_polardb_pg_1100_bld/bin/ @@ -130,13 +130,13 @@ $HOME/tmp_basedir_polardb_pg_1100_bld/bin/psql \ 类似地,在用于部署新计算节点的机器上,拉取镜像并启动带有可执行文件的容器: ```shell:no-line-numbers -docker pull polardb/polardb_pg_binary:pfs +docker pull polardb/polardb_pg_binary docker run -it \ --cap-add=SYS_PTRACE \ --privileged=true \ --name polardb_pg \ --shm-size=512m \ - polardb/polardb_pg_binary:pfs \ + polardb/polardb_pg_binary \ bash ``` diff --git a/docs/zh/operation/tpch-test.md b/docs/zh/operation/tpch-test.md index 9001c082bdf..220559da726 100644 --- a/docs/zh/operation/tpch-test.md +++ b/docs/zh/operation/tpch-test.md @@ -23,13 +23,13 @@ minute: 20 使用 Docker 快速拉起一个基于本地存储的 PolarDB for PostgreSQL 集群: ```shell:no-line-numbers -docker pull polardb/polardb_pg_local_instance:htap +docker pull polardb/polardb_pg_local_instance docker run -it \ --cap-add=SYS_PTRACE \ --privileged=true \ --name polardb_pg_htap \ --shm-size=512m \ - polardb/polardb_pg_local_instance:htap \ + polardb/polardb_pg_local_instance \ bash ``` From cbfe634a704186ff4cb1761ab10f3d05ce612125 Mon Sep 17 00:00:00 2001 From: mrdrivingduck Date: Thu, 28 Dec 2023 14:31:57 +0800 Subject: [PATCH 3/3] docs: pgvector and smlar --- 
docs/.vuepress/configs/navbar/zh.ts | 5 +- docs/.vuepress/configs/sidebar/zh.ts | 10 +- docs/zh/README.md | 3 +- docs/zh/features/README.md | 17 ++- docs/zh/features/v11/README.md | 3 +- docs/zh/features/v11/extensions/README.md | 4 + docs/zh/features/v11/extensions/pgvector.md | 81 +++++++++++ docs/zh/features/v11/extensions/smlar.md | 143 ++++++++++++++++++++ 8 files changed, 260 insertions(+), 6 deletions(-) create mode 100644 docs/zh/features/v11/extensions/README.md create mode 100644 docs/zh/features/v11/extensions/pgvector.md create mode 100644 docs/zh/features/v11/extensions/smlar.md diff --git a/docs/.vuepress/configs/navbar/zh.ts b/docs/.vuepress/configs/navbar/zh.ts index 44aeb0c292f..922bb422568 100644 --- a/docs/.vuepress/configs/navbar/zh.ts +++ b/docs/.vuepress/configs/navbar/zh.ts @@ -67,10 +67,10 @@ export const zh: NavbarConfig = [ ], }, { - text: "内核增强功能", + text: "自研功能", children: [ { - text: "文档入口", + text: "功能总览", link: "/zh/features/", }, { @@ -81,6 +81,7 @@ export const zh: NavbarConfig = [ "/zh/features/v11/availability/", "/zh/features/v11/security/", "/zh/features/v11/epq/", + "/zh/features/v11/extensions/", ], }, ], diff --git a/docs/.vuepress/configs/sidebar/zh.ts b/docs/.vuepress/configs/sidebar/zh.ts index dc53cfdbf72..9edd2610577 100644 --- a/docs/.vuepress/configs/sidebar/zh.ts +++ b/docs/.vuepress/configs/sidebar/zh.ts @@ -76,7 +76,7 @@ export const zh: SidebarConfig = { ], "/zh/features": [ { - text: "内核增强功能", + text: "自研功能", link: "/zh/features/", children: [ { @@ -122,6 +122,14 @@ export const zh: SidebarConfig = { "/zh/features/v11/epq/epq-ctas-mtview-bulk-insert.md", ], }, + { + text: "第三方插件", + link: "/zh/features/v11/extensions/", + children: [ + "/zh/features/v11/extensions/pgvector.md", + "/zh/features/v11/extensions/smlar.md", + ], + }, ], }, ], diff --git a/docs/zh/README.md b/docs/zh/README.md index 87de3bec18b..0b9dae41464 100644 --- a/docs/zh/README.md +++ b/docs/zh/README.md @@ -49,12 +49,13 @@ postgres=# SELECT 
version();
diff --git a/docs/zh/features/README.md b/docs/zh/features/README.md index de168df84bd..8af08ef933d 100644 --- a/docs/zh/features/README.md +++ b/docs/zh/features/README.md @@ -1,4 +1,4 @@ -# 内核增强功能 +# 自研功能 - [PolarDB for PostgreSQL 11](./v11/README.md) @@ -118,5 +118,20 @@ / + +第三方插件 +... +... + + +pgvector +/ + + + +smlar +/ + + diff --git a/docs/zh/features/v11/README.md b/docs/zh/features/v11/README.md index 8ac287a04b3..a8e23b49e07 100644 --- a/docs/zh/features/v11/README.md +++ b/docs/zh/features/v11/README.md @@ -1,6 +1,7 @@ -# 内核增强功能 +# 自研功能 - [高性能](./performance/README.md) - [高可用](./availability/README.md) - [安全](./security/README.md) - [弹性跨机并行查询(ePQ)](./epq/README.md) +- [第三方插件](./extensions/README.md) diff --git a/docs/zh/features/v11/extensions/README.md b/docs/zh/features/v11/extensions/README.md new file mode 100644 index 00000000000..dc86fb9811c --- /dev/null +++ b/docs/zh/features/v11/extensions/README.md @@ -0,0 +1,4 @@ +# 第三方插件 + +- [pgvector](./pgvector.md) +- [smlar](./smlar.md) diff --git a/docs/zh/features/v11/extensions/pgvector.md b/docs/zh/features/v11/extensions/pgvector.md new file mode 100644 index 00000000000..cfe4788a065 --- /dev/null +++ b/docs/zh/features/v11/extensions/pgvector.md @@ -0,0 +1,81 @@ +--- +author: 山现 +date: 2023/12/25 +minute: 10 +--- + +# pgvector + + + + + +[[toc]] + +## 背景 + +[`pgvector`](https://github.com/pgvector/pgvector) 作为一款高效的向量数据库插件,基于 PostgreSQL 的扩展机制,利用 C 语言实现了多种向量数据类型和运算算法,同时还能够高效存储与查询以向量表示的 AI Embedding。 + +`pgvector` 支持 IVFFlat 索引。IVFFlat 索引能够将向量空间分为若干个划分区域,每个区域都包含一些向量,并创建倒排索引,用于快速地查找与给定向量相似的向量。IVFFlat 是 IVFADC 索引的简化版本,适用于召回精度要求高,但对查询耗时要求不严格(100ms 级别)的场景。相比其他索引类型,IVFFlat 索引具有高召回率、高精度、算法和参数简单、空间占用小的优势。 + +`pgvector` 插件算法的具体流程如下: + +1. 高维空间中的点基于隐形的聚类属性,按照 K-Means 等聚类算法对向量进行聚类处理,使得每个类簇有一个中心点 +2. 检索向量时首先遍历计算所有类簇的中心点,找到与目标向量最近的 n 个类簇中心 +3. 
遍历计算 n 个类簇中心所在聚类中的所有元素,经过全局排序得到距离最近的 k 个向量 + +## 使用方法 + +`pgvector` 可以顺序检索或索引检索高维向量,关于索引类型和更多参数介绍可以参考插件源代码的 [README](https://github.com/pgvector/pgvector/blob/master/README.md)。 + +### 安装插件 + +```sql:no-line-numbers +CREATE EXTENSION vector; +``` + +### 向量操作 + +执行如下命令,创建一个含有向量字段的表: + +```sql:no-line-numbers +CREATE TABLE t (val vector(3)); +``` + +执行如下命令,可以插入向量数据: + +```sql:no-line-numbers +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +``` + +创建 IVFFlat 类型的索引: + +1. `val vector_ip_ops` 表示需要创建索引的列名为 `val`,并且使用操作符类 `vector_ip_ops`(内积)来计算向量之间的相似度。除此之外,`pgvector` 还提供了 `vector_l2_ops`(欧几里得距离)和 `vector_cosine_ops`(余弦距离)两种操作符类,可根据所需的距离计算方式选用 +2. `WITH (lists = 1)` 表示使用的划分区域数量为 1,这意味着所有向量都将被分配到同一个区域中。在实际应用中,划分区域数量需要根据数据规模和查询性能进行调整 + +```sql:no-line-numbers +CREATE INDEX ON t USING ivfflat (val vector_ip_ops) WITH (lists = 1); +``` + +计算近似向量: + +```sql:no-line-numbers +=> SELECT * FROM t ORDER BY val <#> '[3,3,3]'; + val +--------- + [1,2,3] + [1,1,1] + [0,0,0] + +(4 rows) +``` + +### 卸载插件 + +```sql:no-line-numbers +DROP EXTENSION vector; +``` + +## 注意事项 + +- [ePQ](../epq/README.md) 支持通过排序遍历高维向量,不支持通过索引查询向量类型 diff --git a/docs/zh/features/v11/extensions/smlar.md b/docs/zh/features/v11/extensions/smlar.md new file mode 100644 index 00000000000..7933801c35c --- /dev/null +++ b/docs/zh/features/v11/extensions/smlar.md @@ -0,0 +1,143 @@ +--- +author: 棠羽 +date: 2022/10/05 +minute: 10 +--- + +# smlar + + + + + +[[toc]] + +## 背景 + +对大规模的数据进行相似度计算在电商业务、搜索引擎中是一个很关键的技术问题。相对简易的相似度计算实现不仅运算速度慢,还十分消耗资源。[`smlar`](https://github.com/jirutka/smlar) 是 PostgreSQL 的一款开源第三方插件,提供了可以在数据库内高效计算数据相似度的函数,并提供了支持 GiST 和 GIN 索引的相似度运算符。目前该插件已经支持 PostgreSQL 所有的内置数据类型。 + +::: warning +由于 smlar 插件的 `%` 操作符与 RUM 插件的 `%` 操作符冲突,因此 smlar 与 RUM 两个插件无法同时创建在同一 schema 中。 +::: + +## 函数及运算符介绍 + +- **`float4 smlar(anyarray, anyarray)`** + + 计算两个数组的相似度,数组的数据类型需要一致。 + +- **`float4 smlar(anyarray, anyarray, bool useIntersect)`** + + 计算两个自定义复合类型数组的相似度,`useIntersect` 参数表示是仅让重叠元素参与运算,还是让全部元素参与运算;复合类型可由以下方式定义: + + ```sql:no-line-numbers + CREATE TYPE type_name AS 
(element_name anytype, weight_name FLOAT4); + ``` + +- **`float4 smlar(anyarray a, anyarray b, text formula);`** + + 使用参数给定的公式来计算两个数组的相似度,数组的数据类型需要一致;公式中可以使用的预定义变量有: + + - `N.i`:两个数组中的相同元素个数(交集) + - `N.a`:第一个数组中的唯一元素个数 + - `N.b`:第二个数组中的唯一元素个数 + + ```sql:no-line-numbers + SELECT smlar('{1,4,6}'::int[], '{5,4,6}', 'N.i / sqrt(N.a * N.b)'); + ``` + +- **`anyarray % anyarray`** + + 该运算符的含义为,当两个数组的相似度超过阈值时返回 `TRUE`,否则返回 `FALSE`。 + +- **`text[] tsvector2textarray(tsvector)`** + + 将 `tsvector` 类型转换为字符串数组。 + +- **`anyarray array_unique(anyarray)`** + + 对数组进行排序、去重。 + +- **`float4 inarray(anyarray, anyelement)`** + + 如果元素出现在数组中,则返回 `1.0`;否则返回 `0`。 + +- **`float4 inarray(anyarray, anyelement, float4, float4)`** + + 如果元素出现在数组中,则返回第三个参数;否则返回第四个参数。 + +## 可配置参数说明 + +- **`smlar.threshold FLOAT`** + + 相似度阈值,用于给 `%` 运算符判断两个数组是否相似。 + +- **`smlar.persistent_cache BOOL`** + + 全局统计信息的缓存是否存放在与事务无关的内存中。 + +- **`smlar.type STRING`**:相似度计算公式,可选的相似度类型包含: + + - [cosine](https://en.wikipedia.org/wiki/Cosine_similarity)(默认) + - [tfidf](https://zh.wikipedia.org/zh-cn/Tf-idf) + - [overlap](https://en.wikipedia.org/wiki/Overlap_coefficient) + +- **`smlar.stattable STRING`** + + 存储集合范围统计信息的表名,表定义如下: + + ```sql:no-line-numbers + CREATE TABLE table_name ( + value data_type UNIQUE, + ndoc int4 (or bigint) NOT NULL CHECK (ndoc>0) + ); + ``` + +- **`smlar.tf_method STRING`**:计算词频(TF,Term Frequency)的方法,取值如下 + + - `n`:简单计数(默认) + - `log`:`1 + log(n)` + - `const`:频率等于 `1` + +- **`smlar.idf_plus_one BOOL`**:计算逆文本频率指数(IDF,Inverse Document Frequency)的方法,取值如下 + + - `FALSE`:`log(d / df)`(默认) + - `TRUE`:`log(1 + d / df)` + +## 基本使用方法 + +### 安装插件 + +```sql:no-line-numbers +CREATE EXTENSION smlar; +``` + +### 相似度计算 + +使用上述的函数计算两个数组的相似度: + +```sql +SELECT smlar('{3,2}'::int[], '{3,2,1}'); + smlar +---------- + 0.816497 +(1 row) + +SELECT smlar('{1,4,6}'::int[], '{5,4,6}', 'N.i / (N.a + N.b)' ); + smlar +---------- + 0.333333 +(1 row) +``` + +### 卸载插件 + +```sql:no-line-numbers +DROP EXTENSION smlar; +``` + +## 
原理和设计 + +[GitHub - jirutka/smlar](https://github.com/jirutka/smlar) + +[PGCon 2012 - Finding Similar: Effective similarity search in database](https://www.pgcon.org/2012/schedule/track/Hacking/443.en.html) ([slides](https://www.pgcon.org/2012/schedule/attachments/252_smlar-2012.pdf))