diff --git a/configure b/configure index 762d77c3ede..c1d176928cb 100755 --- a/configure +++ b/configure @@ -7015,6 +7015,306 @@ if test x"$pgac_cv_prog_CLANGXX_cxxflags__fexcess_precision_standard" = x"yes"; BITCODE_CXXFLAGS="${BITCODE_CXXFLAGS} -fexcess-precision=standard" fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CLANG} supports -Xclang -no-opaque-pointers, for BITCODE_CFLAGS" >&5 +$as_echo_n "checking whether ${CLANG} supports -Xclang -no-opaque-pointers, for BITCODE_CFLAGS... " >&6; } +if ${pgac_cv_prog_CLANG_cflags__Xclang__no_opaque_pointers+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +pgac_save_CC=$CC +CC=${CLANG} +CFLAGS="${BITCODE_CFLAGS} -Xclang -no-opaque-pointers" +ac_save_c_werror_flag=$ac_c_werror_flag +ac_c_werror_flag=yes +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_CLANG_cflags__Xclang__no_opaque_pointers=yes +else + pgac_cv_prog_CLANG_cflags__Xclang__no_opaque_pointers=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_c_werror_flag=$ac_save_c_werror_flag +CFLAGS="$pgac_save_CFLAGS" +CC="$pgac_save_CC" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CLANG_cflags__Xclang__no_opaque_pointers" >&5 +$as_echo "$pgac_cv_prog_CLANG_cflags__Xclang__no_opaque_pointers" >&6; } +if test x"$pgac_cv_prog_CLANG_cflags__Xclang__no_opaque_pointers" = x"yes"; then + BITCODE_CFLAGS="${BITCODE_CFLAGS} -Xclang -no-opaque-pointers" +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CLANGXX} supports -Xclang -no-opaque-pointers, for BITCODE_CXXFLAGS" >&5 +$as_echo_n "checking whether ${CLANGXX} supports -Xclang -no-opaque-pointers, for BITCODE_CXXFLAGS... " >&6; } +if ${pgac_cv_prog_CLANGXX_cxxflags__Xclang__no_opaque_pointers+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CXXFLAGS=$CXXFLAGS +pgac_save_CXX=$CXX +CXX=${CLANGXX} +CXXFLAGS="${BITCODE_CXXFLAGS} -Xclang -no-opaque-pointers" +ac_save_cxx_werror_flag=$ac_cxx_werror_flag +ac_cxx_werror_flag=yes +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + pgac_cv_prog_CLANGXX_cxxflags__Xclang__no_opaque_pointers=yes +else + pgac_cv_prog_CLANGXX_cxxflags__Xclang__no_opaque_pointers=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_cxx_werror_flag=$ac_save_cxx_werror_flag +CXXFLAGS="$pgac_save_CXXFLAGS" +CXX="$pgac_save_CXX" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CLANGXX_cxxflags__Xclang__no_opaque_pointers" >&5 +$as_echo "$pgac_cv_prog_CLANGXX_cxxflags__Xclang__no_opaque_pointers" >&6; } +if test x"$pgac_cv_prog_CLANGXX_cxxflags__Xclang__no_opaque_pointers" = x"yes"; then + BITCODE_CXXFLAGS="${BITCODE_CXXFLAGS} -Xclang -no-opaque-pointers" +fi + + + NOT_THE_CFLAGS="" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CLANG} supports -Wunused-command-line-argument, for NOT_THE_CFLAGS" >&5 +$as_echo_n "checking whether ${CLANG} supports -Wunused-command-line-argument, for NOT_THE_CFLAGS... " >&6; } +if ${pgac_cv_prog_CLANG_cflags__Wunused_command_line_argument+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +pgac_save_CC=$CC +CC=${CLANG} +CFLAGS="${NOT_THE_CFLAGS} -Wunused-command-line-argument" +ac_save_c_werror_flag=$ac_c_werror_flag +ac_c_werror_flag=yes +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_CLANG_cflags__Wunused_command_line_argument=yes +else + pgac_cv_prog_CLANG_cflags__Wunused_command_line_argument=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_c_werror_flag=$ac_save_c_werror_flag +CFLAGS="$pgac_save_CFLAGS" +CC="$pgac_save_CC" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CLANG_cflags__Wunused_command_line_argument" >&5 +$as_echo "$pgac_cv_prog_CLANG_cflags__Wunused_command_line_argument" >&6; } +if test x"$pgac_cv_prog_CLANG_cflags__Wunused_command_line_argument" = x"yes"; then + NOT_THE_CFLAGS="${NOT_THE_CFLAGS} -Wunused-command-line-argument" +fi + + if test -n "$NOT_THE_CFLAGS"; then + BITCODE_CFLAGS="$BITCODE_CFLAGS -Wno-unused-command-line-argument" + fi + NOT_THE_CFLAGS="" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CLANG} supports -Wcompound-token-split-by-macro, for NOT_THE_CFLAGS" >&5 +$as_echo_n "checking whether ${CLANG} supports -Wcompound-token-split-by-macro, for NOT_THE_CFLAGS... " >&6; } +if ${pgac_cv_prog_CLANG_cflags__Wcompound_token_split_by_macro+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +pgac_save_CC=$CC +CC=${CLANG} +CFLAGS="${NOT_THE_CFLAGS} -Wcompound-token-split-by-macro" +ac_save_c_werror_flag=$ac_c_werror_flag +ac_c_werror_flag=yes +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_CLANG_cflags__Wcompound_token_split_by_macro=yes +else + pgac_cv_prog_CLANG_cflags__Wcompound_token_split_by_macro=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_c_werror_flag=$ac_save_c_werror_flag +CFLAGS="$pgac_save_CFLAGS" +CC="$pgac_save_CC" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CLANG_cflags__Wcompound_token_split_by_macro" >&5 +$as_echo "$pgac_cv_prog_CLANG_cflags__Wcompound_token_split_by_macro" >&6; } +if test x"$pgac_cv_prog_CLANG_cflags__Wcompound_token_split_by_macro" = x"yes"; then + NOT_THE_CFLAGS="${NOT_THE_CFLAGS} -Wcompound-token-split-by-macro" +fi + + if test -n "$NOT_THE_CFLAGS"; then + BITCODE_CFLAGS="$BITCODE_CFLAGS -Wno-compound-token-split-by-macro" + fi + NOT_THE_CFLAGS="" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CLANG} supports -Wdeprecated-non-prototype, for NOT_THE_CFLAGS" >&5 +$as_echo_n "checking whether ${CLANG} supports -Wdeprecated-non-prototype, for NOT_THE_CFLAGS... " >&6; } +if ${pgac_cv_prog_CLANG_cflags__Wdeprecated_non_prototype+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +pgac_save_CC=$CC +CC=${CLANG} +CFLAGS="${NOT_THE_CFLAGS} -Wdeprecated-non-prototype" +ac_save_c_werror_flag=$ac_c_werror_flag +ac_c_werror_flag=yes +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_CLANG_cflags__Wdeprecated_non_prototype=yes +else + pgac_cv_prog_CLANG_cflags__Wdeprecated_non_prototype=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_c_werror_flag=$ac_save_c_werror_flag +CFLAGS="$pgac_save_CFLAGS" +CC="$pgac_save_CC" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CLANG_cflags__Wdeprecated_non_prototype" >&5 +$as_echo "$pgac_cv_prog_CLANG_cflags__Wdeprecated_non_prototype" >&6; } +if test x"$pgac_cv_prog_CLANG_cflags__Wdeprecated_non_prototype" = x"yes"; then + NOT_THE_CFLAGS="${NOT_THE_CFLAGS} -Wdeprecated-non-prototype" +fi + + if test -n "$NOT_THE_CFLAGS"; then + BITCODE_CFLAGS="$BITCODE_CFLAGS -Wno-deprecated-non-prototype" + fi + NOT_THE_CFLAGS="" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CLANG} supports -Wformat-truncation, for NOT_THE_CFLAGS" >&5 +$as_echo_n "checking whether ${CLANG} supports -Wformat-truncation, for NOT_THE_CFLAGS... " >&6; } +if ${pgac_cv_prog_CLANG_cflags__Wformat_truncation+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +pgac_save_CC=$CC +CC=${CLANG} +CFLAGS="${NOT_THE_CFLAGS} -Wformat-truncation" +ac_save_c_werror_flag=$ac_c_werror_flag +ac_c_werror_flag=yes +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_CLANG_cflags__Wformat_truncation=yes +else + pgac_cv_prog_CLANG_cflags__Wformat_truncation=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_c_werror_flag=$ac_save_c_werror_flag +CFLAGS="$pgac_save_CFLAGS" +CC="$pgac_save_CC" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CLANG_cflags__Wformat_truncation" >&5 +$as_echo "$pgac_cv_prog_CLANG_cflags__Wformat_truncation" >&6; } +if test x"$pgac_cv_prog_CLANG_cflags__Wformat_truncation" = x"yes"; then + NOT_THE_CFLAGS="${NOT_THE_CFLAGS} -Wformat-truncation" +fi + + if test -n "$NOT_THE_CFLAGS"; then + BITCODE_CFLAGS="$BITCODE_CFLAGS -Wno-format-truncation" + fi + NOT_THE_CFLAGS="" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${CLANG} supports -Wstringop-truncation, for NOT_THE_CFLAGS" >&5 +$as_echo_n "checking whether ${CLANG} supports -Wstringop-truncation, for NOT_THE_CFLAGS... " >&6; } +if ${pgac_cv_prog_CLANG_cflags__Wstringop_truncation+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +pgac_save_CC=$CC +CC=${CLANG} +CFLAGS="${NOT_THE_CFLAGS} -Wstringop-truncation" +ac_save_c_werror_flag=$ac_c_werror_flag +ac_c_werror_flag=yes +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_prog_CLANG_cflags__Wstringop_truncation=yes +else + pgac_cv_prog_CLANG_cflags__Wstringop_truncation=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_c_werror_flag=$ac_save_c_werror_flag +CFLAGS="$pgac_save_CFLAGS" +CC="$pgac_save_CC" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_prog_CLANG_cflags__Wstringop_truncation" >&5 +$as_echo "$pgac_cv_prog_CLANG_cflags__Wstringop_truncation" >&6; } +if test x"$pgac_cv_prog_CLANG_cflags__Wstringop_truncation" = x"yes"; then + NOT_THE_CFLAGS="${NOT_THE_CFLAGS} -Wstringop-truncation" +fi + + if test -n "$NOT_THE_CFLAGS"; then + BITCODE_CFLAGS="$BITCODE_CFLAGS -Wno-stringop-truncation" + fi fi # supply -g if --enable-debug diff --git a/configure.in b/configure.in index 39de633b751..b897445ece6 100644 --- a/configure.in +++ b/configure.in @@ -589,6 +589,35 @@ if test "$with_llvm" = yes ; then PGAC_PROG_VARCXX_VARFLAGS_OPT(CLANGXX, BITCODE_CXXFLAGS, [-fwrapv]) PGAC_PROG_VARCC_VARFLAGS_OPT(CLANG, BITCODE_CFLAGS, [-fexcess-precision=standard]) PGAC_PROG_VARCXX_VARFLAGS_OPT(CLANGXX, BITCODE_CXXFLAGS, [-fexcess-precision=standard]) + + PGAC_PROG_VARCC_VARFLAGS_OPT(CLANG, BITCODE_CFLAGS, [-Xclang -no-opaque-pointers]) + PGAC_PROG_VARCXX_VARFLAGS_OPT(CLANGXX, BITCODE_CXXFLAGS, [-Xclang -no-opaque-pointers]) + + NOT_THE_CFLAGS="" + PGAC_PROG_VARCC_VARFLAGS_OPT(CLANG, NOT_THE_CFLAGS, [-Wunused-command-line-argument]) + if test -n "$NOT_THE_CFLAGS"; then + BITCODE_CFLAGS="$BITCODE_CFLAGS -Wno-unused-command-line-argument" + fi + NOT_THE_CFLAGS="" + PGAC_PROG_VARCC_VARFLAGS_OPT(CLANG, NOT_THE_CFLAGS, [-Wcompound-token-split-by-macro]) + if test -n "$NOT_THE_CFLAGS"; then + BITCODE_CFLAGS="$BITCODE_CFLAGS -Wno-compound-token-split-by-macro" + fi + NOT_THE_CFLAGS="" + PGAC_PROG_VARCC_VARFLAGS_OPT(CLANG, NOT_THE_CFLAGS, [-Wdeprecated-non-prototype]) + if test -n "$NOT_THE_CFLAGS"; then + BITCODE_CFLAGS="$BITCODE_CFLAGS -Wno-deprecated-non-prototype" + fi + NOT_THE_CFLAGS="" + PGAC_PROG_VARCC_VARFLAGS_OPT(CLANG, NOT_THE_CFLAGS, [-Wformat-truncation]) + if test -n "$NOT_THE_CFLAGS"; then + 
BITCODE_CFLAGS="$BITCODE_CFLAGS -Wno-format-truncation" + fi + NOT_THE_CFLAGS="" + PGAC_PROG_VARCC_VARFLAGS_OPT(CLANG, NOT_THE_CFLAGS, [-Wstringop-truncation]) + if test -n "$NOT_THE_CFLAGS"; then + BITCODE_CFLAGS="$BITCODE_CFLAGS -Wno-stringop-truncation" + fi fi # supply -g if --enable-debug diff --git a/doc/src/sgml/parallel.sgml b/doc/src/sgml/parallel.sgml index 6c30916655d..8f98b71ffff 100644 --- a/doc/src/sgml/parallel.sgml +++ b/doc/src/sgml/parallel.sgml @@ -136,7 +136,7 @@ EXPLAIN SELECT * FROM pgbench_accounts WHERE filler LIKE '%x%'; In addition, the system must not be running in single-user mode. Since - the entire database system is running in single process in this situation, + the entire database system is running as a single process in this situation, no background workers will be available. diff --git a/doc/src/sgml/ref/alter_table.sgml b/doc/src/sgml/ref/alter_table.sgml index 5c4685addd7..09bb3944004 100644 --- a/doc/src/sgml/ref/alter_table.sgml +++ b/doc/src/sgml/ref/alter_table.sgml @@ -899,7 +899,7 @@ WITH ( MODULUS numeric_literal, REM constraint. This does not work, however, if any of the partition keys is an expression and the partition does not accept NULL values. If attaching a list partition that will - not accept NULL values, also add + not accept NULL values, also add a NOT NULL constraint to the partition key column, unless it's an expression. diff --git a/doc/src/sgml/sources.sgml b/doc/src/sgml/sources.sgml index 1a3d905ab7e..7d1d16efd08 100644 --- a/doc/src/sgml/sources.sgml +++ b/doc/src/sgml/sources.sgml @@ -886,9 +886,9 @@ BETTER: unrecognized node type: 42 Function-Like Macros and Inline Functions - Both, macros with arguments and static inline - functions, may be used. The latter are preferable if there are - multiple-evaluation hazards when written as a macro, as e.g. the + Both macros with arguments and static inline + functions may be used. The latter are preferable if there are + multiple-evaluation hazards when written as a macro, as e.g., the case with #define Max(x, y) ((x) > (y) ? 
(x) : (y)) diff --git a/docs/.vuepress/configs/navbar/zh.ts b/docs/.vuepress/configs/navbar/zh.ts index 2480e2a7469..44aeb0c292f 100644 --- a/docs/.vuepress/configs/navbar/zh.ts +++ b/docs/.vuepress/configs/navbar/zh.ts @@ -67,32 +67,20 @@ export const zh: NavbarConfig = [ ], }, { - text: "内核功能增强", + text: "内核增强功能", children: [ { text: "文档入口", - link: "/zh/features/v11/", + link: "/zh/features/", }, { text: "PolarDB for PostgreSQL 11", link: "/zh/features/v11/", children: [ - { - text: "高性能", - link: "/zh/features/v11/performance/", - }, - { - text: "高可用", - link: "/zh/features/v11/availability/", - }, - { - text: "安全", - link: "/zh/features/v11/security/", - }, - { - text: "HTAP", - link: "/zh/features/v11/htap/", - }, + "/zh/features/v11/performance/", + "/zh/features/v11/availability/", + "/zh/features/v11/security/", + "/zh/features/v11/epq/", ], }, ], diff --git a/docs/.vuepress/configs/sidebar/zh.ts b/docs/.vuepress/configs/sidebar/zh.ts index e38f609e81a..dc53cfdbf72 100644 --- a/docs/.vuepress/configs/sidebar/zh.ts +++ b/docs/.vuepress/configs/sidebar/zh.ts @@ -76,8 +76,8 @@ export const zh: SidebarConfig = { ], "/zh/features": [ { - text: "内核功能增强", - link: "/zh/features/v11/", + text: "内核增强功能", + link: "/zh/features/", children: [ { text: "PolarDB for PostgreSQL 11", @@ -86,18 +86,41 @@ export const zh: SidebarConfig = { { text: "高性能", link: "/zh/features/v11/performance/", + children: [ + "/zh/features/v11/performance/bulk-read-and-extend.md", + "/zh/features/v11/performance/rel-size-cache.md", + "/zh/features/v11/performance/shared-server.md", + ], }, { text: "高可用", link: "/zh/features/v11/availability/", + children: [ + "/zh/features/v11/availability/avail-online-promote.md", + "/zh/features/v11/availability/avail-parallel-replay.md", + "/zh/features/v11/availability/datamax.md", + "/zh/features/v11/availability/resource-manager.md", + "/zh/features/v11/availability/flashback-table.md", + ], }, { text: "安全", link: "/zh/features/v11/security/", + children: ["/zh/features/v11/security/tde.md"], }, { - text: "HTAP", - link: "/zh/features/v11/htap/", + text: "弹性跨机并行查询(ePQ)", + link: "/zh/features/v11/epq/", + children: [ + "/zh/features/v11/epq/epq-explain-analyze.md", + "/zh/features/v11/epq/epq-node-and-dop.md", + "/zh/features/v11/epq/epq-partitioned-table.md", + "/zh/features/v11/epq/epq-create-btree-index.md", + "/zh/features/v11/epq/cluster-info.md", + "/zh/features/v11/epq/adaptive-scan.md", + "/zh/features/v11/epq/parallel-dml.md", + "/zh/features/v11/epq/epq-ctas-mtview-bulk-insert.md", + ], }, ], }, diff --git a/docs/.vuepress/styles/index.scss b/docs/.vuepress/styles/index.scss index 24830785500..299f7dc6a29 100644 --- a/docs/.vuepress/styles/index.scss +++ b/docs/.vuepress/styles/index.scss @@ -6,6 +6,8 @@ --c-brand-light: #fc5207; --c-tip: #fc5207; + + --content-width: 1020px; } html.dark { diff --git a/docs/zh/README.md b/docs/zh/README.md index 9570116d01d..87de3bec18b 100644 --- a/docs/zh/README.md +++ b/docs/zh/README.md @@ -49,12 +49,12 @@ postgres=# SELECT version();
-内核功能增强
+内核增强功能
diff --git a/docs/zh/features/README.md b/docs/zh/features/README.md new file mode 100644 index 00000000000..de168df84bd --- /dev/null +++ b/docs/zh/features/README.md @@ -0,0 +1,122 @@ +# 内核增强功能 + +- [PolarDB for PostgreSQL 11](./v11/README.md) + +## 功能 / 版本映射矩阵 +
+| 功能 / 版本                                   | PostgreSQL | PolarDB for PostgreSQL 11 |
+| --------------------------------------------- | ---------- | ------------------------- |
+| **高性能**                                    | ...        | ...                       |
+| 预读 / 预扩展                                 | /          | ✓                         |
+| 表大小缓存                                    | /          | ✓                         |
+| Shared Server                                 | /          | ✓                         |
+| **高可用**                                    | ...        | ...                       |
+| 只读节点 Online Promote                       | /          | ✓                         |
+| WAL 日志并行回放                              | /          | ✓                         |
+| DataMax 日志节点                              | /          | ✓                         |
+| Resource Manager                              | /          | ✓                         |
+| 闪回表和闪回日志                              | /          | ✓                         |
+| **安全**                                      | ...        | ...                       |
+| 透明数据加密                                  | /          | ✓                         |
+| **弹性跨机并行查询(ePQ)**                   | ...        | ...                       |
+| ePQ 执行计划查看与分析                        | /          | ✓                         |
+| ePQ 计算节点范围选择与并行度控制              | /          | ✓                         |
+| ePQ 支持分区表查询                            | /          | ✓                         |
+| ePQ 支持创建 B-Tree 索引并行加速              | /          | ✓                         |
+| 集群拓扑视图                                  | /          | ✓                         |
+| 自适应扫描                                    | /          | ✓                         |
+| 并行 INSERT                                   | /          | ✓                         |
+| ePQ 支持创建/刷新物化视图并行加速和批量写入   | /          | ✓                         |
diff --git a/docs/zh/features/v11/README.md b/docs/zh/features/v11/README.md index 439ff054f0e..8ac287a04b3 100644 --- a/docs/zh/features/v11/README.md +++ b/docs/zh/features/v11/README.md @@ -1,6 +1,6 @@ -# 内核功能增强 +# 内核增强功能 - [高性能](./performance/README.md) - [高可用](./availability/README.md) - [安全](./security/README.md) -- [HTAP](./htap/README.md) +- [弹性跨机并行查询(ePQ)](./epq/README.md) diff --git a/docs/zh/features/v11/availability/README.md b/docs/zh/features/v11/availability/README.md index a9ddd5610a4..ce6875f93a7 100644 --- a/docs/zh/features/v11/availability/README.md +++ b/docs/zh/features/v11/availability/README.md @@ -1,6 +1,7 @@ # 高可用 - [只读节点 Online Promote](./avail-online-promote.md) +- [WAL 日志并行回放](./avail-parallel-replay.md) - [DataMax 日志节点](./datamax.md) - [Resource Manager](./resource-manager.md) - [闪回表和闪回日志](./flashback-table.md) diff --git a/docs/zh/features/v11/availability/avail-parallel-replay.md b/docs/zh/features/v11/availability/avail-parallel-replay.md new file mode 100644 index 00000000000..aeb8d86f1e8 --- /dev/null +++ b/docs/zh/features/v11/availability/avail-parallel-replay.md @@ -0,0 +1,146 @@ +--- +author: 学弈 +date: 2022/09/20 +minute: 30 +--- + +# WAL 日志并行回放 + + + + + +[[toc]] + +## 背景 + +在 PolarDB for PostgreSQL 的一写多读架构下,只读节点(Replica 节点)运行过程中,LogIndex 后台回放进程(LogIndex Background Worker)和会话进程(Backend)分别使用 LogIndex 数据在不同的 Buffer 上回放 WAL 日志,本质上达到了一种并行回放 WAL 日志的效果。 + +鉴于 WAL 日志回放在 PolarDB 集群的高可用中起到至关重要的作用,将并行回放 WAL 日志的思想用到常规的日志回放路径上,是一种很好的优化思路。 + +并行回放 WAL 日志至少可以在以下三个场景下发挥优势: + +1. 主库节点、只读节点以及备库节点崩溃恢复(Crash Recovery)的过程; +2. 只读节点 LogIndex BGW 进程持续回放 WAL 日志的过程; +3. 备库节点 Startup 进程持续回放 WAL 日志的过程。 + +## 术语 + +- Block:数据块 +- WAL:Write-Ahead Logging,预写日志 +- Task Node:并行执行框架中的子任务执行节点,可以接收并执行一个子任务 +- Task Tag:子任务的分类标识,同一类的子任务执行顺序有先后关系 +- Hold List:并行执行框架中,每个子进程调度执行回放子任务所使用的链表 + +## 原理 + +### 概述 + +一条 WAL 日志可能修改多个数据块 Block,因此可以使用如下定义来表示 WAL 日志的回放过程: + +- 假设第 `i` 条 WAL 日志 LSN 为 $LSN_i$,其修改了 `m` 个数据块,则定义第 `i` 条 WAL 日志修改的数据块列表 $Block_i = [Block_{i,0}, Block_{i,1}, ..., Block_{i,m-1}]$; +- 定义最小的回放子任务为 $Task_{i,j} = \{LSN_i \rightarrow Block_{i,j}\}$,表示在数据块 $Block_{i,j}$ 上回放第 `i` 条 WAL 日志; +- 因此,一条修改了 `k` 个 Block 的 WAL 日志就可以表示成 `k` 个回放子任务的集合:$TASK_{i,*} = [Task_{i,0}, Task_{i,1}, ..., Task_{i,k-1}]$; +- 进而,多条 WAL 日志就可以表示成一系列回放子任务的集合:$TASK_{*,*} = [Task_{0,*}, Task_{1,*}, ..., Task_{N,*}]$; + +在日志回放子任务集合 $TASK_{*,*}$ 中,每个子任务的执行,有时并不依赖于前序子任务的执行结果。假设回放子任务集合如下:$TASK_{*,*} = [Task_{0,*}, Task_{1,*}, Task_{2,*}]$,其中: + +- $Task_{0,*}=[Task_{0,0}, Task_{0,1}, Task_{0,2}]$ +- $Task_{1,*}=[Task_{1,0}, Task_{1,1}]$ +- $Task_{2,*}=[Task_{2,0}]$ + +并且 $Block_{0,0} = Block_{1,0}$,$Block_{0,1} = Block_{1,1}$,$Block_{0,2} = Block_{2,0}$ + +则可以并行回放的子任务集合有三个:$[Task_{0,0},Task_{1,0}]$、$[Task_{0,1},Task_{1,1}]$、$[Task_{0,2},Task_{2,0}]$ + +综上所述,在整个 WAL 日志所表示的回放子任务集合中,存在很多子任务序列可以并行执行,而且不会影响最终回放结果的一致性。PolarDB 借助这种思想,提出了一种并行任务执行框架,并成功运用到了 WAL 日志回放的过程中。 + +### 并行任务执行框架 + +将一段共享内存根据并发进程数目进行等分,每一段作为一个环形队列,分配给一个进程。通过配置参数设定每个环形队列的深度: + +![image.png](../../../imgs/pr_parallel_execute_1.png) + +- Dispatcher 进程 + - 通过将任务分发给指定的进程来控制并发调度; + - 负责将进程执行完的任务从队列中删除; +- 进程组 + - 组内每一个进程从相应的环形队列中获取需要执行的任务,根据任务的状态决定是否执行。 + +![image.png](../../../imgs/pr_parallel_execute_2.png) + +#### 任务 + +环形队列的内容由 Task Node 组成,每个 Task Node 包含五个状态:Idle、Running、Hold、Finished、Removed。 + +- `Idle`:表示该 Task Node 未分配任务; +- `Running`:表示该 Task Node 已经分配任务,正在等待进程执行,或已经在执行; +- `Hold`:表示该 Task Node 有前向依赖的任务,需要等待依赖的任务执行完再执行; +- `Finished`:表示进程组中的进程已经执行完该任务; +- `Removed`:当 Dispatcher 进程发现一个任务的状态已经为 `Finished`,那么该任务所有的前置依赖任务也都应该为 `Finished` 状态,`Removed` 状态表示
Dispatcher 进程已经将该任务以及该任务所有前置任务都从管理结构体中删除;可以通过该机制保证 Dispatcher 进程按顺序处理有依赖关系的任务执行结果。 + +![image.png](../../../imgs/pr_parallel_execute_task.png) + +上述状态机的状态转移过程中,黑色线标识的状态转移过程在 Dispatcher 进程中完成,橙色线标识的状态转移过程在并行回放进程组中完成。 + +#### Dispatcher 进程 + +Dispatcher 进程有三个关键数据结构:Task HashMap、Task Running Queue 以及 Task Idle Nodes。 + +- **Task HashMap** 负责记录 Task Tag 和相应的执行任务列表的 hash 映射关系: + - 每个任务有一个指定的 Task Tag,如果两个任务间存在依赖关系,则它们的 Task Tag 相同; + - 在分发任务时,如果一个 Task Node 存在前置依赖任务,则状态标识为 `Hold`,需等待前置任务先执行。 +- **Task Running Queue** 负责记录当前正在执行的任务; +- **Task Idle Nodes** 负责记录进程组中不同进程,当前处于 `Idle` 状态的 Task Node; + +Dispatcher 调度策略如下: + +- 如果要执行的 Task Node 有相同 Task Tag 的任务在执行,则优先将该 Task Node 分配到该 Task Tag 链表最后一个 Task Node 所在的执行进程;目的是让有依赖关系的任务尽量被同一个进程执行,减少进程间同步的开销; +- 如果期望优先分配的进程队列已满,或者没有相同的 Task Tag 在执行,则在进程组中按顺序选择一个进程,从中获取状态为 `Idle` 的 Task Node 来调度任务执行;目的是让任务尽量平均分配到不同的进程进行执行。 + +![image.png](../../../imgs/pr_parallel_execute_dispatcher.png) + +#### 进程组 + +该并行执行针对的是相同类型的任务,它们具有相同的 Task Node 数据结构;在进程组初始化时配置 `SchedContext`,指定负责执行具体任务的函数指针: + +- `TaskStartup` 表示进程执行任务前需要进行的初始化动作 +- `TaskHandler` 根据传入的 Task Node,负责执行具体的任务 +- `TaskCleanup` 表示执行进程退出前需要执行的回收动作 + +![image.png](../../../imgs/pr_parallel_execute_procs_1.png) + +进程组中的进程从环形队列中获取一个 Task Node,如果 Task Node 当前的状态是 `Hold`,则将该 Task Node 插入到 Hold List 的尾部;如果 Task Node 的状态为 `Running`,则调用 `TaskHandler` 执行;如果 `TaskHandler` 执行失败,则设置该 Task Node 重新执行需要等待调用的次数,默认为 3,将该 Task Node 插入到 Hold List 的头部。 + +![image.png](../../../imgs/pr_parallel_execute_procs_2.png) + +进程优先从 Hold List 头部搜索,获取可执行的 Task;如果 Task 状态为 `Running`,且等待调用次数为 0,则执行该 Task;如果 Task 状态为 `Running`,但等待调用次数大于 0,则将等待调用次数减去 1。 + +![image.png](../../../imgs/pr_parallel_execute_procs_3.png) + +### WAL 日志并行回放 + +根据 LogIndex 章节介绍,LogIndex 数据中记录了 WAL 日志和其修改的数据块之间的对应关系,而且 LogIndex 数据支持使用 LSN 进行检索,鉴于此,PolarDB 数据库在 Standby 节点持续回放 WAL 日志过程中,引入了上述并行任务执行框架,并结合 LogIndex 数据将 WAL 日志的回放任务并行化,提高了 Standby 节点数据同步的速度。 + +#### 工作流程 + +- Startup 进程:解析 WAL 日志后,仅构建 LogIndex 数据而不真正回放 WAL 日志; +- LogIndex BGW 后台回放进程:成为上述并行任务执行框架的 Dispatcher 进程,利用 LSN 来检索 LogIndex 数据,构建日志回放的子任务,并分配给并行回放进程组; +- 并行回放进程组内的进程:执行日志回放子任务,对数据块执行单个日志的回放操作; +- Backend 进程:主动读取数据块时,根据 PageTag 来检索 LogIndex 数据,获得修改该数据块的 LSN 日志链表,对数据块执行完整日志链的回放操作。 + +![image.png](../../../imgs/pr_parallel_replay_1.png) + +- Dispatcher 进程利用 LSN 来检索 LogIndex 数据,按 LogIndex 插入顺序枚举 PageTag 和对应 LSN,构建 $\{LSN \rightarrow PageTag\}$,组成相应的 Task Node; +- PageTag 作为 Task Node 的 Task Tag; +- 将枚举组成的 Task Node 分发给并行执行框架中进程组的子进程进行回放; + +![image.png](../../../imgs/pr_parallel_replay_2.png) + +## 使用方法 + +在 Standby 节点的 `postgresql.conf` 中添加以下参数开启功能: + +```ini:no-line-numbers +polar_enable_parallel_replay_standby_mode = ON +``` diff --git a/docs/zh/features/v11/epq/README.md b/docs/zh/features/v11/epq/README.md new file mode 100644 index 00000000000..423f73e514e --- /dev/null +++ b/docs/zh/features/v11/epq/README.md @@ -0,0 +1,10 @@ +# 弹性跨机并行查询(ePQ) + +- [ePQ 执行计划查看与分析](./epq-explain-analyze.md) +- [ePQ 计算节点范围选择与并行度控制](./epq-node-and-dop.md) +- [ePQ 支持分区表查询](./epq-partitioned-table.md) +- [ePQ 支持创建 B-Tree 索引并行加速](./epq-create-btree-index.md) +- [集群拓扑视图](./cluster-info.md) +- [自适应扫描](./adaptive-scan.md) +- [并行 INSERT](./parallel-dml.md) +- [ePQ 支持创建/刷新物化视图并行加速和批量写入](./epq-ctas-mtview-bulk-insert.md) diff --git a/docs/zh/features/v11/htap/adaptive-scan.md b/docs/zh/features/v11/epq/adaptive-scan.md similarity index 72% rename from docs/zh/features/v11/htap/adaptive-scan.md rename to docs/zh/features/v11/epq/adaptive-scan.md index 5c85396a73f..4a62f72ab02 100644 --- a/docs/zh/features/v11/htap/adaptive-scan.md +++
b/docs/zh/features/v11/epq/adaptive-scan.md @@ -14,26 +14,26 @@ minute: 25 ## 背景介绍 -PolarDB for PostgreSQL 提供了一款强大的分析型查询引擎——PX(Parallel eXecution),通过利用集群中多个节点的计算能力,来实现跨节点的并行查询功能。PX 可以支持顺序扫描、索引扫描等多种物理算子的跨节点并行化。其中,对顺序扫描算子,PX 提供了两种扫描模式,分别为 **自适应扫描模式** 与 **非自适应扫描模式**。 +PolarDB for PostgreSQL 支持 ePQ 弹性跨机并行查询特性,通过利用集群中多个节点的计算能力,来实现跨节点的并行查询功能。ePQ 可以支持顺序扫描、索引扫描等多种物理算子的跨节点并行化。其中,对顺序扫描算子,ePQ 提供了两种扫描模式,分别为 **自适应扫描模式** 与 **非自适应扫描模式**。 ## 术语 -- QC:Query Coordinator,发起 PX 并行查询的进程角色。 -- PX Worker:参与 PX 跨节点并行查询的工作进程角色。 +- QC:Query Coordinator,发起 ePQ 并行查询的进程角色。 +- PX Worker:参与 ePQ 跨节点并行查询的工作进程角色。 - Worker ID:唯一标识一个 PX Worker 的编号。 -- Disk Unit ID:PX 跨节点并行扫描的最小存储单元,默认为 4MB 大小。 +- Disk Unit ID:ePQ 跨节点并行扫描的最小存储单元,默认为 4MB 大小。 ## 功能介绍 ### 非自适应扫描 -非自适应扫描模式是 PX 顺序扫描算子(Sequential Scan)的默认扫描方式。每一个参与并行查询的 PX Worker 在执行过程中都会被分配一个唯一的 Worker ID。非自适应扫描模式将会依据 Worker ID 划分数据表在物理存储上的 Disk Unit ID,从而实现每个 PX Worker 可以均匀扫描数据表在共享存储上的存储单元,所有 PX Worker 的扫描结果最终汇总形成全量的数据。 +非自适应扫描模式是 ePQ 顺序扫描算子(Sequential Scan)的默认扫描方式。每一个参与并行查询的 PX Worker 在执行过程中都会被分配一个唯一的 Worker ID。非自适应扫描模式将会依据 Worker ID 划分数据表在物理存储上的 Disk Unit ID,从而实现每个 PX Worker 可以均匀扫描数据表在共享存储上的存储单元,所有 PX Worker 的扫描结果最终汇总形成全量的数据。 ### 自适应扫描 在非自适应扫描模式下,扫描单元会均匀划分给每个 PX Worker。当存在个别只读节点计算资源不足的情况下,可能会导致扫描过程发生计算倾斜:用户发起的单次并行查询迟迟不能完成,查询受限于计算资源不足的节点长时间不能完成扫描任务。 -PX 提供的自适应扫描模式可以解决这个问题。自适应扫描模式不再限定每个 PX Worker 扫描特定的 Disk Unit ID,而是采用 **请求-响应(Request-Response)模式**,通过 QC 进程与 PX Worker 进程之间的特定 RPC 通信机制,由 QC 进程负责告知每个 PX Worker 进程可以执行的扫描任务,从而消除计算倾斜的问题。 +ePQ 提供的自适应扫描模式可以解决这个问题。自适应扫描模式不再限定每个 PX Worker 扫描特定的 Disk Unit ID,而是采用 **请求-响应(Request-Response)模式**,通过 QC 进程与 PX Worker 进程之间的特定 RPC 通信机制,由 QC 进程负责告知每个 PX Worker 进程可以执行的扫描任务,从而消除计算倾斜的问题。 ## 功能设计 @@ -65,7 +65,7 @@ PX Worker 进程在执行顺序扫描算子时,会首先向 QC 进程发起询 #### 可变颗粒度 -为了减少请求带来的网络交互次数,PX 实现了可变的任务颗粒度。当扫描任务量剩余较多时,PX Worker 进程单次领取的扫描物理块数较多;当扫描任务量剩余较少时,PX Worker 进程单次领取的扫描物理块数相应减少。通过这种方法,可以平衡 **网络开销** 与 **负载均衡** 两者之间的关系。 +为了减少请求带来的网络交互次数,ePQ 实现了可变的任务颗粒度。当扫描任务量剩余较多时,PX Worker 进程单次领取的扫描物理块数较多;当扫描任务量剩余较少时,PX Worker 进程单次领取的扫描物理块数相应减少。通过这种方法,可以平衡 **网络开销** 与 **负载均衡** 两者之间的关系。 #### 缓存友好 @@ -105,7 +105,7 @@ INSERT 0 100 ### 非自适应扫描 -开启 PX 并行查询功能,并设置单节点并发度为 3。通过 `EXPLAIN` 可以看到执行计划来自 PX 优化器。由于参与测试的只读节点有两个,所以从执行计划中可以看到整体并发度为 6。 +开启 ePQ 并行查询功能,并设置单节点并发度为 3。通过 `EXPLAIN` 可以看到执行计划来自 PX 优化器。由于参与测试的只读节点有两个,所以从执行计划中可以看到整体并发度为 6。 ```sql postgres=# SET polar_enable_px = 1; diff --git a/docs/zh/features/v11/epq/cluster-info.md b/docs/zh/features/v11/epq/cluster-info.md new file mode 100644 index 00000000000..d36ea6c17f9 --- /dev/null +++ b/docs/zh/features/v11/epq/cluster-info.md @@ -0,0 +1,124 @@ +--- +author: 烛远 +date: 2022/09/20 +minute: 20 +--- + +# 集群拓扑视图 + + + + + +[[toc]] + +## 功能介绍 + +PolarDB for PostgreSQL 的 ePQ 弹性跨机并行查询功能可以将一个大查询分散到多个节点上执行,从而加快查询速度。该功能会涉及到各个节点之间的通信,包括执行计划的分发、执行的控制、结果的获取等等。因此设计了 **集群拓扑视图** 功能,用于为 ePQ 组件收集并展示集群的拓扑信息,实现跨节点查询。 + +## 术语 + +- RW / Primary:读写节点,后统称为 Primary +- RO / Replica:只读节点,后统称为 Replica +- Standby:灾备节点 +- Replication Slot:流复制槽,PostgreSQL 中用于持久化流复制关系的机制 + +## 功能使用 + +集群拓扑视图的维护是完全透明的,用户只需要按照部署文档搭建一写多读的集群,集群拓扑视图即可正确维护起来。关键在于需要搭建带有流复制槽的 Replica / Standby 节点。 + +使用以下接口可以获取集群拓扑视图(执行结果来自于 PolarDB for PostgreSQL 11): + +```sql:no-line-numbers +postgres=# SELECT * FROM polar_cluster_info; + name | host | port | release_date | version | slot_name | type | state | cpu | cpu_quota | memory | memory_quota | iops | iops_quota | connection | connection_quota | px_connection | px_connection_quota | px_node 
+-------+-----------+------+--------------+---------+-----------+---------+-------+-----+-----------+--------+--------------+------+------------+------------+------------------+---------------+---------------------+--------- + node0 | 127.0.0.1 | 5432 | 20220930 | 1.1.27 | | RW | Ready | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | f + node1 | 127.0.0.1 | 5433 | 20220930 | 1.1.27 | replica1 | RO | Ready | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | t + node2 | 127.0.0.1 | 5434 | 20220930 | 1.1.27 | replica2 | RO | Ready | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | t + node3 | 127.0.0.1 | 5431 | 20220930 | 1.1.27 | standby1 | Standby | Ready | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | f +(4 rows) +``` + +- `name` 是节点的名称,是自动生成的。 +- `host` / `port` 表示了节点的连接信息。在这里,都是本地地址。 +- `release_date` 和 `version` 标识了 PolarDB 的版本信息。 +- `slot_name` 是节点连接所使用的流复制槽,只有使用流复制槽连接上来的节点才会被统计在该视图中(除 Primary 节点外)。 +- `type` 表示节点的类型,有三类: + - PolarDB for PostgreSQL 11:RW / RO / Standby + - PolarDB for PostgreSQL 14:Primary / Replica / Standby +- `state` 表示节点的状态。有 Offline / Going Offline / Disabled / Initialized / Pending / Ready / Unknown 这些状态,其中只有 Ready 才有可能参与 PX 计算,其他的都无法参与 PX 计算。 +- `px_node` 表示是否参与 PX 计算。 +- 后续字段都是性能采集相关的字段,目前都是留空的。 + +对于 ePQ 查询来说,默认只有 Replica 节点参与。可以通过参数控制使用 Primary 节点或者 Standby 节点参与计算: + +```sql:no-line-numbers +-- 使 Primary 节点参与计算 +SET polar_px_use_master = ON; + +-- 使 Standby 节点参与计算 +SET polar_px_use_standby = ON; +``` + +::: tip +从 PolarDB for PostgreSQL 14 起,`polar_px_use_master` 参数改名为 `polar_px_use_primary`。 +::: + +还可以使用 `polar_px_nodes` 指定哪些节点参与 PX 计算。例如使用上述集群拓扑视图,可以执行如下命令,让 PX 查询只在 replica1 上执行。 + +```sql:no-line-numbers +SET polar_px_nodes = 'node1'; +``` + +## 设计实现 + +### 信息采集 + +集群拓扑视图信息的采集是通过流复制来传递信息的。该功能对流复制协议增加了新的消息类型用于集群拓扑视图的传递。分为以下两个步骤: + +- Replica / Standby 将状态传递给 Primary +- Primary 汇总集群拓扑视图,返回给 Replica / Standby + +### 更新频率 + +集群拓扑视图并非定时更新与发送,因为视图并非一直变化。只有当节点刚启动时,或发生关键状态变化时再进行更新发送。 + +在具体实现上,Primary 节点收集的全局状态带有版本 generation,只有在接收到节点拓扑变化才会递增;当全局状态版本更新后,才会发送到其他节点,其他节点接收到后,设置到自己的节点上。 + +![生成集群拓扑视图](../../../imgs/cluster_info_generate.png) + +### 采集维度 + +状态指标: + +- 节点 name +- 节点 host / port +- 节点 slot_name +- 节点负载(CPU / MEM / 连接 / IOPS) +- 节点状态 + - Offline + - Going Offline + - Disabled + - Initialized + - Pending + - Ready + - Unknown + +### 消息格式 + +同 WAL Sender / WAL Receiver 的其他消息的做法,新增 `'m'` 和 `'M'` 消息类型,用于收集节点信息和广播集群拓扑视图。 + +### 内部使用 + +提供接口获取 Replica 列表,提供 IP / port 等信息,用于 PX 查询。 + +预留了较多的负载接口,可以根据负载来实现动态调整并行度。(尚未接入) + +同时增加了参数 `polar_px_use_master` / `polar_px_use_standby`,将 Primary / Standby 加入到 PX 计算中,默认不打开(可能会有正确性问题,因为快照格式、Vacuum 等原因,快照有可能不可用)。 + +ePQ 会使用上述信息生成节点的连接信息并缓存下来,并在 ePQ 查询中使用该视图。当 generation 更新或者设置了 `polar_px_nodes` / `polar_px_use_master` / `polar_px_use_standby` 时,该缓存会被重置,并在下次使用时重新生成缓存。 + +### 结果展示 + +通过 `polar_monitor` 插件提供视图,将上述集群拓扑视图提供出去,在任意节点均可获取。 diff --git a/docs/zh/features/v11/epq/epq-create-btree-index.md b/docs/zh/features/v11/epq/epq-create-btree-index.md new file mode 100644 index 00000000000..207e63345b4 --- /dev/null +++ b/docs/zh/features/v11/epq/epq-create-btree-index.md @@ -0,0 +1,81 @@ +--- +author: 棠羽 +date: 2023/09/20 +minute: 20 +--- + +# ePQ 支持创建 B-Tree 索引并行加速 + + + + + +[[toc]] + +## 背景 + +在使用 PostgreSQL 时,如果想要在一张表中查询符合某个条件的行,默认情况下需要扫描整张表的数据,然后对每一行数据依次判断过滤条件。如果符合条件的行数非常少,而表的数据总量非常大,这显然是一个非常低效的操作。与阅读书籍类似,想要阅读某个特定的章节时,读者通常会通过书籍开头处的索引查询到对应章节的页码,然后直接从指定的页码开始阅读;在数据库中,通常会对被频繁查找的列创建索引,以避免进行开销极大的全表扫描:通过索引可以精确定位到被查找的数据位于哪些数据页面上。 + +PostgreSQL 支持创建多种类型的索引,其中使用得最多的是 [B-Tree](https://www.postgresql.org/docs/current/indexes-types.html#INDEXES-TYPES-BTREE) 索引,也是
PostgreSQL 默认创建的索引类型。在一张数据量较大的表上创建索引是一件非常耗时的事,因为其中涉及到的工作包含: + +1. 顺序扫描表中的每一行数据 +2. 根据要创建索引的列值(Scan Key)顺序,对每行数据在表中的物理位置进行排序 +3. 构建索引元组,按 B-Tree 的结构组织并写入索引页面 + +PostgreSQL 支持并行(多进程扫描/排序)和并发(不阻塞 DML)创建索引,但只能在创建索引的过程中使用单个计算节点的资源。 + +PolarDB-PG 的 ePQ 弹性跨机并行查询特性支持对 B-Tree 类型的索引创建进行加速。ePQ 能够利用多个计算节点的 I/O 带宽并行扫描全表数据,并利用多个计算节点的 CPU 和内存资源对每行数据在表中的物理位置按索引列值进行排序,构建索引元组。最终,将有序的索引元组归并到创建索引的进程中,写入索引页面,完成索引的创建。 + +## 使用方法 + +### 数据准备 + +创建一张包含三个列,数据量为 1000000 行的表: + +```sql:no-line-numbers +CREATE TABLE t (id INT, age INT, msg TEXT); + +INSERT INTO t +SELECT + random() * 1000000, + random() * 10000, + md5(random()::text) +FROM generate_series(1, 1000000); +``` + +### 创建索引 + +使用 ePQ 创建索引需要以下三个步骤: + +1. 设置参数 `polar_enable_px` 为 `ON`,打开 ePQ 的开关 +2. 按需设置参数 `polar_px_dop_per_node` 调整查询并行度 +3. 在创建索引时显式声明 `px_build` 属性为 `ON` + +```sql:no-line-numbers +SET polar_enable_px TO ON; +SET polar_px_dop_per_node TO 8; +CREATE INDEX t_idx1 ON t(id, msg) WITH(px_build = ON); +``` + +在创建索引的过程中,数据库会对正在创建索引的表施加 [`ShareLock`](https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-TABLES) 锁。这个级别的锁将会阻塞其它进程对表的 DML 操作(`INSERT` / `UPDATE` / `DELETE`)。 + +### 并发创建索引 + +类似地,ePQ 支持并发创建索引,只需要在 `CREATE INDEX` 后加上 `CONCURRENTLY` 关键字即可: + +```sql:no-line-numbers +SET polar_enable_px TO ON; +SET polar_px_dop_per_node TO 8; +CREATE INDEX CONCURRENTLY t_idx2 ON t(id, msg) WITH(px_build = ON); +``` + +在创建索引的过程中,数据库会对正在创建索引的表施加 [`ShareUpdateExclusiveLock`](https://www.postgresql.org/docs/current/explicit-locking.html#LOCKING-TABLES) 锁。这个级别的锁将不会阻塞其它进程对表的 DML 操作。 + +## 使用限制 + +ePQ 加速创建索引暂不支持以下场景: + +- 创建 `UNIQUE` 索引 +- 创建索引时附带 `INCLUDE` 列 +- 创建索引时指定 `TABLESPACE` +- 创建索引时带有 `WHERE` 而成为部分索引(Partial Index) diff --git a/docs/zh/features/v11/epq/epq-ctas-mtview-bulk-insert.md b/docs/zh/features/v11/epq/epq-ctas-mtview-bulk-insert.md new file mode 100644 index 00000000000..5a296ed89f5 --- /dev/null +++ b/docs/zh/features/v11/epq/epq-ctas-mtview-bulk-insert.md @@ -0,0 +1,53 @@ +--- +author: 棠羽 +date: 2023/02/08 +minute: 10 +--- + +# ePQ 支持创建/刷新物化视图并行加速和批量写入 + + + + + +[[toc]] + +## 背景 + +[物化视图 (Materialized View)](https://en.wikipedia.org/wiki/Materialized_view) 是一个包含查询结果的数据库对象。与普通的视图不同,物化视图不仅保存视图的定义,还保存了 [创建物化视图](https://www.postgresql.org/docs/current/sql-creatematerializedview.html) 时的数据副本。当物化视图的数据与视图定义中的数据不一致时,可以进行 [物化视图刷新 (Refresh)](https://www.postgresql.org/docs/current/sql-refreshmaterializedview.html) 保持物化视图中的数据与视图定义一致。物化视图本质上是对视图定义中的查询做预计算,以便于在查询时复用。 + +[`CREATE TABLE AS`](https://www.postgresql.org/docs/current/sql-createtableas.html) 语法用于将一个查询所对应的数据构建为一个新的表,其表结构与查询的输出列完全相同。 + +[`SELECT INTO`](https://www.postgresql.org/docs/current/sql-selectinto.html) 语法用于建立一张新表,并将查询所对应的数据写入表中,而不是将查询到的数据返回给客户端。其表结构与查询的输出列完全相同。 + +## 功能原理介绍 + +对于物化视图的创建和刷新,以及 `CREATE TABLE AS` / `SELECT INTO` 语法,由于在数据库层面需要完成的工作步骤十分相似,因此 PostgreSQL 内核使用同一套代码逻辑来处理这几种语法。内核执行过程中的主要步骤包含: + +1. 数据扫描:执行视图定义或 `CREATE TABLE AS` / `SELECT INTO` 语法中定义的查询,扫描符合查询条件的数据 +2. 数据写入:将上述步骤中扫描到的数据写入到一个新的物化视图 / 表中 + +PolarDB for PostgreSQL 对上述两个步骤分别引入了 ePQ 并行扫描和批量数据写入的优化。在需要扫描或写入的数据量较大时,能够显著提升上述 DDL 语法的性能,缩短执行时间: + +1. ePQ 并行扫描:通过 ePQ 功能,利用多个计算节点的 I/O 带宽和计算资源并行执行视图定义中的查询,提升计算资源和带宽的利用率 +2. 批量写入:不再将扫描到的每一个元组依次写入表或物化视图,而是在内存中攒够一定数量的元组后,一次性批量写入表或物化视图中,减少记录 WAL 日志的开销,降低对页面的锁定频率 + +## 使用说明 + +### ePQ 并行扫描 + +将以下参数设置为 `ON` 即可启用 ePQ 并行扫描来加速上述语法中的查询过程,目前其默认值为 `ON`。该参数生效的前置条件是 ePQ 特性的总开关 `polar_enable_px` 被打开。 + +```sql:no-line-numbers +SET polar_px_enable_create_table_as = ON; +``` + +由于 ePQ 特性的限制,该优化不支持 `CREATE TABLE AS ... WITH OIDS` 语法。该语法的处理流程将会回退为使用 PostgreSQL 内置优化器为 DDL 定义中的查询生成执行计划,并通过 PostgreSQL 的单机执行器完成查询。 + +### 批量写入 + +将以下参数设置为 `ON` 即可启用批量写入来加速上述语法中的写入过程,目前其默认值为 `ON`。 + +```sql:no-line-numbers +SET polar_enable_create_table_as_bulk_insert = ON; +```
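+
+下面给出一个简单的使用示意(其中的表名 `src`、物化视图名 `mv` 以及数据内容均为本文假设的示例,并非固定用法):在打开上述参数后,物化视图创建与刷新过程中的查询部分将由 ePQ 并行执行,写入部分则以批量方式完成。
+
+```sql:no-line-numbers
+-- 打开 ePQ 总开关及本功能的两个开关(后两者默认即为 ON)
+SET polar_enable_px = ON;
+SET polar_px_enable_create_table_as = ON;
+SET polar_enable_create_table_as_bulk_insert = ON;
+
+-- 构造一张示例表并写入数据
+CREATE TABLE src (id INT, val TEXT);
+INSERT INTO src SELECT i, md5(i::text) FROM generate_series(1, 1000000) i;
+
+-- 创建物化视图:视图定义中的查询由 ePQ 并行扫描,查询结果批量写入
+CREATE MATERIALIZED VIEW mv AS SELECT id, val FROM src WHERE id % 2 = 0;
+
+-- 刷新物化视图:重新执行视图定义中的查询,并将结果批量写入
+REFRESH MATERIALIZED VIEW mv;
+```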
diff --git a/docs/zh/features/v11/epq/epq-explain-analyze.md b/docs/zh/features/v11/epq/epq-explain-analyze.md new file mode 100644 index 00000000000..cd9fdf086a8 --- /dev/null +++ b/docs/zh/features/v11/epq/epq-explain-analyze.md @@ -0,0 +1,71 @@ +--- +author: 渊云、秦疏 +date: 2023/09/06 +minute: 30 +--- + +# ePQ 执行计划查看与分析 + + + + + +[[toc]] + +## 背景 + +PostgreSQL 提供了 `EXPLAIN` 命令用于 SQL 语句的性能分析。它能够输出 SQL 对应的查询计划,以及在执行过程中的具体耗时、资源消耗等信息,可用于排查 SQL 的性能瓶颈。 + +`EXPLAIN` 命令原先只适用于单机执行的 SQL 性能分析。PolarDB-PG 的 ePQ 弹性跨机并行查询扩展了 `EXPLAIN` 的功能,使其可以打印 ePQ 的跨机并行执行计划,还能够统计 ePQ 执行计划在各个算子上的执行时间、数据扫描量、内存使用量等信息,并以统一的视角返回给客户端。 + +## 功能介绍 + +### 执行计划查看 + +ePQ 的执行计划是分片的。每个计划分片(Slice)由计算节点上的虚拟执行单元(Segment)启动的一组进程(Gang)负责执行,完成 SQL 的一部分计算。ePQ 在执行计划中引入了 Motion 算子,用于在执行不同计划分片的进程组之间进行数据传递。因此,Motion 算子就是计划分片的边界。 + +ePQ 中总共引入了三种 Motion 算子: + +- `PX Coordinator`:源端数据发送到同一个目标端(汇聚) +- `PX Broadcast`:源端数据发送到每一个目标端(广播) +- `PX Hash`:源端数据经过哈希计算后发送到某一个目标端(重分布) + +以一个简单查询作为例子: + +```sql:no-line-numbers +=> CREATE TABLE t (id INT); +=> SET polar_enable_px TO ON; +=> EXPLAIN (COSTS OFF) SELECT * FROM t LIMIT 1; + QUERY PLAN +------------------------------------------------- + Limit + -> PX Coordinator 6:1 (slice1; segments: 6) + -> Partial Seq Scan on t + Optimizer: PolarDB PX Optimizer +(4 rows) +``` + +以上执行计划以 Motion 算子为界,被分为了两个分片:一个是接收最终结果的分片 `slice0`,一个是扫描数据的分片 `slice1`。对于 `slice1` 这个计划分片,ePQ 将使用六个执行单元(`segments: 6`)分别启动一个进程来执行,这六个进程各自负责扫描表的一部分数据(`Partial Seq Scan`),通过 Motion 算子将六个进程的数据汇聚到一个目标端(`PX Coordinator 6:1`),传递给 `Limit` 算子。 + +如果查询逐渐复杂,则执行计划中的计划分片和 Motion 算子会越来越多: + +```sql:no-line-numbers +=> CREATE TABLE t1 (a INT, b INT, c INT); +=> SET polar_enable_px TO ON; +=> EXPLAIN (COSTS OFF) SELECT SUM(b) FROM t1 GROUP BY a LIMIT 1; + QUERY PLAN +------------------------------------------------------------ + Limit + -> PX Coordinator 6:1 (slice1; segments: 6) + -> GroupAggregate + Group Key: a + -> Sort + Sort Key: a + -> PX Hash 6:6 (slice2; segments: 6) + Hash Key: a + -> Partial Seq Scan on t1 + Optimizer: PolarDB PX Optimizer +(10 rows) +``` + +以上执行计划中总共有三个计划分片。将会有六个进程(`segments: 6`)负责执行 `slice2` 分片,分别扫描表的一部分数据,然后通过 Motion 算子(`PX Hash 6:6`)将数据重分布到另外六个(`segments: 6`)负责执行 `slice1` 分片的进程上,各自完成排序(`Sort`)和聚合(`GroupAggregate`),最终通过 Motion 算子(`PX Coordinator 6:1`)将数据汇聚到结果分片 `slice0`。 diff --git a/docs/zh/features/v11/epq/epq-node-and-dop.md b/docs/zh/features/v11/epq/epq-node-and-dop.md new file mode 100644 index 00000000000..b89805c7349 --- /dev/null +++ b/docs/zh/features/v11/epq/epq-node-and-dop.md @@ -0,0 +1,142 @@ +--- +author: 渊云 +date: 2023/09/06 +minute: 20 +--- + +# ePQ 计算节点范围选择与并行度控制 + + + + + +[[toc]] + +## 背景介绍 + +PolarDB-PG 的 ePQ 弹性跨机并行查询特性提供了精细的粒度控制方法,可以合理使用集群内的计算资源。在最大程度利用闲置计算资源进行并行查询,提升资源利用率的同时,避免了对其它业务负载产生影响: + +1. ePQ 可以动态调整集群中参与并行查询的计算节点范围,避免使用负载较高的计算节点 +2.
ePQ 支持为每条查询动态调整在计算节点上的并行度,避免 ePQ 并行查询进程对计算资源的消耗影响到相同节点上的其它进程 + +## 计算节点范围选择 + +参数 `polar_px_nodes` 指定了参与 ePQ 的计算节点范围,默认值为空,表示所有只读节点都参与 ePQ 并行查询: + +```sql:no-line-numbers +=> SHOW polar_px_nodes; + polar_px_nodes +---------------- + +(1 row) +``` + +如果希望读写节点也参与 ePQ 并行,则可以设置如下参数: + +```sql:no-line-numbers +SET polar_px_use_primary TO ON; +``` + +如果部分只读节点负载较高,则可以通过修改 `polar_px_nodes` 参数设置仅特定几个而非所有只读节点参与 ePQ 并行查询。参数 `polar_px_nodes` 的合法格式是一个以英文逗号分隔的节点名称列表。获取节点名称需要安装 `polar_monitor` 插件: + +```sql:no-line-numbers +CREATE EXTENSION IF NOT EXISTS polar_monitor; +``` + +通过 `polar_monitor` 插件提供的集群拓扑视图,可以查询到集群中所有计算节点的名称: + +```sql:no-line-numbers +=> SELECT name,slot_name,type FROM polar_cluster_info; + name | slot_name | type +-------+-----------+--------- + node0 | | Primary + node1 | standby1 | Standby + node2 | replica1 | Replica + node3 | replica2 | Replica +(4 rows) +``` + +其中: + +- `Primary` 表示读写节点 +- `Replica` 表示只读节点 +- `Standby` 表示备库节点 + +通用的最佳实践是使用负载较低的只读节点参与 ePQ 并行查询: + +```sql:no-line-numbers +=> SET polar_px_nodes = 'node2,node3'; +=> SHOW polar_px_nodes; + polar_px_nodes +---------------- + node2,node3 +(1 row) +``` + +## 并行度控制 + +参数 `polar_px_dop_per_node` 用于设置当前会话中的 ePQ 查询在每个计算节点上的执行单元(Segment)数量,每个执行单元会为其需要执行的每一个计划分片(Slice)启动一个进程。 + +该参数默认值为 `3`,通用最佳实践值为当前计算节点 CPU 核心数的一半。如果计算节点的 CPU 负载较高,可以酌情递减该参数,控制计算节点的 CPU 占用率至 80% 以下;如果查询性能不佳时,可以酌情递增该参数,也需要保持计算节点的 CPU 水位不高于 80%。否则可能会拖慢其它的后台进程。 + +## 并行度计算方法示例 + +创建一张表: + +```sql:no-line-numbers +CREATE TABLE test(id INT); +``` + +假设集群内有两个只读节点,`polar_px_nodes` 为空,此时 ePQ 将使用集群内的所有只读节点参与并行查询;参数 `polar_px_dop_per_node` 的值为 `3`,表示每个计算节点上将会有三个执行单元。执行计划如下: + +```sql:no-line-numbers +=> SHOW polar_px_nodes; + polar_px_nodes +---------------- + +(1 row) + +=> SHOW polar_px_dop_per_node; + polar_px_dop_per_node +----------------------- + 3 +(1 row) + +=> EXPLAIN SELECT * FROM test; + QUERY PLAN +------------------------------------------------------------------------------- + PX Coordinator 6:1 (slice1; segments: 6) (cost=0.00..431.00 rows=1 width=4) + -> Partial Seq Scan on test (cost=0.00..431.00 rows=1 width=4) + Optimizer: PolarDB PX Optimizer +(3 rows) +``` + +从执行计划中可以看出,两个只读节点上总计有六个执行单元(`segments: 6`)将会执行这个计划中唯一的计划分片 `slice1`。这意味着总计会有六个进程并行执行当前查询。 + +此时,调整 `polar_px_dop_per_node` 为 `4`,再次执行查询,两个只读节点上总计会有八个执行单元参与当前查询。由于执行计划中只有一个计划分片 `slice1`,这意味着总计会有八个进程并行执行当前查询: + +```sql:no-line-numbers +=> SET polar_px_dop_per_node TO 4; +SET +=> EXPLAIN SELECT * FROM test; + QUERY PLAN +------------------------------------------------------------------------------- + PX Coordinator 8:1 (slice1; segments: 8) (cost=0.00..431.00 rows=1 width=4) + -> Partial Seq Scan on test (cost=0.00..431.00 rows=1 width=4) + Optimizer: PolarDB PX Optimizer +(3 rows) +``` + +此时,如果设置 `polar_px_use_primary` 参数,让读写节点也参与查询,那么读写节点上也将会有四个执行单元参与 ePQ 并行执行,集群内总计 12 个进程参与并行执行: + +```sql:no-line-numbers +=> SET polar_px_use_primary TO ON; +SET +=> EXPLAIN SELECT * FROM test; + QUERY PLAN +--------------------------------------------------------------------------------- + PX Coordinator 12:1 (slice1; segments: 12) (cost=0.00..431.00 rows=1 width=4) + -> Partial Seq Scan on test (cost=0.00..431.00 rows=1 width=4) + Optimizer: PolarDB PX Optimizer +(3 rows) +``` diff --git a/docs/zh/features/v11/epq/epq-partitioned-table.md b/docs/zh/features/v11/epq/epq-partitioned-table.md new file mode 100644 index 00000000000..3f22a76e59e --- /dev/null +++ b/docs/zh/features/v11/epq/epq-partitioned-table.md @@ -0,0 +1,227 @@ +--- +author: 渊云 +date: 2023/09/06 +minute: 20 +--- + +# ePQ 支持分区表查询 + + + + + 
+[[toc]] + +## 背景 + +随着数据量的不断增长,表的规模将会越来越大。为了方便管理和提高查询性能,比较好的实践是使用分区表,将大表拆分成多个子分区表。甚至每个子分区表还可以进一步拆成二级子分区表,从而形成了多级分区表。 + +PolarDB-PG 支持 ePQ 弹性跨机并行查询,能够利用集群中多个计算节点提升只读查询的性能。ePQ 不仅能够对普通表进行高效的跨机并行查询,对分区表也实现了跨机并行查询。 + +ePQ 对分区表的基础功能支持包含: + +- 对分区策略为 Range / List / Hash 的分区表进行并行扫描 +- 对分区表进行索引扫描 +- 对分区表进行连接查询 + +此外,ePQ 还支持了部分与分区表相关的高级功能: + +- 分区裁剪 +- 智能分区连接(Partition Wise Join) +- 对多级分区表进行并行查询 + +ePQ 暂不支持对具有多列分区键的分区表进行并行查询。 + +## 使用指南 + +### 分区表并行查询 + +创建一张分区策略为 Range 的分区表,并创建三个子分区: + +```sql:no-line-numbers +CREATE TABLE t1 (id INT) PARTITION BY RANGE(id); +CREATE TABLE t1_p1 PARTITION OF t1 FOR VALUES FROM (0) TO (200); +CREATE TABLE t1_p2 PARTITION OF t1 FOR VALUES FROM (200) TO (400); +CREATE TABLE t1_p3 PARTITION OF t1 FOR VALUES FROM (400) TO (600); +``` + +设置参数打开 ePQ 开关和 ePQ 分区表扫描功能的开关: + +```sql:no-line-numbers +SET polar_enable_px TO ON; +SET polar_px_enable_partition TO ON; +``` + +查看对分区表进行全表扫描的执行计划: + +```sql:no-line-numbers +=> EXPLAIN (COSTS OFF) SELECT * FROM t1; + QUERY PLAN +------------------------------------------- + PX Coordinator 6:1 (slice1; segments: 6) + -> Append + -> Partial Seq Scan on t1_p1 + -> Partial Seq Scan on t1_p2 + -> Partial Seq Scan on t1_p3 + Optimizer: PolarDB PX Optimizer +(6 rows) +``` + +ePQ 将会启动一组进程并行扫描分区表的每一个子表。每一个扫描进程都会通过 `Append` 算子依次扫描每一个子表的一部分数据(`Partial Seq Scan`),并通过 Motion 算子(`PX Coordinator`)将所有进程的扫描结果汇聚到发起查询的进程并返回。 + +### 分区静态裁剪 + +当查询的过滤条件中包含分区键时,ePQ 优化器可以根据过滤条件对将要扫描的分区表进行裁剪,避免扫描不需要的子分区,节省系统资源,提升查询性能。以上述 `t1` 表为例,查看以下查询的执行计划: + +```sql:no-line-numbers +=> EXPLAIN (COSTS OFF) SELECT * FROM t1 WHERE id < 100; + QUERY PLAN +------------------------------------------- + PX Coordinator 6:1 (slice1; segments: 6) + -> Append + -> Partial Seq Scan on t1_p1 + Filter: (id < 100) + Optimizer: PolarDB PX Optimizer +(5 rows) +``` + +由于查询的过滤条件 `id < 100` 包含分区键,因此 ePQ 优化器可以根据分区表的分区边界,在产生执行计划时去除不符合过滤条件的子分区(`t1_p2`、`t1_p3`),只保留符合过滤条件的子分区(`t1_p1`)。 + +### 智能分区连接 + +在进行分区表之间的连接操作时,如果分区策略和边界相同,并且连接条件为分区键时,ePQ 优化器可以产生以子分区为单位进行连接的执行计划,避免两张分区表的进行笛卡尔积式的连接,节省系统资源,提升查询性能。 + +以两张 Range 分区表的连接为例。使用以下 SQL 创建两张分区策略和边界都相同的分区表 `t2` 和 `t3`: + +```sql:no-line-numbers +CREATE TABLE t2 (id INT) PARTITION BY RANGE(id); +CREATE TABLE t2_p1 PARTITION OF t2 FOR VALUES FROM (0) TO (200); +CREATE TABLE t2_p2 PARTITION OF t2 FOR VALUES FROM (200) TO (400); +CREATE TABLE t2_p3 PARTITION OF t2 FOR VALUES FROM (400) TO (600); + +CREATE TABLE t3 (id INT) PARTITION BY RANGE(id); +CREATE TABLE t3_p1 PARTITION OF t3 FOR VALUES FROM (0) TO (200); +CREATE TABLE t3_p2 PARTITION OF t3 FOR VALUES FROM (200) TO (400); +CREATE TABLE t3_p3 PARTITION OF t3 FOR VALUES FROM (400) TO (600); +``` + +打开以下参数启用 ePQ 对分区表的支持: + +```sql:no-line-numbers +SET polar_enable_px TO ON; +SET polar_px_enable_partition TO ON; +``` + +当 Partition Wise join 关闭时,两表在分区键上等值连接的执行计划如下: + +```sql:no-line-numbers +=> SET polar_px_enable_partitionwise_join TO OFF; +=> EXPLAIN (COSTS OFF) SELECT * FROM t2 JOIN t3 ON t2.id = t3.id; + QUERY PLAN +----------------------------------------------------------- + PX Coordinator 6:1 (slice1; segments: 6) + -> Hash Join + Hash Cond: (t2_p1.id = t3_p1.id) + -> Append + -> Partial Seq Scan on t2_p1 + -> Partial Seq Scan on t2_p2 + -> Partial Seq Scan on t2_p3 + -> Hash + -> PX Broadcast 6:6 (slice2; segments: 6) + -> Append + -> Partial Seq Scan on t3_p1 + -> Partial Seq Scan on t3_p2 + -> Partial Seq Scan on t3_p3 + Optimizer: PolarDB PX Optimizer +(14 rows) +``` + +从执行计划中可以看出,执行 `slice1` 计划分片的六个进程会分别通过 `Append` 算子依次扫描分区表 `t2` 每一个子分区的一部分数据,并通过 Motion 算子(`PX Broadcast`)接收来自执行 `slice2` 
的六个进程广播的 `t3` 全表数据,在本地完成哈希连接(`Hash Join`)后,通过 Motion 算子(`PX Coordinator`)汇聚结果并返回。本质上,分区表 `t2` 的每一行数据都与 `t3` 的每一行数据做了一次连接。 + +打开参数 `polar_px_enable_partitionwise_join` 启用 Partition Wise join 后,再次查看执行计划: + +```sql:no-line-numbers +=> SET polar_px_enable_partitionwise_join TO ON; +=> EXPLAIN (COSTS OFF) SELECT * FROM t2 JOIN t3 ON t2.id = t3.id; + QUERY PLAN +------------------------------------------------ + PX Coordinator 6:1 (slice1; segments: 6) + -> Append + -> Hash Join + Hash Cond: (t2_p1.id = t3_p1.id) + -> Partial Seq Scan on t2_p1 + -> Hash + -> Full Seq Scan on t3_p1 + -> Hash Join + Hash Cond: (t2_p2.id = t3_p2.id) + -> Partial Seq Scan on t2_p2 + -> Hash + -> Full Seq Scan on t3_p2 + -> Hash Join + Hash Cond: (t2_p3.id = t3_p3.id) + -> Partial Seq Scan on t2_p3 + -> Hash + -> Full Seq Scan on t3_p3 + Optimizer: PolarDB PX Optimizer +(18 rows) +``` + +在上述执行计划中,执行 `slice1` 计划分片的六个进程将通过 `Append` 算子依次扫描分区表 `t2` 每个子分区中的一部分数据,以及分区表 `t3` **相对应子分区** 的全部数据,将两份数据进行哈希连接(`Hash Join`),最终通过 Motion 算子(`PX Coordinator`)汇聚结果并返回。在上述执行过程中,分区表 `t2` 的每一个子分区 `t2_p1`、`t2_p2`、`t2_p3` 分别只与分区表 `t3` 对应的 `t3_p1`、`t3_p2`、`t3_p3` 做了连接,并没有与其它不相关的分区连接,节省了不必要的工作。 + +### 多级分区表并行查询 + +在多级分区表中,每级分区表的分区维度(分区键)可以不同:比如一级分区表按照时间维度分区,二级分区表按照地域维度分区。当查询 SQL 的过滤条件中包含每一级分区表中的分区键时,ePQ 优化器支持对多级分区表进行静态分区裁剪,从而过滤掉不需要被扫描的子分区。 + +以下图为例:当查询过滤条件 `WHERE date = '202201' AND region = 'beijing'` 中包含一级分区键 `date` 和二级分区键 `region` 时,ePQ 优化器能够裁剪掉所有不相关的分区,产生的执行计划中只包含符合条件的子分区。由此,执行器只对需要扫描的子分区进行扫描即可。 + +![multi-level-partition](../../../imgs/htap-multi-level-partition-1.png) + +使用以下 SQL 为例,创建一张多级分区表: + +```sql:no-line-numbers +CREATE TABLE r1 (a INT, b TIMESTAMP) PARTITION BY RANGE (b); + +CREATE TABLE r1_p1 PARTITION OF r1 FOR VALUES FROM ('2000-01-01') TO ('2010-01-01') PARTITION BY RANGE (a); +CREATE TABLE r1_p1_p1 PARTITION OF r1_p1 FOR VALUES FROM (1) TO (1000000); +CREATE TABLE r1_p1_p2 PARTITION OF r1_p1 FOR VALUES FROM (1000000) TO (2000000); + +CREATE TABLE r1_p2 PARTITION OF r1 FOR VALUES FROM ('2010-01-01') TO ('2020-01-01') PARTITION BY RANGE (a); +CREATE TABLE r1_p2_p1 PARTITION OF r1_p2 FOR VALUES FROM (1) TO (1000000); +CREATE TABLE r1_p2_p2 PARTITION OF r1_p2 FOR VALUES FROM (1000000) TO (2000000); +``` + +打开以下参数启用 ePQ 对分区表的支持: + +```sql:no-line-numbers +SET polar_enable_px TO ON; +SET polar_px_enable_partition TO ON; +``` + +执行一条以两级分区键作为过滤条件的 SQL,并关闭 ePQ 的多级分区扫描功能,将得到 PostgreSQL 内置优化器经过多级分区静态裁剪后的执行计划: + +```sql:no-line-numbers +=> SET polar_px_optimizer_multilevel_partitioning TO OFF; +=> EXPLAIN (COSTS OFF) SELECT * FROM r1 WHERE a < 1000000 AND b < '2009-01-01 00:00:00'; + QUERY PLAN +---------------------------------------------------------------------------------------- + Seq Scan on r1_p1_p1 r1 + Filter: ((a < 1000000) AND (b < '2009-01-01 00:00:00'::timestamp without time zone)) +(2 rows) +``` + +启用 ePQ 的多级分区扫描功能,再次查看执行计划: + +```sql:no-line-numbers +=> SET polar_px_optimizer_multilevel_partitioning TO ON; +=> EXPLAIN (COSTS OFF) SELECT * FROM r1 WHERE a < 1000000 AND b < '2009-01-01 00:00:00'; + QUERY PLAN +---------------------------------------------------------------------------------------------------- + PX Coordinator 6:1 (slice1; segments: 6) + -> Append + -> Partial Seq Scan on r1_p1_p1 + Filter: ((a < 1000000) AND (b < '2009-01-01 00:00:00'::timestamp without time zone)) + Optimizer: PolarDB PX Optimizer +(5 rows) +``` + +在上述计划中,ePQ 优化器进行了对多级分区表的静态裁剪。执行 `slice1` 计划分片的六个进程只需对符合过滤条件的子分区 `r1_p1_p1` 进行并行扫描(`Partial Seq Scan`)即可,并将扫描到的数据通过 Motion 算子(`PX Coordinator`)汇聚并返回。 diff --git 
a/docs/zh/features/v11/epq/parallel-dml.md b/docs/zh/features/v11/epq/parallel-dml.md new file mode 100644 index 00000000000..3bee16edce8 --- /dev/null +++ b/docs/zh/features/v11/epq/parallel-dml.md @@ -0,0 +1,99 @@ +--- +author: 渊云 +date: 2022/09/27 +minute: 30 +--- + +# 并行 INSERT + + + + + +[[toc]] + +## 背景介绍 + +PolarDB-PG 支持 ePQ 弹性跨机并行查询,能够利用集群中多个计算节点提升只读查询的性能。此外,ePQ 也支持在读写节点上通过多进程并行写入,实现对 `INSERT` 语句的加速。 + +## 功能介绍 + +ePQ 的并行 `INSERT` 功能可以用于加速 `INSERT INTO ... SELECT ...` 这种读写兼备的 SQL。对于 SQL 中的 `SELECT` 部分,ePQ 将启动多个进程并行执行查询;对于 SQL 中的 `INSERT` 部分,ePQ 将在读写节点上启动多个进程并行执行写入。执行写入的进程与执行查询的进程之间通过 **Motion 算子** 进行数据传递。 + +能够支持并行 `INSERT` 的表类型有: + +- 普通表 +- 分区表 +- (部分)外部表 + +并行 `INSERT` 支持动态调整写入并行度(写入进程数量),在查询不成为瓶颈的条件下性能最高能提升三倍。 + +## 使用方法 + +创建两张表 `t1` 和 `t2`,向 `t1` 中插入一些数据: + +```sql:no-line-numbers +CREATE TABLE t1 (id INT); +CREATE TABLE t2 (id INT); +INSERT INTO t1 SELECT generate_series(1,100000); +``` + +打开 ePQ 及并行 `INSERT` 的开关: + +```sql:no-line-numbers +SET polar_enable_px TO ON; +SET polar_px_enable_insert_select TO ON; +``` + +通过 `INSERT` 语句将 `t1` 表中的所有数据插入到 `t2` 表中。查看并行 `INSERT` 的执行计划: + +```sql:no-line-numbers +=> EXPLAIN INSERT INTO t2 SELECT * FROM t1; + QUERY PLAN +----------------------------------------------------------------------------------------- + Insert on t2 (cost=0.00..952.87 rows=33334 width=4) + -> Result (cost=0.00..0.00 rows=0 width=0) + -> PX Hash 6:3 (slice1; segments: 6) (cost=0.00..432.04 rows=100000 width=8) + -> Partial Seq Scan on t1 (cost=0.00..431.37 rows=16667 width=4) + Optimizer: PolarDB PX Optimizer +(5 rows) +``` + +其中的 `PX Hash 6:3` 表示 6 个并行查询 `t1` 的进程通过 Motion 算子将数据传递给 3 个并行写入 `t2` 的进程。 + +通过参数 `polar_px_insert_dop_num` 可以动态调整写入并行度,比如: + +```sql:no-line-numbers +=> SET polar_px_insert_dop_num TO 12; +=> EXPLAIN INSERT INTO t2 SELECT * FROM t1; + QUERY PLAN +------------------------------------------------------------------------------------------ + Insert on t2 (cost=0.00..952.87 rows=8334 width=4) + -> Result (cost=0.00..0.00 rows=0 width=0) + -> PX Hash 6:12 (slice1; segments: 6) (cost=0.00..432.04 rows=100000 width=8) + -> Partial Seq Scan on t1 (cost=0.00..431.37 rows=16667 width=4) + Optimizer: PolarDB PX Optimizer +(5 rows) +``` + +执行计划中的 `PX Hash 6:12` 显示,并行查询 `t1` 的进程数量不变,并行写入 `t2` 的进程数量变更为 `12`。 + +## 使用说明 + +调整 `polar_px_dop_per_node` 和 `polar_px_insert_dop_num` 可以分别修改 `INSERT INTO ... SELECT ...` 中查询和写入的并行度。 + +1. 当查询并行度较低时,逐步提升写入并行度,SQL 执行时间将会逐渐下降并趋于平缓;趋于平缓的原因是查询速度跟不上写入速度而成为瓶颈 +2. 当查询并行度较高时,逐步提升写入并行度,SQL 执行时间将会逐渐下降并趋于平缓;趋于平缓的原因是并行写入只能在读写节点上进行,写入速度因多个写入进程对表页面扩展锁的争抢而跟不上查询速度,成为瓶颈 + +## 原理介绍 + +ePQ 对并行 `INSERT` 的处理如下: + +1. ePQ 优化器以查询解析得到的语法树作为输入,产生计划树 +2. ePQ 执行器将计划树分发到各计算节点,并创建并行查询/并行写入进程,开始执行各自负责执行的子计划 +3. 并行查询进程从存储中并行读取各自负责的数据分片,并将数据发送到 Motion 算子 +4. 
并行写入进程从 Motion 算子中获取数据,向存储并行写入数据 + +并行查询和并行写入是以流水线的形式同时进行的。上述执行过程如图所示: + +![parallel_insert_data_flow](../../../imgs/parallel_data_flow.png) diff --git a/docs/zh/features/v11/htap/README.md b/docs/zh/features/v11/htap/README.md deleted file mode 100644 index d53df9db40a..00000000000 --- a/docs/zh/features/v11/htap/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# HTAP - -- [自适应扫描](./adaptive-scan.md) -- [并行 DML](./parallel-dml.md) -- [多级分区表静态裁剪与并行扫描](./multi-level-partition.md) diff --git a/docs/zh/features/v11/htap/multi-level-partition.md b/docs/zh/features/v11/htap/multi-level-partition.md deleted file mode 100644 index 9c6db097559..00000000000 --- a/docs/zh/features/v11/htap/multi-level-partition.md +++ /dev/null @@ -1,117 +0,0 @@ ---- -author: 秦疏 -date: 2022/11/28 -minute: 20 ---- - -# 多级分区表静态裁剪与并行扫描 - - - - - -[[toc]] - -## 背景 - -随着数据量的不断增长,表的规模将会越来越大。为了方便管理和提高查询性能,用户一般会使用分区表,将大表拆分成多个子分区表,每个子分区表又进一步可以拆成二级子分区表,从而形成了多级分区表。 - -PolarDB for PostgreSQL 支持多级分区表的静态分区裁剪,避免对无关分区进行扫描。同时,针对被裁剪后的分区表,可以进一步开启并行查询能力,从而加快分区表的查询性能。 - -## 术语 - -- QC:Query Coordinator,发起 PX 并行查询的进程角色。 -- PX Worker:参与 PX 跨节点并行查询的工作进程角色。 -- Worker ID:唯一标识一个 PX Worker 的编号。 - -## 原理 - -在多级分区表中,每一级分区表的分区维度可以不同,如下图所示:比如一级分区表按照时间(date)维度分区,二级分区表按照地域(region)维度分区。当 QC 发起查询时,优化器可以根据查询条件(如 `date = '202201' AND region = 'beijing'`)与每一级分区表的分区键进行匹配,从而过滤掉不需要被扫描的子分区,只保留符合条件的分区表。 - -如果满足条件的分区表数量较多,或者分区表中数据较多,那么可以结合 PolarDB for PostgreSQL 的并行查询(PX)能力,并行扫描对应的数据页面。在 PolarDB for PostgreSQL 共享存储的架构下,读写节点和只读节点对所有表数据都是可见的,因此可以在多个只读节点中启动 PX Worker 并行扫描,最后将结果汇总到 QC 进程。 - -![multi-level-partition](../../../imgs/htap-multi-level-partition-1.png) - -## 使用指南 - -### GUC 参数 - -多级分区表并行查询功能依赖如下两个 GUC 参数: - -| GUC 参数名 | 参数说明 | -| -------------------------------------------- | -------------------------------------- | -| `polar_enable_px` | 开启 PolarDB PostgreSQL 的并行查询功能 | -| `polar_px_optimizer_multilevel_partitioning` | 开启多级分区表并行查询功能 | - -具体开启方式如下: - -```sql:no-line-numbers -SET polar_enable_px = ON; -SET polar_px_optimizer_multilevel_partitioning = ON; -``` - -### 创建多级分区表 - -```sql:no-line-numbers --- 主表 -CREATE TABLE range_list (a int,b timestamp,c varchar(10)) PARTITION BY RANGE (b); - --- 创建两个一级分区表 -CREATE TABLE range_pa1 PARTITION OF range_list FOR VALUES FROM ('2000-01-01') TO ('2010-01-01') PARTITION BY RANGE (a); -CREATE TABLE range_pa2 PARTITION OF range_list FOR VALUES FROM ('2010-01-01') TO ('2020-01-01') PARTITION BY RANGE (a); - --- 分别为每个一级分区表创建两个二级子分区表 -CREATE TABLE range_list_2000_2010_1_10 PARTITION OF range_pa1 FOR VALUES from (1) TO (1000000); -CREATE TABLE range_list_2000_2010_10_20 PARTITION OF range_pa1 FOR VALUES from (1000000) TO (2000000); -CREATE TABLE range_list_2010_2020_1_10 PARTITION OF range_pa2 FOR VALUES from (1) TO (1000000); -CREATE TABLE range_list_2010_2020_10_20 PARTITION OF range_pa2 FOR VALUES from (1000000) TO (2000000); -``` - -### 插入示例数据 - -```sql:no-line-numbers -INSERT INTO range_list SELECT round(random()*8) + 1, '2005-01-01' FROM generate_series(1,100); -INSERT INTO range_list SELECT round(random()*8) + 1000000, '2005-01-01' FROM generate_series(1,100); -INSERT INTO range_list SELECT round(random()*8) + 1, '2019-01-01' FROM generate_series(1,100); -INSERT INTO range_list SELECT round(random()*8) + 1000000, '2019-01-01' FROM generate_series(1,100); -``` - -### 关闭多级分区表并行功能 - -```sql:no-line-numbers -SET polar_enable_px = ON; -SET polar_px_optimizer_multilevel_partitioning = OFF; -``` - -此时,虽然可以进行多级分区表的静态裁剪(只会扫描 `range_list_2000_2010_1_10` 这张分区表),但是并不能使用并行查询功能: - -```sql:no-line-numbers -EXPLAIN SELECT * FROM range_list 
WHERE a < 1000000 AND b < '2009-01-01 00:00:00'; - QUERY PLAN ----------------------------------------------------------------------------------------------- - Append (cost=0.00..26.18 rows=116 width=50) - -> Seq Scan on range_list_2000_2010_1_10 (cost=0.00..25.60 rows=116 width=50) - Filter: ((a < 1000000) AND (b < '2009-01-01 00:00:00'::timestamp without time zone)) -(3 rows) -``` - -### 开启多级分区表并行功能 - -```sql:no-line-numbers -SET polar_enable_px = ON; -SET polar_px_optimizer_multilevel_partitioning = ON; -``` - -此时,可以进行多级分区表的静态裁剪(只会扫描 `range_list_2000_2010_1_10` 这张分区表),同时也可以使用并行查询功能(6 个并行度): - -```sql -EXPLAIN SELECT count(*) FROM range_list WHERE a < 1000000 AND b < '2009-01-01 00:00:00'; - QUERY PLAN ----------------------------------------------------------------------------------------------------- - PX Coordinator 6:1 (slice1; segments: 6) (cost=0.00..431.00 rows=1 width=22) - -> Append (cost=0.00..431.00 rows=1 width=22) - -> Partial Seq Scan on range_list_2000_2010_1_10 (cost=0.00..431.00 rows=1 width=22) - Filter: ((a < 1000000) AND (b < '2009-01-01 00:00:00'::timestamp without time zone)) - Optimizer: PolarDB PX Optimizer -(5 rows) -``` diff --git a/docs/zh/features/v11/htap/parallel-dml.md b/docs/zh/features/v11/htap/parallel-dml.md deleted file mode 100644 index 57d7ac3f75b..00000000000 --- a/docs/zh/features/v11/htap/parallel-dml.md +++ /dev/null @@ -1,362 +0,0 @@ ---- -author: 渊云 -date: 2022/09/27 -minute: 60 ---- - -# 并行 DML - - - - - -[[toc]] - -## 背景介绍 - -PolarDB for PostgreSQL 提供了一款强大的分析型查询引擎——PX(Parallel eXecution),通过利用集群中多个只读节点来提升查询性能。同时,PX 针对 DML(`INSERT` / `UPDATE` / `DELETE`)也可以做到并行读并行写的加速。其中: - -- **并行读** 是指借助多个只读节点上的多进程来加速 DML 中的查找操作 -- **并行写** 是指在一个 PolarDB 唯一的读写节点上利用多进程实现并行写入 - -## 术语 - -- QC:Query Coordinator,发起 PX 并行查询的进程角色。 -- PX Worker:参与 PX 跨节点并行查询的工作进程角色。 -- DML:数据操作语句,包含 `INSERT` / `UPDATE` / `DELETE`。 -- Slice:指每个 PX Worker 负责执行的计划分片。 -- RW / RO:读写节点 / 只读节点。 - -## 功能介绍 - -### Parallel Insert - -为了加速 `INSERT ... SELECT ...` 这种既有读取又有写入的 DML SQL,PolarDB for PG 使用 Parallel Insert 来提升性能。对于 `SELECT` 子查询,PolarDB 使用多个 PX Worker 并行加速查询;对于 `INSERT` 的写入操作,由于 PolarDB 只有一个 RW 节点,我们会在 RW 节点上启动多个执行写入的 PX Worker 进程,通过 **Motion 算子** 来接收 RO 节点上读取的数据,实现加速并行写入。 - -这里需要注意的是,RO 节点上的 PX Worker 只能执行只读操作,但是在 RW 节点上的 PX Worker 可以执行写入操作。Parallel Insert 在读写数据量均衡的情况下,最高能提升 3 倍的性能。Parallel Insert 已支持: - -- 普通表 -- 分区表 -- 强制有序 -- 并行度动态调整 - -### Parallel Update - -与 Parallel Insert 类似,针对 `UPDATE ... SET ...`,PolarDB 使用多个 PX Worker 来执行并行查询,实现加速筛选需要更新的行;同时,在 RW 节点上启动多个 PX Worker 进程来执行更新操作。在读写数据量均衡的情况下,最高能提升 3 倍的性能。Parallel Update 不支持分区表,支持并行度动态调整。 - -### Parallel Delete - -与 Parallel Update 基本相同,针对 `DELETE FROM ...`,PolarDB 通过多个 PX Worker 来执行并行查询,实现加速筛选需要删除的行;同时,在 RW 节点启动多个 PX Worker 来执行删除操作。Parallel Delete 不支持分区表,支持并行度动态调整。 - -## 功能设计 - -### Parallel Insert - -Parallel Insert 的总体框架如下所示: - -![parallel_insert_arch](../../../imgs/parallel_insert_architecture.png) - -Parallel Insert 的处理步骤如下: - -1. QC 进程接收到 `INSERT ... SEELCT` -2. QC 进程对 SQL 进行解析、重写,生成查询树,通过 PX 优化器生成计划树 -3. 通过 bitmap 标志来指定每个 PX Worker 负责执行哪部分执行计划 -4. 将完整的计划树分发到 RO 节点和 RW 节点,并创建 PX Worker 进程,不同的 PX Workers 根据自己的 ID 在 bitmap 中查找自己负责执行的计划 -5. RO 节点上的 PX Workers 执行查询计划,从存储中并行读取各自负责的数据分片; -6. RO 节点上的 PX Workers 通过 Motion 算子将查询数据发送给 RW 节点上的 PX Workers; -7. 
-
-### Parallel Update
-
-Since SQL parsing and rewriting for Parallel Update and Parallel Delete work exactly as for Parallel Insert, only the plans and the data flow of Parallel Update are described below.
-
-A parallel Update plan without a subquery:
-
-```sql
-                                               QUERY PLAN
---------------------------------------------------------------------------------------------------------
- Update  (segment: 6) on public.t1
-   ->  Result
-         Output: t1_1.c1, t1_1.c2, (DMLAction), t1_1.ctid
-         ->  PX Hash 6:6  (slice1; segments: 6)
-               Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, (DMLAction), ('16397'::oid)
-               ->  Result
-                     Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, (DMLAction), '16397'::oid
-                     ->  Split
-                           Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, DMLAction
-                           ->  Partial Seq Scan on public.t1 t1_1
-                                 Output: t1_1.c1, t1_1.c2, 3, t1_1.ctid, t1_1._px_worker_id
- Optimizer: PolarDB PX Optimizer
-(12 rows)
-```
-
-The plan shows a Split operator between reading the data on the RO nodes and writing it on the RW node. The operator carries a `DMLAction` flag indicating the type of DML operation currently in progress (`DML_INSERT` / `DML_DELETE`). The Split operator splits an `UPDATE` into a `DELETE` phase and an `INSERT` phase, stating which rows to delete and which rows to insert.
-
-For an `UPDATE` plan with a subquery, a plan slice for the subquery is added alongside the write slice. For example:
-
-```sql
-                                                  QUERY PLAN
-------------------------------------------------------------------------------------------------------------
- Update  (segment: 6) on public.t1
-   ->  Result
-         Output: t1_1.c1, t1_1.c2, (DMLAction), t1_1.ctid
-         ->  PX Hash 6:6  (slice1; segments: 6)
-               Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, (DMLAction), ('16397'::oid)
-               ->  Result
-                     Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, (DMLAction), '16397'::oid
-                     ->  Split
-                           Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, DMLAction
-                           ->  Partial Seq Scan on public.t1 t1_1
-                                 Output: t1_1.c1, t1_1.c2, int4((SubPlan 1)), t1_1.ctid, t1_1._px_worker_id
-                                 SubPlan 1
-                                   ->  Materialize
-                                         Output: (count())
-                                         ->  PX Broadcast 1:6  (slice2)
-                                               Output: (count())
-                                               ->  Aggregate
-                                                     Output: count()
-                                                     ->  PX Coordinator 6:1  (slice3; segments: 6)
-                                                           ->  Partial Seq Scan on public.t2
- Optimizer: PolarDB PX Optimizer
-(21 rows)
-```
-
-The data flow of Parallel Update is shown below (a sketch of the Split step follows this section):
-
-![parallel_update_dataflow](../../../imgs/parallel_dml_update_dataflow.png)
-
-- Without a subquery, e.g. `UPDATE t1 SET c1=3`:
-  1. Each writing PX Worker searches in parallel for the rows to update
-  2. The Split operator splits them into `DELETE` and `INSERT` operations
-  3. `ExecDelete` and `ExecInsert` are executed
-- With a subquery, e.g. `UPDATE t1 SET c1=(SELECT COUNT(*) FROM t2)`:
-  1. Each reading PX Worker reads its own data shard from shared storage in parallel, then gathers what it has read to the QC process through `SendMotion`
-  2. The QC process broadcasts the data (the filter condition) to each writing PX Worker on the RW node
-  3. Each writing PX Worker scans its own data shard to find the rows to be updated
-  4. The Split operator splits them into `DELETE` and `INSERT` operations
-  5. `ExecDelete` and `ExecInsert` are executed
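A minimal C sketch of the Split step, illustrative only and not PolarDB source (`Row` and `EmitFn` are hypothetical stand-ins): each updated row flows onward twice, once per `DMLAction`.

```c
/*
 * The idea behind the Split operator: an UPDATE is decomposed into a
 * DELETE of the old tuple version and an INSERT of the new one, each
 * tagged with the DMLAction named in the plans above.
 */
typedef enum DMLAction
{
	DML_INSERT,
	DML_DELETE
} DMLAction;

typedef struct Row
{
	int			c1;
	int			c2;
} Row;

typedef void (*EmitFn) (const Row *row, DMLAction action);

static void
split_update(const Row *old_row, const Row *new_row, EmitFn emit)
{
	emit(old_row, DML_DELETE);	/* phase 1: delete the old version */
	emit(new_row, DML_INSERT);	/* phase 2: insert the updated version */
}
```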
-
-### Parallel Delete
-
-A parallel Delete plan without a subquery:
-
-```sql
-                                    QUERY PLAN
---------------------------------------------------------------------------------
- Delete  (segment: 6) on public.t1
-   ->  Result
-         Output: t1_1.c1, t1_1.c2, t1_1.ctid
-         ->  PX Hash 6:6  (slice1; segments: 6)
-               Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, (0)
-               ->  Partial Seq Scan on public.t1 t1_1
-                     Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, 0
-                     Filter: (t1_1.c1 < 10)
- Optimizer: PolarDB PX Optimizer
-(9 rows)
-```
-
-A parallel Delete plan with a subquery:
-
-```sql
-                                      QUERY PLAN
--------------------------------------------------------------------------------------
- Delete  (segment: 6) on public.t1
-   ->  Result
-         Output: t1_1.c1, t1_1.c2, t1_1.ctid
-         ->  PX Hash 6:6  (slice1; segments: 6)
-               Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, (0)
-               ->  Hash Semi Join
-                     Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, 0
-                     Hash Cond: (t1_1.c1 = t2.c1)
-                     ->  Partial Seq Scan on public.t1 t1_1
-                           Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id
-                     ->  Hash
-                           Output: t2.c1
-                           ->  Full Seq Scan on public.t2
-                                 Output: t2.c1
- Optimizer: PolarDB PX Optimizer
-(15 rows)
-```
-
-The numbers of reading and writing PX Workers can differ:
-
-```sql
-                                      QUERY PLAN
--------------------------------------------------------------------------------------
- Delete  (segment: 10) on public.t1
-   ->  Result
-         Output: t1_1.c1, t1_1.c2, t1_1.ctid
-         ->  PX Hash 6:10  (slice1; segments: 6)
-               Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, (0)
-               ->  Hash Semi Join
-                     Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id, 0
-                     Hash Cond: (t1_1.c1 = t2.c1)
-                     ->  Partial Seq Scan on public.t1 t1_1
-                           Output: t1_1.c1, t1_1.c2, t1_1.ctid, t1_1._px_worker_id
-                     ->  Hash
-                           Output: t2.c1
-                           ->  Full Seq Scan on public.t2
-                                 Output: t2.c1
- Optimizer: PolarDB PX Optimizer
-(15 rows)
-```
-
-As can be seen, the Parallel Delete plan resembles Parallel Update, with two differences:
-
-1. Since Parallel Delete only deletes and never inserts, no Split operator is needed
-2. The top-level DML operator changes from Update to Delete
-
-The data flow of parallel Delete:
-
-![parallel_dml_delete_dataflow](../../../imgs/parallel_dml_delete_dataflow.png)
-
-1. Each reading PX Worker scans its own data shard and finds the rows to delete
-2. The rows to be deleted are shipped through the Motion operator to the writing PX Workers, which execute the Delete in parallel
-
-## Usage
-
-### Parallel Insert
-
-Parallel Insert is off by default and must be switched on:
-
-```sql:no-line-numbers
--- PX must be enabled before using Parallel Insert
-SET polar_enable_px = ON;
-
--- enable Parallel Insert
-SET polar_px_enable_insert_select = ON;
-
--- enable Parallel Insert into partitioned tables, off by default
-SET polar_px_enable_insert_partition_table = ON;
-
--- write parallelism; defaults to 6, i.e. 6 PX Workers perform the write on the RW node
-SET polar_px_insert_dop_num = 6;
-
--- allow tableless queries as the source, off by default
-SET polar_px_enable_insert_from_tableless = ON;
-```
-
-Since Parallel Insert cannot guarantee write order, the following switch forces ordered results:
-
-```sql:no-line-numbers
--- on by default; when off, parallel Insert results are not guaranteed to be ordered
-SET polar_px_enable_insert_order_sensitive = ON;
-```
-
-### Parallel Update
-
-The parameter `polar_px_enable_update` controls whether Parallel Update is enabled; it is off by default.
-
-```sql:no-line-numbers
-SET polar_px_enable_update = ON;
-```
-
-The parameter `polar_px_update_dop_num` controls the write parallelism of Parallel Update. It defaults to `6` and ranges over `1~128`.
-
-```sql:no-line-numbers
--- start 6 PX Workers to perform the writes
-SET polar_px_update_dop_num = 6;
-```
-
-### Parallel Delete
-
-The parameter `polar_px_enable_delete` controls whether Parallel Delete is enabled; it is off by default.
-
-```sql:no-line-numbers
-SET polar_px_enable_delete = ON;
-```
-
-The parameter `polar_px_delete_dop_num` controls the write parallelism of Parallel Delete. It defaults to `6` and ranges over `1~128`.
-
-```sql:no-line-numbers
--- start 6 PX Workers to perform the deletes
-SET polar_px_delete_dop_num = 6;
-```
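For completeness, the same session setup can be scripted from a C client through libpq; this is a hedged example with a placeholder connection string, using only the GUC names documented above:

```c
/*
 * Enable the PX parallel-insert GUCs on one session, then the session is
 * ready to issue a parallel INSERT ... SELECT.
 */
#include <stdio.h>
#include <libpq-fe.h>

int
main(void)
{
	const char *setup[] = {
		"SET polar_enable_px = ON",
		"SET polar_px_enable_insert_select = ON",
		"SET polar_px_insert_dop_num = 6",
	};
	PGconn	   *conn = PQconnectdb("dbname=postgres");	/* placeholder DSN */

	if (PQstatus(conn) != CONNECTION_OK)
	{
		fprintf(stderr, "connection failed: %s", PQerrorMessage(conn));
		PQfinish(conn);
		return 1;
	}

	for (int i = 0; i < 3; i++)
	{
		PGresult   *res = PQexec(conn, setup[i]);

		if (PQresultStatus(res) != PGRES_COMMAND_OK)
			fprintf(stderr, "\"%s\" failed: %s", setup[i], PQerrorMessage(conn));
		PQclear(res);
	}

	/* an INSERT ... SELECT issued on this session may now run through PX */
	PQfinish(conn);
	return 0;
}
```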
-
-## Performance
-
-PDML performance is briefly summarized below.
-
-### Parallel Insert
-
-With equal read and write volumes and 75 GB of total data, Parallel Insert performs as follows:
-
-![parallel_dml_insert_result_equal](../../../imgs/parallel_dml_insert_result_1.png)
-
-When reads far outweigh writes, with 75 GB of total data and the write volume at 0.25% of the read volume, Parallel Insert performs as follows:
-
-![parallel_dml_insert_result_](../../../imgs/parallel_dml_insert_result_read.png)
-
-The two charts show that:
-
-1. With equal read and write volumes, Parallel Insert improves performance by up to 3x
-2. The larger the read volume, the larger the gain, up to roughly 4x
-3. Raising the write parallelism helps little, mainly because PX Workers must write on the RW node, where the table extension lock becomes the bottleneck
-
-### Parallel Update
-
-Parallel Update with equal read and write volumes and 75 GB of total data:
-
-![parallel_dml_update_result](../../../imgs/parallel_dml_update_result.png)
-
-Parallel Update when reads far outweigh writes, at a 100:1 read/write ratio:
-
-![parallel_dml_update_read_result](../../../imgs/parallel_dml_update_read_result.png)
-
-These two charts show that:
-
-1. With equal read and write volumes, Parallel Update improves performance by up to 3x
-2. The larger the read volume, the larger the gain, reaching up to 10x
-3. Raising the write parallelism helps little, for the same reason as above
-
-### Parallel Delete
-
-Parallel Delete performs much like Parallel Update, with the same conclusions, and is not covered again here.
diff --git a/docs/zh/imgs/cluster_info_generate.png b/docs/zh/imgs/cluster_info_generate.png
new file mode 100644
index 00000000000..2442fbd75af
Binary files /dev/null and b/docs/zh/imgs/cluster_info_generate.png differ
diff --git a/docs/zh/imgs/pr_parallel_execute_1.png b/docs/zh/imgs/pr_parallel_execute_1.png
new file mode 100644
index 00000000000..0861d45b981
Binary files /dev/null and b/docs/zh/imgs/pr_parallel_execute_1.png differ
diff --git a/docs/zh/imgs/pr_parallel_execute_2.png b/docs/zh/imgs/pr_parallel_execute_2.png
new file mode 100644
index 00000000000..a505d26040b
Binary files /dev/null and b/docs/zh/imgs/pr_parallel_execute_2.png differ
diff --git a/docs/zh/imgs/pr_parallel_execute_dispatcher.png b/docs/zh/imgs/pr_parallel_execute_dispatcher.png
new file mode 100644
index 00000000000..77273b49879
Binary files /dev/null and b/docs/zh/imgs/pr_parallel_execute_dispatcher.png differ
diff --git a/docs/zh/imgs/pr_parallel_execute_procs_1.png b/docs/zh/imgs/pr_parallel_execute_procs_1.png
new file mode 100644
index 00000000000..759689ee109
Binary files /dev/null and b/docs/zh/imgs/pr_parallel_execute_procs_1.png differ
diff --git a/docs/zh/imgs/pr_parallel_execute_procs_2.png b/docs/zh/imgs/pr_parallel_execute_procs_2.png
new file mode 100644
index 00000000000..f8a79ce0a39
Binary files /dev/null and b/docs/zh/imgs/pr_parallel_execute_procs_2.png differ
diff --git a/docs/zh/imgs/pr_parallel_execute_procs_3.png b/docs/zh/imgs/pr_parallel_execute_procs_3.png
new file mode 100644
index 00000000000..466184289ba
Binary files /dev/null and b/docs/zh/imgs/pr_parallel_execute_procs_3.png differ
diff --git a/docs/zh/imgs/pr_parallel_execute_task.png b/docs/zh/imgs/pr_parallel_execute_task.png
new file mode 100644
index 00000000000..e187b55934f
Binary files /dev/null and b/docs/zh/imgs/pr_parallel_execute_task.png differ
diff --git a/docs/zh/imgs/pr_parallel_replay_1.png b/docs/zh/imgs/pr_parallel_replay_1.png
new file mode 100644
index 00000000000..db3c6f0685b
Binary files /dev/null and b/docs/zh/imgs/pr_parallel_replay_1.png differ
diff --git a/docs/zh/imgs/pr_parallel_replay_2.png b/docs/zh/imgs/pr_parallel_replay_2.png
new file mode 100644
index 00000000000..21d0705fe6a
Binary files /dev/null and b/docs/zh/imgs/pr_parallel_replay_2.png differ
diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c
index 57021f6ca1a..5bc29991311 100644
--- a/src/backend/access/common/bufmask.c
+++ b/src/backend/access/common/bufmask.c
@@ -78,7 +78,7 @@ mask_unused_space(Page page)
 	if (pd_lower > pd_upper || pd_special < pd_upper ||
 		pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ)
 	{
-		elog(ERROR, "invalid page pd_lower %u pd_upper %u pd_special %u\n",
+		elog(ERROR, "invalid page pd_lower %u pd_upper %u pd_special %u",
 			 pd_lower, pd_upper, pd_special);
 	}
diff --git a/src/backend/jit/README b/src/backend/jit/README
index e2fac8558e8..5427bdf2153 100644
--- a/src/backend/jit/README
+++ b/src/backend/jit/README
@@ -10,11 +10,11 @@ SQL expressions to evaluate an SQL predicate like WHERE a.col = 3, it
 is possible to generate a function than can be natively executed by
 the CPU that just handles that expression, yielding a speedup.
 
-That this is done at query execution time, possibly even only in cases
-where the relevant task is done a number of times, makes it JIT,
-rather than ahead-of-time (AOT). Given the way JIT compilation is used
-in PostgreSQL, the lines between interpretation, AOT and JIT are
-somewhat blurry.
+This is JIT, rather than ahead-of-time (AOT) compilation, because it
+is done at query execution time, and perhaps only in cases where the
+relevant task is repeated a number of times. Given the way JIT
+compilation is used in PostgreSQL, the lines between interpretation,
+AOT and JIT are somewhat blurry.
 
 Note that the interpreted program turned into a native program does
 not necessarily have to be a program in the classical sense. E.g. it
@@ -99,7 +99,7 @@ Lifetimes of JITed functions are managed via JITContext. Exactly one
 such context should be created for work in which all created JITed
 function should have the same lifetime. E.g. there's exactly one
 JITContext for each query executed, in the query's EState. Only the
-release of an JITContext is exposed to the provider independent
+release of a JITContext is exposed to the provider independent
 facility, as the creation of one is done on-demand by the JIT
 implementations.
 
@@ -231,7 +231,7 @@ needs to be referenced as an offset to one block of memory stored in
 an ExprState, rather than absolute pointers into memory. Once that is
 addressed, adding an LRU cache that's keyed by the
-generated LLVM IR will allow to use optimized functions even for
+generated LLVM IR will allow the usage of optimized functions even for
 faster queries.
 
 A longer term project is to move expression compilation to the planner
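The JITContext lifetime rule described in the hunk above can be pictured with a short sketch; this is a simplification with hypothetical names (`QueryJitContext`, `release_jit_handle`), not PostgreSQL's actual structs:

```c
/*
 * One context per query owns every function emitted for that query;
 * releasing the context releases all of the emitted code at once.
 */
#include <stdlib.h>

typedef struct JitHandleCell
{
	void	   *handle;			/* one emitted module's worth of code */
	struct JitHandleCell *next;
} JitHandleCell;

typedef struct QueryJitContext
{
	JitHandleCell *handles;		/* all code emitted for this query */
} QueryJitContext;

/* hypothetical provider hook that frees one module's native code */
static void
release_jit_handle(void *handle)
{
	(void) handle;
}

static void
release_query_jit_context(QueryJitContext *ctx)
{
	while (ctx->handles)
	{
		JitHandleCell *cell = ctx->handles;

		ctx->handles = cell->next;
		release_jit_handle(cell->handle);
		free(cell);
	}
}
```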
diff --git a/src/backend/jit/llvm/Makefile b/src/backend/jit/llvm/Makefile
index e2db4cea65c..0d22279578c 100644
--- a/src/backend/jit/llvm/Makefile
+++ b/src/backend/jit/llvm/Makefile
@@ -22,6 +22,12 @@ endif
 PGFILEDESC = "llvmjit - JIT using LLVM"
 NAME = llvmjit
 
+# LLVM 14 produces deprecation warnings. We'll need to make some changes
+# before the relevant functions are removed, but for now silence the warnings.
+ifeq ($(GCC), yes)
+LLVM_CFLAGS += -Wno-deprecated-declarations
+endif
+
 # All files in this directory use LLVM.
 CFLAGS += $(LLVM_CFLAGS)
 CXXFLAGS += $(LLVM_CXXFLAGS)
diff --git a/src/backend/jit/llvm/llvmjit.c b/src/backend/jit/llvm/llvmjit.c
index 7510698f863..dc3efd27460 100644
--- a/src/backend/jit/llvm/llvmjit.c
+++ b/src/backend/jit/llvm/llvmjit.c
@@ -29,7 +29,13 @@
 #include <llvm-c/BitWriter.h>
 #include <llvm-c/Core.h>
 #include <llvm-c/ExecutionEngine.h>
+#if LLVM_VERSION_MAJOR > 11
+#include <llvm-c/LLJIT.h>
+#include <llvm-c/Orc.h>
+#include <llvm-c/OrcEE.h>
+#else
 #include <llvm-c/OrcBindings.h>
+#endif
 #include <llvm-c/Support.h>
 #include <llvm-c/Target.h>
 #include <llvm-c/Transforms/IPO.h>
@@ -43,8 +49,13 @@
 
 /* Handle of a module emitted via ORC JIT */
 typedef struct LLVMJitHandle
 {
+#if LLVM_VERSION_MAJOR > 11
+	LLVMOrcLLJITRef lljit;
+	LLVMOrcResourceTrackerRef resource_tracker;
+#else
 	LLVMOrcJITStackRef stack;
 	LLVMOrcModuleHandle orc_handle;
+#endif
 } LLVMJitHandle;
 
@@ -94,12 +105,15 @@
 static const char *llvm_triple = NULL;
 static const char *llvm_layout = NULL;
 
-static LLVMTargetMachineRef llvm_opt0_targetmachine;
-static LLVMTargetMachineRef llvm_opt3_targetmachine;
-
 static LLVMTargetRef llvm_targetref;
+#if LLVM_VERSION_MAJOR > 11
+static LLVMOrcThreadSafeContextRef llvm_ts_context;
+static LLVMOrcLLJITRef llvm_opt0_orc;
+static LLVMOrcLLJITRef llvm_opt3_orc;
+#else							/* LLVM_VERSION_MAJOR > 11 */
 static LLVMOrcJITStackRef llvm_opt0_orc;
 static LLVMOrcJITStackRef llvm_opt3_orc;
+#endif							/* LLVM_VERSION_MAJOR > 11 */
 
 static void llvm_release_context(JitContext *context);
@@ -111,6 +125,10 @@ static void llvm_optimize_module(LLVMJitContext *context,
 								 LLVMModuleRef module);
 static void llvm_create_types(void);
 static uint64_t llvm_resolve_symbol(const char *name, void *ctx);
+#if LLVM_VERSION_MAJOR > 11
+static LLVMOrcLLJITRef llvm_create_jit_instance(LLVMTargetMachineRef tm);
+static char *llvm_error_message(LLVMErrorRef error);
+#endif							/* LLVM_VERSION_MAJOR > 11 */
 
 PG_MODULE_MAGIC;
@@ -163,32 +181,57 @@ llvm_release_context(JitContext *context)
 {
 	LLVMJitContext *llvm_context = (LLVMJitContext *) context;
 
-	llvm_enter_fatal_on_oom();
-
 	/*
 	 * When this backend is exiting, don't clean up LLVM. As an error might
 	 * have occurred from within LLVM, we do not want to risk reentering. All
 	 * resource cleanup is going to happen through process exit.
 	 */
-	if (!proc_exit_inprogress)
+	if (proc_exit_inprogress)
+		return;
+
+	llvm_enter_fatal_on_oom();
+
+	if (llvm_context->module)
 	{
-		if (llvm_context->module)
-		{
-			LLVMDisposeModule(llvm_context->module);
-			llvm_context->module = NULL;
-		}
+		LLVMDisposeModule(llvm_context->module);
+		llvm_context->module = NULL;
+	}
 
-		while (llvm_context->handles != NIL)
-		{
-			LLVMJitHandle *jit_handle;
+	while (llvm_context->handles != NIL)
+	{
+		LLVMJitHandle *jit_handle;
 
-			jit_handle = (LLVMJitHandle *) linitial(llvm_context->handles);
-			llvm_context->handles = list_delete_first(llvm_context->handles);
+		jit_handle = (LLVMJitHandle *) linitial(llvm_context->handles);
+		llvm_context->handles = list_delete_first(llvm_context->handles);
+
+#if LLVM_VERSION_MAJOR > 11
+		{
+			LLVMOrcExecutionSessionRef ee;
+			LLVMOrcSymbolStringPoolRef sp;
+
+			LLVMOrcResourceTrackerRemove(jit_handle->resource_tracker);
+			LLVMOrcReleaseResourceTracker(jit_handle->resource_tracker);
+
+			/*
+			 * Without triggering cleanup of the string pool, we'd leak
+			 * memory. It'd be sufficient to do this far less often, but in
+			 * experiments the required time was small enough to just always
+			 * do it.
+ */ + ee = LLVMOrcLLJITGetExecutionSession(jit_handle->lljit); + sp = LLVMOrcExecutionSessionGetSymbolStringPool(ee); + LLVMOrcSymbolStringPoolClearDeadEntries(sp); + } +#else /* LLVM_VERSION_MAJOR > 11 */ + { LLVMOrcRemoveModule(jit_handle->stack, jit_handle->orc_handle); - pfree(jit_handle); } +#endif /* LLVM_VERSION_MAJOR > 11 */ + + pfree(jit_handle); } + + llvm_leave_fatal_on_oom(); } /* @@ -243,8 +286,8 @@ llvm_expand_funcname(struct LLVMJitContext *context, const char *basename) void * llvm_get_function(LLVMJitContext *context, const char *funcname) { - LLVMOrcTargetAddress addr = 0; -#if defined(HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN) && HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN +#if LLVM_VERSION_MAJOR > 11 || \ + defined(HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN) && HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN ListCell *lc; #endif @@ -264,10 +307,41 @@ llvm_get_function(LLVMJitContext *context, const char *funcname) * to mangle here. */ -#if defined(HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN) && HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN +#if LLVM_VERSION_MAJOR > 11 foreach(lc, context->handles) { LLVMJitHandle *handle = (LLVMJitHandle *) lfirst(lc); + instr_time starttime; + instr_time endtime; + LLVMErrorRef error; + LLVMOrcJITTargetAddress addr; + + INSTR_TIME_SET_CURRENT(starttime); + + addr = 0; + error = LLVMOrcLLJITLookup(handle->lljit, &addr, funcname); + if (error) + elog(ERROR, "failed to look up symbol \"%s\": %s", + funcname, llvm_error_message(error)); + + /* + * LLJIT only actually emits code the first time a symbol is + * referenced. Thus add lookup time to emission time. That's counting + * a bit more than with older LLVM versions, but unlikely to ever + * matter. + */ + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(context->base.instr.emission_counter, + endtime, starttime); + + if (addr) + return (void *) (uintptr_t) addr; + } +#elif defined(HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN) && HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN + foreach(lc, context->handles) + { + LLVMOrcTargetAddress addr; + LLVMJitHandle *handle = (LLVMJitHandle *) lfirst(lc); addr = 0; if (LLVMOrcGetSymbolAddressIn(handle->stack, &addr, handle->orc_handle, funcname)) @@ -275,26 +349,29 @@ llvm_get_function(LLVMJitContext *context, const char *funcname) if (addr) return (void *) (uintptr_t) addr; } +#elif LLVM_VERSION_MAJOR < 5 + { + LLVMOrcTargetAddress addr; + if ((addr = LLVMOrcGetSymbolAddress(llvm_opt0_orc, funcname))) + return (void *) (uintptr_t) addr; + if ((addr = LLVMOrcGetSymbolAddress(llvm_opt3_orc, funcname))) + return (void *) (uintptr_t) addr; + } #else + { + LLVMOrcTargetAddress addr; -#if LLVM_VERSION_MAJOR < 5 - if ((addr = LLVMOrcGetSymbolAddress(llvm_opt0_orc, funcname))) - return (void *) (uintptr_t) addr; - if ((addr = LLVMOrcGetSymbolAddress(llvm_opt3_orc, funcname))) - return (void *) (uintptr_t) addr; -#else - if (LLVMOrcGetSymbolAddress(llvm_opt0_orc, &addr, funcname)) - elog(ERROR, "failed to look up symbol \"%s\"", funcname); - if (addr) - return (void *) (uintptr_t) addr; - if (LLVMOrcGetSymbolAddress(llvm_opt3_orc, &addr, funcname)) - elog(ERROR, "failed to look up symbol \"%s\"", funcname); - if (addr) - return (void *) (uintptr_t) addr; -#endif /* LLVM_VERSION_MAJOR */ - -#endif /* HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN */ + if (LLVMOrcGetSymbolAddress(llvm_opt0_orc, &addr, funcname)) + elog(ERROR, "failed to look up symbol \"%s\"", funcname); + if (addr) + return (void *) (uintptr_t) addr; + if (LLVMOrcGetSymbolAddress(llvm_opt3_orc, &addr, funcname)) + elog(ERROR, "failed to look up symbol \"%s\"", 
funcname); + if (addr) + return (void *) (uintptr_t) addr; + } +#endif elog(ERROR, "failed to JIT: %s", funcname); @@ -327,26 +404,55 @@ llvm_get_decl(LLVMModuleRef mod, LLVMValueRef v_src) } /* - * Copy attributes from one function to another. + * Copy attributes from one function to another, for a specific index (an + * index can reference return value, function and parameter attributes). */ -void -llvm_copy_attributes(LLVMValueRef v_from, LLVMValueRef v_to) +static void +llvm_copy_attributes_at_index(LLVMValueRef v_from, LLVMValueRef v_to, uint32 index) { int num_attributes; - int attno; LLVMAttributeRef *attrs; + int attno; - num_attributes = - LLVMGetAttributeCountAtIndex(v_from, LLVMAttributeFunctionIndex); + num_attributes = LLVMGetAttributeCountAtIndexPG(v_from, index); + + /* + * Not just for efficiency: LLVM <= 3.9 crashes when + * LLVMGetAttributesAtIndex() is called for an index with 0 attributes. + */ + if (num_attributes == 0) + return; attrs = palloc(sizeof(LLVMAttributeRef) * num_attributes); - LLVMGetAttributesAtIndex(v_from, LLVMAttributeFunctionIndex, attrs); + LLVMGetAttributesAtIndex(v_from, index, attrs); for (attno = 0; attno < num_attributes; attno++) - { - LLVMAddAttributeAtIndex(v_to, LLVMAttributeFunctionIndex, - attrs[attno]); - } + LLVMAddAttributeAtIndex(v_to, index, attrs[attno]); + + pfree(attrs); +} + +/* + * Copy all attributes from one function to another. I.e. function, return and + * parameters will be copied. + */ +void +llvm_copy_attributes(LLVMValueRef v_from, LLVMValueRef v_to) +{ + uint32 param_count; + int paramidx; + + /* copy function attributes */ + llvm_copy_attributes_at_index(v_from, v_to, LLVMAttributeFunctionIndex); + + /* and the return value attributes */ + llvm_copy_attributes_at_index(v_from, v_to, LLVMAttributeReturnIndex); + + /* and each function parameter's attribute */ + param_count = LLVMCountParams(v_from); + + for (paramidx = 1; paramidx <= param_count; paramidx++) + llvm_copy_attributes_at_index(v_from, v_to, paramidx); } /* @@ -396,6 +502,8 @@ llvm_function_reference(LLVMJitContext *context, v_fn = LLVMAddGlobal(mod, TypePGFunction, funcname); LLVMSetInitializer(v_fn, v_fn_addr); LLVMSetGlobalConstant(v_fn, true); + LLVMSetLinkage(v_fn, LLVMPrivateLinkage); + LLVMSetUnnamedAddr(v_fn, true); return LLVMBuildLoad(builder, v_fn, ""); } @@ -487,11 +595,15 @@ llvm_optimize_module(LLVMJitContext *context, LLVMModuleRef module) static void llvm_compile_module(LLVMJitContext *context) { - LLVMOrcModuleHandle orc_handle; + LLVMJitHandle *handle; MemoryContext oldcontext; - static LLVMOrcJITStackRef compile_orc; instr_time starttime; instr_time endtime; +#if LLVM_VERSION_MAJOR > 11 + LLVMOrcLLJITRef compile_orc; +#else + LLVMOrcJITStackRef compile_orc; +#endif if (context->base.flags & PGJIT_OPT3) compile_orc = llvm_opt3_orc; @@ -538,6 +650,9 @@ llvm_compile_module(LLVMJitContext *context) pfree(filename); } + handle = (LLVMJitHandle *) + MemoryContextAlloc(TopMemoryContext, sizeof(LLVMJitHandle)); + /* * Emit the code. Note that this can, depending on the optimization * settings, take noticeable resources as code emission executes low-level @@ -545,13 +660,42 @@ llvm_compile_module(LLVMJitContext *context) * faster instruction selection mechanism is used. 
*/ INSTR_TIME_SET_CURRENT(starttime); -#if LLVM_VERSION_MAJOR > 6 +#if LLVM_VERSION_MAJOR > 11 + { + LLVMOrcThreadSafeModuleRef ts_module; + LLVMErrorRef error; + LLVMOrcJITDylibRef jd = LLVMOrcLLJITGetMainJITDylib(compile_orc); + + ts_module = LLVMOrcCreateNewThreadSafeModule(context->module, llvm_ts_context); + + handle->lljit = compile_orc; + handle->resource_tracker = LLVMOrcJITDylibCreateResourceTracker(jd); + + /* + * NB: This doesn't actually emit code. That happens lazily the first + * time a symbol defined in the module is requested. Due to that + * llvm_get_function() also accounts for emission time. + */ + + context->module = NULL; /* will be owned by LLJIT */ + error = LLVMOrcLLJITAddLLVMIRModuleWithRT(compile_orc, + handle->resource_tracker, + ts_module); + + if (error) + elog(ERROR, "failed to JIT module: %s", + llvm_error_message(error)); + + handle->lljit = compile_orc; + + /* LLVMOrcLLJITAddLLVMIRModuleWithRT takes ownership of the module */ + } +#elif LLVM_VERSION_MAJOR > 6 { - if (LLVMOrcAddEagerlyCompiledIR(compile_orc, &orc_handle, context->module, + handle->stack = compile_orc; + if (LLVMOrcAddEagerlyCompiledIR(compile_orc, &handle->orc_handle, context->module, llvm_resolve_symbol, NULL)) - { elog(ERROR, "failed to JIT module"); - } /* LLVMOrcAddEagerlyCompiledIR takes ownership of the module */ } @@ -560,20 +704,23 @@ llvm_compile_module(LLVMJitContext *context) LLVMSharedModuleRef smod; smod = LLVMOrcMakeSharedModule(context->module); - if (LLVMOrcAddEagerlyCompiledIR(compile_orc, &orc_handle, smod, + handle->stack = compile_orc; + if (LLVMOrcAddEagerlyCompiledIR(compile_orc, &handle->orc_handle, smod, llvm_resolve_symbol, NULL)) - { elog(ERROR, "failed to JIT module"); - } + LLVMOrcDisposeSharedModuleRef(smod); } #else /* LLVM 4.0 and 3.9 */ { - orc_handle = LLVMOrcAddEagerlyCompiledIR(compile_orc, context->module, - llvm_resolve_symbol, NULL); + handle->stack = compile_orc; + handle->orc_handle = LLVMOrcAddEagerlyCompiledIR(compile_orc, context->module, + llvm_resolve_symbol, NULL); + LLVMDisposeModule(context->module); } #endif + INSTR_TIME_SET_CURRENT(endtime); INSTR_TIME_ACCUM_DIFF(context->base.instr.emission_counter, endtime, starttime); @@ -583,15 +730,7 @@ llvm_compile_module(LLVMJitContext *context) /* remember emitted code for cleanup and lookups */ oldcontext = MemoryContextSwitchTo(TopMemoryContext); - { - LLVMJitHandle *handle; - - handle = (LLVMJitHandle *) palloc(sizeof(LLVMJitHandle)); - handle->stack = compile_orc; - handle->orc_handle = orc_handle; - - context->handles = lappend(context->handles, handle); - } + context->handles = lappend(context->handles, handle); MemoryContextSwitchTo(oldcontext); ereport(DEBUG1, @@ -613,6 +752,8 @@ llvm_session_initialize(void) char *error = NULL; char *cpu = NULL; char *features = NULL; + LLVMTargetMachineRef opt0_tm; + LLVMTargetMachineRef opt3_tm; if (llvm_session_initialized) return; @@ -623,6 +764,16 @@ llvm_session_initialize(void) LLVMInitializeNativeAsmPrinter(); LLVMInitializeNativeAsmParser(); + /* + * When targeting an LLVM version with opaque pointers enabled by + * default, turn them off for the context we build our code in. We don't + * need to do so for other contexts (e.g. llvm_ts_context). Once the IR is + * generated, it carries the necessary information. + */ +#if LLVM_VERSION_MAJOR > 14 + LLVMContextSetOpaquePointers(LLVMGetGlobalContext(), false); +#endif + /* * Synchronize types early, as that also includes inferring the target * triple. 
@@ -631,7 +782,7 @@ llvm_session_initialize(void) if (LLVMGetTargetFromTriple(llvm_triple, &llvm_targetref, &error) != 0) { - elog(FATAL, "failed to query triple %s\n", error); + elog(FATAL, "failed to query triple %s", error); } /* @@ -645,12 +796,12 @@ llvm_session_initialize(void) elog(DEBUG2, "LLVMJIT detected CPU \"%s\", with features \"%s\"", cpu, features); - llvm_opt0_targetmachine = + opt0_tm = LLVMCreateTargetMachine(llvm_targetref, llvm_triple, cpu, features, LLVMCodeGenLevelNone, LLVMRelocDefault, LLVMCodeModelJITDefault); - llvm_opt3_targetmachine = + opt3_tm = LLVMCreateTargetMachine(llvm_targetref, llvm_triple, cpu, features, LLVMCodeGenLevelAggressive, LLVMRelocDefault, @@ -664,27 +815,41 @@ llvm_session_initialize(void) /* force symbols in main binary to be loaded */ LLVMLoadLibraryPermanently(NULL); - llvm_opt0_orc = LLVMOrcCreateInstance(llvm_opt0_targetmachine); - llvm_opt3_orc = LLVMOrcCreateInstance(llvm_opt3_targetmachine); - -#if defined(HAVE_DECL_LLVMCREATEGDBREGISTRATIONLISTENER) && HAVE_DECL_LLVMCREATEGDBREGISTRATIONLISTENER - if (jit_debugging_support) +#if LLVM_VERSION_MAJOR > 11 { - LLVMJITEventListenerRef l = LLVMCreateGDBRegistrationListener(); + llvm_ts_context = LLVMOrcCreateNewThreadSafeContext(); - LLVMOrcRegisterJITEventListener(llvm_opt0_orc, l); - LLVMOrcRegisterJITEventListener(llvm_opt3_orc, l); + llvm_opt0_orc = llvm_create_jit_instance(opt0_tm); + opt0_tm = 0; + + llvm_opt3_orc = llvm_create_jit_instance(opt3_tm); + opt3_tm = 0; } +#else /* LLVM_VERSION_MAJOR > 11 */ + { + llvm_opt0_orc = LLVMOrcCreateInstance(opt0_tm); + llvm_opt3_orc = LLVMOrcCreateInstance(opt3_tm); + +#if defined(HAVE_DECL_LLVMCREATEGDBREGISTRATIONLISTENER) && HAVE_DECL_LLVMCREATEGDBREGISTRATIONLISTENER + if (jit_debugging_support) + { + LLVMJITEventListenerRef l = LLVMCreateGDBRegistrationListener(); + + LLVMOrcRegisterJITEventListener(llvm_opt0_orc, l); + LLVMOrcRegisterJITEventListener(llvm_opt3_orc, l); + } #endif #if defined(HAVE_DECL_LLVMCREATEPERFJITEVENTLISTENER) && HAVE_DECL_LLVMCREATEPERFJITEVENTLISTENER - if (jit_profiling_support) - { - LLVMJITEventListenerRef l = LLVMCreatePerfJITEventListener(); + if (jit_profiling_support) + { + LLVMJITEventListenerRef l = LLVMCreatePerfJITEventListener(); - LLVMOrcRegisterJITEventListener(llvm_opt0_orc, l); - LLVMOrcRegisterJITEventListener(llvm_opt3_orc, l); - } + LLVMOrcRegisterJITEventListener(llvm_opt0_orc, l); + LLVMOrcRegisterJITEventListener(llvm_opt3_orc, l); + } #endif + } +#endif /* LLVM_VERSION_MAJOR > 11 */ before_shmem_exit(llvm_shutdown, 0); @@ -696,27 +861,63 @@ llvm_session_initialize(void) static void llvm_shutdown(int code, Datum arg) { - /* unregister profiling support, needs to be flushed to be useful */ + /* + * If llvm_shutdown() is reached while in a fatal-on-oom section an error + * has occurred in the middle of LLVM code. It is not safe to call back + * into LLVM (which is why a FATAL error was thrown). + * + * We do need to shutdown LLVM in other shutdown cases, otherwise + * e.g. profiling data won't be written out. 
+ */ + if (llvm_in_fatal_on_oom()) + { + Assert(proc_exit_inprogress); + return; + } - if (llvm_opt3_orc) +#if LLVM_VERSION_MAJOR > 11 + { + if (llvm_opt3_orc) + { + LLVMOrcDisposeLLJIT(llvm_opt3_orc); + llvm_opt3_orc = NULL; + } + if (llvm_opt0_orc) + { + LLVMOrcDisposeLLJIT(llvm_opt0_orc); + llvm_opt0_orc = NULL; + } + if (llvm_ts_context) + { + LLVMOrcDisposeThreadSafeContext(llvm_ts_context); + llvm_ts_context = NULL; + } + } +#else /* LLVM_VERSION_MAJOR > 11 */ { + /* unregister profiling support, needs to be flushed to be useful */ + + if (llvm_opt3_orc) + { #if defined(HAVE_DECL_LLVMORCREGISTERPERF) && HAVE_DECL_LLVMORCREGISTERPERF - if (jit_profiling_support) - LLVMOrcUnregisterPerf(llvm_opt3_orc); + if (jit_profiling_support) + LLVMOrcUnregisterPerf(llvm_opt3_orc); #endif - LLVMOrcDisposeInstance(llvm_opt3_orc); - llvm_opt3_orc = NULL; - } + LLVMOrcDisposeInstance(llvm_opt3_orc); + llvm_opt3_orc = NULL; + } - if (llvm_opt0_orc) - { + if (llvm_opt0_orc) + { #if defined(HAVE_DECL_LLVMORCREGISTERPERF) && HAVE_DECL_LLVMORCREGISTERPERF - if (jit_profiling_support) - LLVMOrcUnregisterPerf(llvm_opt0_orc); + if (jit_profiling_support) + LLVMOrcUnregisterPerf(llvm_opt0_orc); #endif - LLVMOrcDisposeInstance(llvm_opt0_orc); - llvm_opt0_orc = NULL; + LLVMOrcDisposeInstance(llvm_opt0_orc); + llvm_opt0_orc = NULL; + } } +#endif /* LLVM_VERSION_MAJOR > 11 */ } /* helper for llvm_create_types, returning a global var's type */ @@ -912,3 +1113,156 @@ llvm_resolve_symbol(const char *symname, void *ctx) return (uint64_t) addr; } + +#if LLVM_VERSION_MAJOR > 11 + +static LLVMErrorRef +llvm_resolve_symbols(LLVMOrcDefinitionGeneratorRef GeneratorObj, void *Ctx, + LLVMOrcLookupStateRef *LookupState, LLVMOrcLookupKind Kind, + LLVMOrcJITDylibRef JD, LLVMOrcJITDylibLookupFlags JDLookupFlags, + LLVMOrcCLookupSet LookupSet, size_t LookupSetSize) +{ +#if LLVM_VERSION_MAJOR > 14 + LLVMOrcCSymbolMapPairs symbols = palloc0(sizeof(LLVMOrcCSymbolMapPair) * LookupSetSize); +#else + LLVMOrcCSymbolMapPairs symbols = palloc0(sizeof(LLVMJITCSymbolMapPair) * LookupSetSize); +#endif + LLVMErrorRef error; + LLVMOrcMaterializationUnitRef mu; + + for (int i = 0; i < LookupSetSize; i++) + { + const char *name = LLVMOrcSymbolStringPoolEntryStr(LookupSet[i].Name); + +#if LLVM_VERSION_MAJOR > 12 + LLVMOrcRetainSymbolStringPoolEntry(LookupSet[i].Name); +#endif + symbols[i].Name = LookupSet[i].Name; + symbols[i].Sym.Address = llvm_resolve_symbol(name, NULL); + symbols[i].Sym.Flags.GenericFlags = LLVMJITSymbolGenericFlagsExported; + } + + mu = LLVMOrcAbsoluteSymbols(symbols, LookupSetSize); + error = LLVMOrcJITDylibDefine(JD, mu); + if (error != LLVMErrorSuccess) + LLVMOrcDisposeMaterializationUnit(mu); + + pfree(symbols); + + return error; +} + +/* + * We cannot throw errors through LLVM (without causing a FATAL at least), so + * just use WARNING here. That's OK anyway, as the error is also reported at + * the top level action (with less detail) and there might be multiple + * invocations of errors with details. + * + * This doesn't really happen during normal operation, but in cases like + * symbol resolution breakage. So just using elog(WARNING) is fine. + */ +static void +llvm_log_jit_error(void *ctx, LLVMErrorRef error) +{ + elog(WARNING, "error during JITing: %s", + llvm_error_message(error)); +} + +/* + * Create our own object layer, so we can add event listeners. 
+ */ +static LLVMOrcObjectLayerRef +llvm_create_object_layer(void *Ctx, LLVMOrcExecutionSessionRef ES, const char *Triple) +{ + LLVMOrcObjectLayerRef objlayer = + LLVMOrcCreateRTDyldObjectLinkingLayerWithSectionMemoryManager(ES); + +#if defined(HAVE_DECL_LLVMCREATEGDBREGISTRATIONLISTENER) && HAVE_DECL_LLVMCREATEGDBREGISTRATIONLISTENER + if (jit_debugging_support) + { + LLVMJITEventListenerRef l = LLVMCreateGDBRegistrationListener(); + + LLVMOrcRTDyldObjectLinkingLayerRegisterJITEventListener(objlayer, l); + } +#endif + +#if defined(HAVE_DECL_LLVMCREATEPERFJITEVENTLISTENER) && HAVE_DECL_LLVMCREATEPERFJITEVENTLISTENER + if (jit_profiling_support) + { + LLVMJITEventListenerRef l = LLVMCreatePerfJITEventListener(); + + LLVMOrcRTDyldObjectLinkingLayerRegisterJITEventListener(objlayer, l); + } +#endif + + return objlayer; +} + +/* + * Create LLJIT instance, using the passed in target machine. Note that the + * target machine afterwards is owned by the LLJIT instance. + */ +static LLVMOrcLLJITRef +llvm_create_jit_instance(LLVMTargetMachineRef tm) +{ + LLVMOrcLLJITRef lljit; + LLVMOrcJITTargetMachineBuilderRef tm_builder; + LLVMOrcLLJITBuilderRef lljit_builder; + LLVMErrorRef error; + LLVMOrcDefinitionGeneratorRef main_gen; + LLVMOrcDefinitionGeneratorRef ref_gen; + + lljit_builder = LLVMOrcCreateLLJITBuilder(); + tm_builder = LLVMOrcJITTargetMachineBuilderCreateFromTargetMachine(tm); + LLVMOrcLLJITBuilderSetJITTargetMachineBuilder(lljit_builder, tm_builder); + + LLVMOrcLLJITBuilderSetObjectLinkingLayerCreator(lljit_builder, + llvm_create_object_layer, + NULL); + + error = LLVMOrcCreateLLJIT(&lljit, lljit_builder); + if (error) + elog(ERROR, "failed to create lljit instance: %s", + llvm_error_message(error)); + + LLVMOrcExecutionSessionSetErrorReporter(LLVMOrcLLJITGetExecutionSession(lljit), + llvm_log_jit_error, NULL); + + /* + * Symbol resolution support for symbols in the postgres binary / + * libraries already loaded. + */ + error = LLVMOrcCreateDynamicLibrarySearchGeneratorForProcess(&main_gen, + LLVMOrcLLJITGetGlobalPrefix(lljit), + 0, NULL); + if (error) + elog(ERROR, "failed to create generator: %s", + llvm_error_message(error)); + LLVMOrcJITDylibAddGenerator(LLVMOrcLLJITGetMainJITDylib(lljit), main_gen); + + /* + * Symbol resolution support for "special" functions, e.g. a call into an + * SQL callable function. 
+ */ +#if LLVM_VERSION_MAJOR > 14 + ref_gen = LLVMOrcCreateCustomCAPIDefinitionGenerator(llvm_resolve_symbols, NULL, NULL); +#else + ref_gen = LLVMOrcCreateCustomCAPIDefinitionGenerator(llvm_resolve_symbols, NULL); +#endif + LLVMOrcJITDylibAddGenerator(LLVMOrcLLJITGetMainJITDylib(lljit), ref_gen); + + return lljit; +} + +static char * +llvm_error_message(LLVMErrorRef error) +{ + char *orig = LLVMGetErrorMessage(error); + char *msg = pstrdup(orig); + + LLVMDisposeErrorMessage(orig); + + return msg; +} + +#endif /* LLVM_VERSION_MAJOR > 11 */ diff --git a/src/backend/jit/llvm/llvmjit_error.cpp b/src/backend/jit/llvm/llvmjit_error.cpp index baabfc4eb4c..b9bdb8576d6 100644 --- a/src/backend/jit/llvm/llvmjit_error.cpp +++ b/src/backend/jit/llvm/llvmjit_error.cpp @@ -23,15 +23,22 @@ extern "C" #include "jit/llvmjit.h" +#include static int fatal_new_handler_depth = 0; static std::new_handler old_new_handler = NULL; static void fatal_system_new_handler(void); #if LLVM_VERSION_MAJOR > 4 +static void fatal_llvm_new_handler(void *user_data, const char *reason, bool gen_crash_diag); +#if LLVM_VERSION_MAJOR < 14 static void fatal_llvm_new_handler(void *user_data, const std::string& reason, bool gen_crash_diag); #endif +#endif +static void fatal_llvm_error_handler(void *user_data, const char *reason, bool gen_crash_diag); +#if LLVM_VERSION_MAJOR < 14 static void fatal_llvm_error_handler(void *user_data, const std::string& reason, bool gen_crash_diag); +#endif /* @@ -83,6 +90,16 @@ llvm_leave_fatal_on_oom(void) } } +/* + * Are we currently in an fatal-on-oom section? Useful to skip cleanup in case + * of errors. + */ +bool +llvm_in_fatal_on_oom(void) +{ + return fatal_new_handler_depth > 0; +} + /* * Reset fatal error handling. This should only be called in error recovery * loops like PostgresMain()'s. 
@@ -119,23 +136,41 @@ fatal_system_new_handler(void) #if LLVM_VERSION_MAJOR > 4 static void fatal_llvm_new_handler(void *user_data, - const std::string& reason, + const char *reason, bool gen_crash_diag) { ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"), - errdetail("While in LLVM: %s", reason.c_str()))); + errdetail("While in LLVM: %s", reason))); +} +#if LLVM_VERSION_MAJOR < 14 +static void +fatal_llvm_new_handler(void *user_data, + const std::string& reason, + bool gen_crash_diag) +{ + fatal_llvm_new_handler(user_data, reason.c_str(), gen_crash_diag); } #endif +#endif static void fatal_llvm_error_handler(void *user_data, - const std::string& reason, + const char *reason, bool gen_crash_diag) { ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("fatal llvm error: %s", - reason.c_str()))); + errmsg("fatal llvm error: %s", reason))); +} + +#if LLVM_VERSION_MAJOR < 14 +static void +fatal_llvm_error_handler(void *user_data, + const std::string& reason, + bool gen_crash_diag) +{ + fatal_llvm_error_handler(user_data, reason.c_str(), gen_crash_diag); } +#endif diff --git a/src/backend/jit/llvm/llvmjit_expr.c b/src/backend/jit/llvm/llvmjit_expr.c index 0da318218fd..12138e49577 100644 --- a/src/backend/jit/llvm/llvmjit_expr.c +++ b/src/backend/jit/llvm/llvmjit_expr.c @@ -152,7 +152,7 @@ llvm_compile_expr(ExprState *state) param_types[0] = l_ptr(StructExprState); /* state */ param_types[1] = l_ptr(StructExprContext); /* econtext */ - param_types[2] = l_ptr(TypeParamBool); /* isnull */ + param_types[2] = l_ptr(TypeStorageBool); /* isnull */ eval_sig = LLVMFunctionType(TypeSizeT, param_types, lengthof(param_types), @@ -259,8 +259,6 @@ llvm_compile_expr(ExprState *state) v_tmpvalue = LLVMBuildLoad(b, v_tmpvaluep, ""); v_tmpisnull = LLVMBuildLoad(b, v_tmpisnullp, ""); - v_tmpisnull = - LLVMBuildTrunc(b, v_tmpisnull, TypeParamBool, ""); LLVMBuildStore(b, v_tmpisnull, v_isnullp); diff --git a/src/backend/jit/llvm/llvmjit_inline.cpp b/src/backend/jit/llvm/llvmjit_inline.cpp index c489a632ff9..c7deff0c5b3 100644 --- a/src/backend/jit/llvm/llvmjit_inline.cpp +++ b/src/backend/jit/llvm/llvmjit_inline.cpp @@ -62,6 +62,7 @@ extern "C" #include #include #include +#include /* @@ -594,7 +595,11 @@ function_inlinable(llvm::Function &F, if (F.materialize()) elog(FATAL, "failed to materialize metadata"); - if (F.getAttributes().hasFnAttribute(llvm::Attribute::NoInline)) +#if LLVM_VERSION_MAJOR < 14 +#define hasFnAttr hasFnAttribute +#endif + + if (F.getAttributes().hasFnAttr(llvm::Attribute::NoInline)) { ilog(DEBUG1, "ineligibile to import %s due to noinline", F.getName().data()); @@ -608,6 +613,17 @@ function_inlinable(llvm::Function &F, if (rv->materialize()) elog(FATAL, "failed to materialize metadata"); + /* + * Don't inline functions that access thread local variables. That + * doesn't work on current LLVM releases (but might in future). + */ + if (rv->isThreadLocal()) + { + ilog(DEBUG1, "cannot inline %s due to thread-local variable %s", + F.getName().data(), rv->getName().data()); + return false; + } + /* * Never want to inline externally visible vars, cheap enough to * reference. 
@@ -860,7 +876,9 @@ create_redirection_function(std::unique_ptr<llvm::Module> &importMod,
 	llvm::Function *AF;
 	llvm::BasicBlock *BB;
 	llvm::CallInst *fwdcall;
+#if LLVM_VERSION_MAJOR < 14
 	llvm::Attribute inlineAttribute;
+#endif
 
 	AF = llvm::Function::Create(F->getFunctionType(),
 								LinkageTypes::AvailableExternallyLinkage,
@@ -869,9 +887,13 @@ create_redirection_function(std::unique_ptr<llvm::Module> &importMod,
 	Builder.SetInsertPoint(BB);
 
 	fwdcall = Builder.CreateCall(F, &*AF->arg_begin());
+#if LLVM_VERSION_MAJOR < 14
 	inlineAttribute = llvm::Attribute::get(Context,
 										   llvm::Attribute::AlwaysInline);
 	fwdcall->addAttribute(~0U, inlineAttribute);
+#else
+	fwdcall->addFnAttr(llvm::Attribute::AlwaysInline);
+#endif
 	Builder.CreateRet(fwdcall);
 
 	return AF;
diff --git a/src/backend/jit/llvm/llvmjit_wrap.cpp b/src/backend/jit/llvm/llvmjit_wrap.cpp
index 7490805fdca..9fc813dc949 100644
--- a/src/backend/jit/llvm/llvmjit_wrap.cpp
+++ b/src/backend/jit/llvm/llvmjit_wrap.cpp
@@ -16,6 +16,13 @@ extern "C"
 {
 #include "postgres.h"
 }
 
+#include <llvm-c/Core.h>
+
+/* Avoid macro clash with LLVM's C++ headers */
+#undef Min
+
+#include <llvm/IR/Attributes.h>
+#include <llvm/IR/Function.h>
 #include <llvm/MC/SubtargetFeature.h>
 #include <llvm/Support/Host.h>
@@ -44,3 +51,28 @@ char *LLVMGetHostCPUFeatures(void) {
 	return strdup(Features.getString().c_str());
 }
 #endif
+
+/*
+ * Like LLVM's LLVMGetAttributeCountAtIndex(), works around a bug in LLVM 3.9.
+ *
+ * In LLVM <= 3.9, LLVMGetAttributeCountAtIndex() segfaults if there are no
+ * attributes at an index (fixed in LLVM commit ce9bb1097dc2).
+ */
+unsigned
+LLVMGetAttributeCountAtIndexPG(LLVMValueRef F, uint32 Idx)
+{
+	/*
+	 * This is more expensive, so only do when using a problematic LLVM
+	 * version.
+	 */
+#if LLVM_VERSION_MAJOR < 4
+	if (!llvm::unwrap(F)->getAttributes().hasAttributes(Idx))
+		return 0;
+#endif
+
+	/*
+	 * There is no nice public API to determine the count nicely, so just
+	 * always fall back to LLVM's C API.
+	 */
+	return LLVMGetAttributeCountAtIndex(F, Idx);
+}
diff --git a/src/backend/optimizer/util/tlist.c b/src/backend/optimizer/util/tlist.c
index 4cce6b58122..1d609afbd9f 100644
--- a/src/backend/optimizer/util/tlist.c
+++ b/src/backend/optimizer/util/tlist.c
@@ -1013,7 +1013,7 @@ apply_pathtarget_labeling_to_tlist(List *tlist, PathTarget *target)
  *
  * The outputs of this function are two parallel lists, one a list of
  * PathTargets and the other an integer list of bool flags indicating
- * whether the corresponding PathTarget contains any evaluatable SRFs.
+ * whether the corresponding PathTarget contains any evaluable SRFs.
  * The lists are given in the order they'd need to be evaluated in, with
  * the "lowest" PathTarget first. So the last list entry is always the
  * originally given PathTarget, and any entries before it indicate evaluation
diff --git a/src/bin/pg_upgrade/info.c b/src/bin/pg_upgrade/info.c index fd0b44c3ce9..968db0d777d 100644 --- a/src/bin/pg_upgrade/info.c +++ b/src/bin/pg_upgrade/info.c @@ -433,11 +433,10 @@ get_rel_infos(ClusterInfo *cluster, DbInfo *dbinfo) query[0] = '\0'; /* initialize query string to empty */ /* - * Create a CTE that collects OIDs of regular user tables, including - * matviews and sequences, but excluding toast tables and indexes. We - * assume that relations with OIDs >= FirstNormalObjectId belong to the - * user. (That's probably redundant with the namespace-name exclusions, - * but let's be safe.) + * Create a CTE that collects OIDs of regular user tables and matviews, + * but excluding toast tables and indexes. We assume that relations with + * OIDs >= FirstNormalObjectId belong to the user.
(That's probably + * redundant with the namespace-name exclusions, but let's be safe.) * * pg_largeobject contains user data that does not appear in pg_dump * output, so we have to copy that system table. It's easiest to do that diff --git a/src/include/jit/llvmjit.h b/src/include/jit/llvmjit.h index 94e6612823b..c111ed802a4 100644 --- a/src/include/jit/llvmjit.h +++ b/src/include/jit/llvmjit.h @@ -90,6 +90,7 @@ extern LLVMValueRef FuncExecAggInitGroup; extern void llvm_enter_fatal_on_oom(void); extern void llvm_leave_fatal_on_oom(void); +extern bool llvm_in_fatal_on_oom(void); extern void llvm_reset_after_error(void); extern void llvm_assert_in_fatal_section(void); @@ -133,6 +134,8 @@ extern char *LLVMGetHostCPUName(void); extern char *LLVMGetHostCPUFeatures(void); #endif +extern unsigned LLVMGetAttributeCountAtIndexPG(LLVMValueRef F, uint32 Idx); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/include/jit/llvmjit_emit.h b/src/include/jit/llvmjit_emit.h index fdc51b1e907..949be7ee078 100644 --- a/src/include/jit/llvmjit_emit.h +++ b/src/include/jit/llvmjit_emit.h @@ -1,6 +1,6 @@ /* * llvmjit_emit.h - * Helpers to make emitting LLVM IR a it more concise and pgindent proof. + * Helpers to make emitting LLVM IR a bit more concise and pgindent proof. * * Copyright (c) 2018, PostgreSQL Global Development Group * diff --git a/src/test/regress/expected/expressions.out b/src/test/regress/expected/expressions.out index 719455b0ebb..42d129c58ae 100644 --- a/src/test/regress/expected/expressions.out +++ b/src/test/regress/expected/expressions.out @@ -45,7 +45,7 @@ SELECT now()::timestamp::text = localtimestamp::text; t (1 row) --- current_role/user/user is tested in rolnames.sql +-- current_role/user/user is tested in rolenames.sql -- current database / catalog SELECT current_catalog = current_database(); ?column? diff --git a/src/test/regress/sql/expressions.sql b/src/test/regress/sql/expressions.sql index 3427fdfdd72..1d7601fe031 100644 --- a/src/test/regress/sql/expressions.sql +++ b/src/test/regress/sql/expressions.sql @@ -22,7 +22,7 @@ SELECT length(current_timestamp::text) >= length(current_timestamp(0)::text); -- localtimestamp SELECT now()::timestamp::text = localtimestamp::text; --- current_role/user/user is tested in rolnames.sql +-- current_role/user/user is tested in rolenames.sql -- current database / catalog SELECT current_catalog = current_database(); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 224908e11e7..635f9319ecd 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1133,6 +1133,7 @@ LLVMJitHandle LLVMMemoryBufferRef LLVMModuleRef LLVMOrcJITStackRef +LLVMOrcLookupStateRef LLVMOrcModuleHandle LLVMOrcTargetAddress LLVMPassManagerBuilderRef