From a605b3160f27b2afac75ae9aa887f770a5ce7618 Mon Sep 17 00:00:00 2001 From: "xueyan.li" Date: Wed, 14 Oct 2020 09:34:58 +0800 Subject: [PATCH] [Docs] update data types doc and fix some typo (#4712) * update data types doc and fix some typo * update data types doc and fix some typo Co-authored-by: lixueyan07 --- docs/.vuepress/sidebar/en.js | 3 +- docs/.vuepress/sidebar/zh-CN.js | 1 + .../sql-statements/Data Types/BITMAP.md | 48 ++++++++++++++++++ .../Data Types/HLL(HyperLogLog).md | 35 ------------- .../sql-statements/Data Types/HLL.md | 49 +++++++++++++++++++ .../sql-statements/Data Types/VARCHAR.md | 4 +- .../sql-statements/Data Types/BITMAP.md | 48 ++++++++++++++++++ .../sql-statements/Data Types/HLL.md | 20 ++++++-- .../sql-statements/Data Types/VARCHAR.md | 4 +- 9 files changed, 170 insertions(+), 42 deletions(-) create mode 100644 docs/en/sql-reference/sql-statements/Data Types/BITMAP.md delete mode 100644 docs/en/sql-reference/sql-statements/Data Types/HLL(HyperLogLog).md create mode 100644 docs/en/sql-reference/sql-statements/Data Types/HLL.md create mode 100644 docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md diff --git a/docs/.vuepress/sidebar/en.js b/docs/.vuepress/sidebar/en.js index 8e271115d2b22c..d6541cddff4d17 100644 --- a/docs/.vuepress/sidebar/en.js +++ b/docs/.vuepress/sidebar/en.js @@ -487,6 +487,7 @@ module.exports = [ directoryPath: "Data Types/", children: [ "BIGINT", + "BITMAP", "BOOLEAN", "CHAR", "DATE", @@ -494,7 +495,7 @@ module.exports = [ "DECIMAL", "DOUBLE", "FLOAT", - "HLL(HyperLogLog)", + "HLL", "INT", "SMALLINT", "TINYINT", diff --git a/docs/.vuepress/sidebar/zh-CN.js b/docs/.vuepress/sidebar/zh-CN.js index c513e8d00f3a99..b1486916c8ed11 100644 --- a/docs/.vuepress/sidebar/zh-CN.js +++ b/docs/.vuepress/sidebar/zh-CN.js @@ -492,6 +492,7 @@ module.exports = [ directoryPath: "Data Types/", children: [ "BIGINT", + "BITMAP", "BOOLEAN", "CHAR", "DATE", diff --git a/docs/en/sql-reference/sql-statements/Data Types/BITMAP.md 
b/docs/en/sql-reference/sql-statements/Data Types/BITMAP.md new file mode 100644 index 00000000000000..29d8a75331e74b --- /dev/null +++ b/docs/en/sql-reference/sql-statements/Data Types/BITMAP.md @@ -0,0 +1,48 @@ +--- +{ + "title": "BITMAP", + "language": "en" +} +--- + + + +# BITMAP +## Description +BITMAP + +BITMAP cannot be used as a key column, and the aggregation type is BITMAP_UNION when building the table. +The user does not need to specify the length and default value. The length is controlled within the system according to the degree of data aggregation. +And the BITMAP column can only be queried or used by supporting functions such as bitmap_union_count, bitmap_union, and bitmap_hash. + +The use of BITMAP in offline scenarios will affect the import speed. In the case of a large amount of data, the query speed will be slower than HLL and better than Count Distinct. +Note: If BITMAP does not use a global dictionary in real-time scenarios, using bitmap_hash() may cause an error of about one-thousandth. + +## example + + select hour, BITMAP_UNION_COUNT(pv) over(order by hour) uv from( + select hour, BITMAP_UNION(device_id) as pv + from metric_table -- Query the accumulated UV per hour + where datekey=20200922 + group by hour order by 1 + ) final; + +## keyword +BITMAP diff --git a/docs/en/sql-reference/sql-statements/Data Types/HLL(HyperLogLog).md b/docs/en/sql-reference/sql-statements/Data Types/HLL(HyperLogLog).md deleted file mode 100644 index 7a511e288ebc95..00000000000000 --- a/docs/en/sql-reference/sql-statements/Data Types/HLL(HyperLogLog).md +++ /dev/null @@ -1,35 +0,0 @@ ---- -{ - "title": "HLL (Hyloglog)", - "language": "en" -} ---- - - - -#HLL (Hyloglog) -## Description -MARKETING (M) -A variable length string, M represents the length of a variable length string. The range of M is 1-16385. -Users do not need to specify length and default values. 
Length is controlled within the system according to the aggregation degree of data -And HLL columns can only be queried or used by matching hll_union_agg, hll_raw_agg, hll_cardinality, hll_hash. - -## keyword -High loglog, hll, hyloglog diff --git a/docs/en/sql-reference/sql-statements/Data Types/HLL.md b/docs/en/sql-reference/sql-statements/Data Types/HLL.md new file mode 100644 index 00000000000000..999a897a88f88a --- /dev/null +++ b/docs/en/sql-reference/sql-statements/Data Types/HLL.md @@ -0,0 +1,49 @@ +--- +{ + "title": "HLL (HyperLogLog)", + "language": "en" +} +--- + + + +# HLL (HyperLogLog) +## Description +HLL + +HLL cannot be used as a key column, and the aggregation type is HLL_UNION when creating the table. +The user does not need to specify the length and default value. +The length is controlled within the system according to the degree of data aggregation. +And HLL columns can only be queried or used through the matching hll_union_agg, hll_raw_agg, hll_cardinality, and hll_hash. + +HLL provides an approximate count of distinct elements, and its performance is better than Count Distinct when the amount of data is large. +The error of HLL is usually around 1%, sometimes up to 2%. + +## example + + select hour, HLL_UNION_AGG(pv) over(order by hour) uv from( + select hour, HLL_RAW_AGG(device_id) as pv + from metric_table -- Query the accumulated UV per hour + where datekey=20200922 + group by hour order by 1 + ) final; + +## keyword +HLL,HYPERLOGLOG diff --git a/docs/en/sql-reference/sql-statements/Data Types/VARCHAR.md b/docs/en/sql-reference/sql-statements/Data Types/VARCHAR.md index 268a56222df954..ae7fa8599d3e93 100644 --- a/docs/en/sql-reference/sql-statements/Data Types/VARCHAR.md +++ b/docs/en/sql-reference/sql-statements/Data Types/VARCHAR.md @@ -27,7 +27,9 @@ under the License. # VARCHAR ## Description MARKETING (M) -A variable length string, M represents the length of a variable length string. The range of M is 1-65535. 
+A variable length string, M represents the length of a variable length string. The range of M is 1-65533. + +Note: Variable length strings are stored in UTF-8 encoding, so English characters usually occupy 1 byte each, and Chinese characters usually occupy 3 bytes each. ## keyword VARCHAR diff --git a/docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md b/docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md new file mode 100644 index 00000000000000..c92e20b99407b5 --- /dev/null +++ b/docs/zh-CN/sql-reference/sql-statements/Data Types/BITMAP.md @@ -0,0 +1,48 @@ +--- +{ + "title": "BITMAP", + "language": "zh-CN" +} +--- + + + +# BITMAP +## description + BITMAP + BITMAP不能作为key列使用,建表时配合聚合类型为BITMAP_UNION。 + 用户不需要指定长度和默认值。长度根据数据的聚合程度系统内控制。 + 并且BITMAP列只能通过配套的bitmap_union_count、bitmap_union、bitmap_hash等函数进行查询或使用。 + + 离线场景下使用BITMAP会影响导入速度,在数据量大的情况下查询速度会慢于HLL,并优于Count Distinct。 + 注意:实时场景下BITMAP如果不使用全局字典,使用了bitmap_hash()可能会导致有千分之一左右的误差。 + +## example + + select hour, BITMAP_UNION_COUNT(pv) over(order by hour) uv from( + select hour, BITMAP_UNION(device_id) as pv + from metric_table -- 查询每小时的累计UV + where datekey=20200622 + group by hour order by 1 + ) final; + +## keyword + + BITMAP diff --git a/docs/zh-CN/sql-reference/sql-statements/Data Types/HLL.md b/docs/zh-CN/sql-reference/sql-statements/Data Types/HLL.md index 7357f4e1e3130a..b261495d7ec384 100644 --- a/docs/zh-CN/sql-reference/sql-statements/Data Types/HLL.md +++ b/docs/zh-CN/sql-reference/sql-statements/Data Types/HLL.md @@ -26,10 +26,22 @@ under the License. 
# HLL(HyperLogLog) ## description - VARCHAR(M) - 变长字符串,M代表的是变长字符串的长度。M的范围是1-16385 - 用户不需要指定长度和默认值。长度根据数据的聚合程度系统内控制 - 并且HLL列只能通过配套的hll_union_agg、hll_raw_agg、hll_cardinality、hll_hash进行查询或使用 + HLL + HLL不能作为key列使用,建表时配合聚合类型为HLL_UNION。 + 用户不需要指定长度和默认值。长度根据数据的聚合程度系统内控制。 + 并且HLL列只能通过配套的hll_union_agg、hll_raw_agg、hll_cardinality、hll_hash进行查询或使用。 + + HLL是模糊去重,在数据量大的情况性能优于Count Distinct。 + HLL的误差通常在1%左右,有时会达到2%。 + +## example + + select hour, HLL_UNION_AGG(pv) over(order by hour) uv from( + select hour, HLL_RAW_AGG(device_id) as pv + from metric_table -- 查询每小时的累计UV + where datekey=20200622 + group by hour order by 1 + ) final; ## keyword diff --git a/docs/zh-CN/sql-reference/sql-statements/Data Types/VARCHAR.md b/docs/zh-CN/sql-reference/sql-statements/Data Types/VARCHAR.md index 59416788a3ec14..178b56fb56f308 100644 --- a/docs/zh-CN/sql-reference/sql-statements/Data Types/VARCHAR.md +++ b/docs/zh-CN/sql-reference/sql-statements/Data Types/VARCHAR.md @@ -27,7 +27,9 @@ under the License. # VARCHAR ## description VARCHAR(M) - 变长字符串,M代表的是变长字符串的长度。M的范围是1-65535 + 变长字符串,M代表的是变长字符串的长度。M的范围是1-65533。 + + 注意:变长字符串是以UTF-8编码存储的,因此通常英文字符占1个字节,中文字符占3个字节。 ## keyword