From de3c991a58bdca387d9ee4fcc82b3d75f1e97757 Mon Sep 17 00:00:00 2001 From: Seaven Date: Mon, 11 Nov 2024 19:58:04 +0800 Subject: [PATCH] update sample statistic Signed-off-by: Seaven --- .../statistic/HyperStatisticsCollectJob.java | 19 +- .../StatisticsCollectJobFactory.java | 27 ++- .../statistic/base/ColumnClassifier.java | 17 +- .../starrocks/statistic/base/ColumnStats.java | 15 +- .../base/ComplexTypeColumnStats.java | 17 +- .../base/DistributionColumnStats.java | 29 ---- .../statistic/base/PartitionSampler.java | 164 ++++++++++++++++++ .../base/PrimitiveTypeColumnStats.java | 22 ++- .../statistic/base/SubFieldColumnStats.java | 23 ++- .../statistic/base/TabletSampler.java | 71 ++++++++ .../statistic/hyper/FullQueryJob.java | 40 ++--- .../statistic/hyper/HyperQueryJob.java | 53 +++++- .../statistic/hyper/MetaQueryJob.java | 79 ++++----- .../statistic/hyper/SampleQueryJob.java | 58 +++++++ .../statistic/hyper/StatisticSQLs.java | 124 +++++++++++-- .../statistic/sample/SampleInfo.java | 37 ++-- .../statistic/sample/TabletStats.java | 6 - 17 files changed, 614 insertions(+), 187 deletions(-) delete mode 100644 fe/fe-core/src/main/java/com/starrocks/statistic/base/DistributionColumnStats.java create mode 100644 fe/fe-core/src/main/java/com/starrocks/statistic/base/PartitionSampler.java create mode 100644 fe/fe-core/src/main/java/com/starrocks/statistic/base/TabletSampler.java create mode 100644 fe/fe-core/src/main/java/com/starrocks/statistic/hyper/SampleQueryJob.java diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/HyperStatisticsCollectJob.java b/fe/fe-core/src/main/java/com/starrocks/statistic/HyperStatisticsCollectJob.java index b803fafef8f281..4783cad2027fb8 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/HyperStatisticsCollectJob.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/HyperStatisticsCollectJob.java @@ -34,6 +34,7 @@ import com.starrocks.sql.ast.QueryStatement; import com.starrocks.sql.ast.StatementBase; 
import com.starrocks.sql.ast.ValuesRelation; +import com.starrocks.statistic.base.PartitionSampler; import com.starrocks.statistic.hyper.HyperQueryJob; import org.apache.commons.lang.StringUtils; import org.apache.logging.log4j.LogManager; @@ -47,7 +48,7 @@ public class HyperStatisticsCollectJob extends StatisticsCollectJob { private final List partitionIdList; - private final int batchLimit; + private final int batchRowsLimit; private final List sqlBuffer = Lists.newArrayList(); private final List> rowsBuffer = Lists.newArrayList(); @@ -57,7 +58,7 @@ public HyperStatisticsCollectJob(Database db, Table table, List partitionI super(db, table, columns, type, scheduleType, properties); this.partitionIdList = partitionIdList; // hll serialize to hex, about 32kb - this.batchLimit = (int) Math.max(1, Config.statistic_full_collect_buffer / 33 / 1024); + this.batchRowsLimit = (int) Math.max(1, Config.statistic_full_collect_buffer / 33 / 1024); } public HyperStatisticsCollectJob(Database db, Table table, List partitionIdList, List columnNames, @@ -65,7 +66,7 @@ public HyperStatisticsCollectJob(Database db, Table table, List partitionI StatsConstants.ScheduleType scheduleType, Map properties) { super(db, table, columnNames, columnTypes, type, scheduleType, properties); this.partitionIdList = partitionIdList; - this.batchLimit = (int) Math.max(1, Config.statistic_full_collect_buffer / 33 / 1024); + this.batchRowsLimit = (int) Math.max(1, Config.statistic_full_collect_buffer / 33 / 1024); } @Override @@ -76,8 +77,16 @@ public void collect(ConnectContext context, AnalyzeStatus analyzeStatus) throws context.getSessionVariable().setEnableAnalyzePhasePruneColumns(true); context.getSessionVariable().setPipelineDop(context.getSessionVariable().getStatisticCollectParallelism()); - List queryJobs = HyperQueryJob.createFullQueryJobs(context, db, table, columnNames, - columnTypes, partitionIdList, batchLimit); + int splitSize = Math.max(1, batchRowsLimit / columnNames.size()); + List 
queryJobs; + if (type == StatsConstants.AnalyzeType.FULL) { + queryJobs = HyperQueryJob.createFullQueryJobs(context, db, table, columnNames, columnTypes, + partitionIdList, splitSize); + } else { + PartitionSampler sampler = PartitionSampler.create(table, partitionIdList, properties); + queryJobs = HyperQueryJob.createSampleQueryJobs(context, db, table, columnNames, columnTypes, + partitionIdList, splitSize, sampler); + } long queryTotals = 0; long queryFailures = 0; diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticsCollectJobFactory.java b/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticsCollectJobFactory.java index 1227b97bc8153f..97a16ab77f42d6 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticsCollectJobFactory.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticsCollectJobFactory.java @@ -83,7 +83,7 @@ public static List buildStatisticsCollectJob(NativeAnalyze return Collections.emptyList(); } createJob(statsJobs, nativeAnalyzeJob, db, GlobalStateMgr.getCurrentState().getLocalMetastore() - .getTable(db.getId(), nativeAnalyzeJob.getTableId()), + .getTable(db.getId(), nativeAnalyzeJob.getTableId()), nativeAnalyzeJob.getColumns(), nativeAnalyzeJob.getColumnTypes()); } @@ -108,8 +108,13 @@ public static StatisticsCollectJob buildStatisticsCollectJob(Database db, Table LOG.debug("statistics job work on table: {}, type: {}", table.getName(), analyzeType.name()); if (analyzeType.equals(StatsConstants.AnalyzeType.SAMPLE)) { - return new SampleStatisticsCollectJob(db, table, columnNames, columnTypes, - StatsConstants.AnalyzeType.SAMPLE, scheduleType, properties); + if (Config.statistic_use_meta_statistics) { + return new HyperStatisticsCollectJob(db, table, partitionIdList, columnNames, columnTypes, + StatsConstants.AnalyzeType.SAMPLE, scheduleType, properties); + } else { + return new SampleStatisticsCollectJob(db, table, columnNames, columnTypes, + StatsConstants.AnalyzeType.SAMPLE, scheduleType, 
properties); + } } else if (analyzeType.equals(StatsConstants.AnalyzeType.HISTOGRAM)) { return new HistogramStatisticsCollectJob(db, table, columnNames, columnTypes, scheduleType, properties); } else { @@ -197,7 +202,8 @@ public static StatisticsCollectJob buildExternalStatisticsCollectJob(String cata StatsConstants.AnalyzeType analyzeType, StatsConstants.ScheduleType scheduleType, Map properties) { - List columnTypes = columnNames.stream().map(col -> table.getColumn(col).getType()).collect(Collectors.toList()); + List columnTypes = + columnNames.stream().map(col -> table.getColumn(col).getType()).collect(Collectors.toList()); return buildExternalStatisticsCollectJob(catalogName, db, table, partitionNames, columnNames, columnTypes, analyzeType, scheduleType, properties); } @@ -251,8 +257,9 @@ private static void createExternalAnalyzeJob(List allTable } } - ExternalBasicStatsMeta basicStatsMeta = GlobalStateMgr.getCurrentState().getAnalyzeMgr().getExternalBasicStatsMetaMap() - .get(new AnalyzeMgr.StatsMetaKey(job.getCatalogName(), db.getFullName(), table.getName())); + ExternalBasicStatsMeta basicStatsMeta = + GlobalStateMgr.getCurrentState().getAnalyzeMgr().getExternalBasicStatsMetaMap() + .get(new AnalyzeMgr.StatsMetaKey(job.getCatalogName(), db.getFullName(), table.getName())); if (basicStatsMeta != null) { // check table last update time, if last collect time is after last update time, skip this table LocalDateTime statisticsUpdateTime = basicStatsMeta.getUpdateTime(); @@ -269,10 +276,12 @@ private static void createExternalAnalyzeJob(List allTable // check table row count if (columnNames == null || columnNames.isEmpty()) { columnNames = StatisticUtils.getCollectibleColumns(table); - columnTypes = columnNames.stream().map(col -> table.getColumn(col).getType()).collect(Collectors.toList()); + columnTypes = + columnNames.stream().map(col -> table.getColumn(col).getType()).collect(Collectors.toList()); } List columnStatisticList = - 
GlobalStateMgr.getCurrentState().getStatisticStorage().getConnectorTableStatisticsSync(table, columnNames); + GlobalStateMgr.getCurrentState().getStatisticStorage() + .getConnectorTableStatisticsSync(table, columnNames); List validColumnStatistics = columnStatisticList.stream(). filter(columnStatistic -> !columnStatistic.isUnknown()).collect(Collectors.toList()); @@ -399,7 +408,7 @@ private static void createJob(List allTableJobMap, NativeA job.getAnalyzeType() == StatsConstants.AnalyzeType.HISTOGRAM ? Config.statistic_auto_collect_histogram_interval : (sumDataSize > Config.statistic_auto_collect_small_table_size ? - Config.statistic_auto_collect_large_table_interval : + Config.statistic_auto_collect_large_table_interval : Config.statistic_auto_collect_small_table_interval); long timeInterval = job.getProperties().get(StatsConstants.STATISTIC_AUTO_COLLECT_INTERVAL) != null ? diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/base/ColumnClassifier.java b/fe/fe-core/src/main/java/com/starrocks/statistic/base/ColumnClassifier.java index 0999ed5db8e96c..fcd1100193664a 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/base/ColumnClassifier.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/base/ColumnClassifier.java @@ -19,7 +19,6 @@ import com.starrocks.catalog.StructType; import com.starrocks.catalog.Table; import com.starrocks.catalog.Type; -import com.starrocks.statistic.sample.SampleInfo; import java.util.List; @@ -29,28 +28,20 @@ public class ColumnClassifier { private final List unSupportStats = Lists.newArrayList(); - public static ColumnClassifier of(List columnNames, List columnTypes, Table table, - SampleInfo sampleInfo) { + public static ColumnClassifier of(List columnNames, List columnTypes, Table table) { ColumnClassifier columnClassifier = new ColumnClassifier(); - columnClassifier.classifyColumnStats(columnNames, columnTypes, table, sampleInfo); + columnClassifier.classifyColumnStats(columnNames, columnTypes, table); 
return columnClassifier; } - private void classifyColumnStats(List columnNames, List columnTypes, Table table, - SampleInfo sampleInfo) { - boolean onlyOneDistributionCol = table.getDistributionColumnNames().size() == 1; + private void classifyColumnStats(List columnNames, List columnTypes, Table table) { for (int i = 0; i < columnNames.size(); i++) { String columnName = columnNames.get(i); Type columnType = columnTypes.get(i); if (table.getColumn(columnName) != null) { if (columnType.canStatistic()) { - if (onlyOneDistributionCol && table.getDistributionColumnNames().contains(columnName)) { - columnStats.add(new DistributionColumnStats(columnName, columnType, sampleInfo)); - onlyOneDistributionCol = false; - } else { - columnStats.add(new PrimitiveTypeColumnStats(columnName, columnType)); - } + columnStats.add(new PrimitiveTypeColumnStats(columnName, columnType)); } else { unSupportStats.add(new ComplexTypeColumnStats(columnName, columnType)); } diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/base/ColumnStats.java b/fe/fe-core/src/main/java/com/starrocks/statistic/base/ColumnStats.java index 4d253bfc4f284e..9011f832168f09 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/base/ColumnStats.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/base/ColumnStats.java @@ -15,6 +15,7 @@ package com.starrocks.statistic.base; import com.starrocks.catalog.Type; +import com.starrocks.statistic.sample.SampleInfo; import org.apache.commons.lang.StringEscapeUtils; public abstract class ColumnStats { @@ -48,13 +49,9 @@ public String getQuotedColumnName() { return "`" + columnName + "`"; } - public abstract String getFullMax(); + public abstract String getMax(); - public abstract String getFullMin(); - - public String getFullRowCount() { - return "COUNT(*)"; - } + public abstract String getMin(); public abstract String getFullDateSize(); @@ -62,6 +59,10 @@ public String getFullNullCount() { return "COUNT(*) - COUNT(" + getQuotedColumnName() + ")"; 
} - public abstract String getFullNDV(); + public abstract String getNDV(); + + public abstract String getSampleDateSize(SampleInfo info); + + public abstract String getSampleNullCount(SampleInfo info); } diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/base/ComplexTypeColumnStats.java b/fe/fe-core/src/main/java/com/starrocks/statistic/base/ComplexTypeColumnStats.java index 5c13fae277d4a9..d4e8a56d49f6b1 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/base/ComplexTypeColumnStats.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/base/ComplexTypeColumnStats.java @@ -15,6 +15,7 @@ package com.starrocks.statistic.base; import com.starrocks.catalog.Type; +import com.starrocks.statistic.sample.SampleInfo; public class ComplexTypeColumnStats extends ColumnStats { @@ -28,12 +29,22 @@ public String getFullDateSize() { } @Override - public String getFullMax() { + public String getSampleDateSize(SampleInfo info) { + return columnType.getTypeSize() + " * " + info.getTotalRowCount(); + } + + @Override + public String getSampleNullCount(SampleInfo info) { + return "0"; + } + + @Override + public String getMax() { return "''"; } @Override - public String getFullMin() { + public String getMin() { return "''"; } @@ -43,7 +54,7 @@ public String getFullNullCount() { } @Override - public String getFullNDV() { + public String getNDV() { return "00"; } } diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/base/DistributionColumnStats.java b/fe/fe-core/src/main/java/com/starrocks/statistic/base/DistributionColumnStats.java deleted file mode 100644 index a82c72cb034b99..00000000000000 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/base/DistributionColumnStats.java +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2021-present StarRocks, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package com.starrocks.statistic.base; - -import com.starrocks.catalog.Type; -import com.starrocks.statistic.sample.SampleInfo; - -public class DistributionColumnStats extends PrimitiveTypeColumnStats { - - private final SampleInfo sampleInfo; - - public DistributionColumnStats(String columnName, Type columnType, SampleInfo sampleInfo) { - super(columnName, columnType); - this.sampleInfo = sampleInfo; - } - -} diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/base/PartitionSampler.java b/fe/fe-core/src/main/java/com/starrocks/statistic/base/PartitionSampler.java new file mode 100644 index 00000000000000..15fa14684b4c60 --- /dev/null +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/base/PartitionSampler.java @@ -0,0 +1,164 @@ +// Copyright 2021-present StarRocks, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package com.starrocks.statistic.base; + +import com.google.common.collect.Maps; +import com.starrocks.catalog.Partition; +import com.starrocks.catalog.Table; +import com.starrocks.catalog.Tablet; +import com.starrocks.common.Config; +import com.starrocks.statistic.StatsConstants; +import com.starrocks.statistic.sample.SampleInfo; +import com.starrocks.statistic.sample.TabletStats; + +import java.util.List; +import java.util.Map; + +public class PartitionSampler { + private static final double HIGH_WEIGHT_READ_RATIO = 0.001; + private static final double MEDIUM_HIGH_WEIGHT_READ_RATIO = 0.01; + private static final double MEDIUM_LOW_WEIGHT_READ_RATIO = 0.1; + private static final double LOW_WEIGHT_READ_RATIO = 0.8; + private static final long HIGH_WEIGHT_ROWS_THRESHOLD = 10000000L; + private static final long MEDIUM_HIGH_WEIGHT_ROWS_THRESHOLD = 1000000L; + private static final long MEDIUM_LOW_WEIGHT_ROWS_THRESHOLD = 100000L; + + private final double highRatio; + private final double mediumHighRatio; + private final double mediumLowRatio; + private final double lowRatio; + private final int maxSize; + + private final long sampleRowsLimit; + + private final Map partitionSampleMaps = Maps.newHashMap(); + + public PartitionSampler(double highSampleRatio, double mediumHighRatio, double mediumLowRatio, double lowRatio, + int maxSize, long sampleRowLimit) { + this.highRatio = highSampleRatio; + this.mediumHighRatio = mediumHighRatio; + this.mediumLowRatio = mediumLowRatio; + this.lowRatio = lowRatio; + this.maxSize = maxSize; + + this.sampleRowsLimit = sampleRowLimit; + } + + public double getHighRatio() { + return highRatio; + } + + public double getMediumHighRatio() { + return mediumHighRatio; + } + + public double getMediumLowRatio() { + return mediumLowRatio; + } + + public double getLowRatio() { + return lowRatio; + } + + public int getMaxSize() { + return maxSize; + } + + public long getSampleRowsLimit() { + return sampleRowsLimit; + } + + public SampleInfo 
getSampleInfo(long pid) { + return partitionSampleMaps.get(pid); + } + + public void classifyPartitions(Table table, List partitions) { + for (Long partitionId : partitions) { + Partition p = table.getPartition(partitionId); + if (p == null || !p.hasData()) { + continue; + } + + TabletSampler high = new TabletSampler(highRatio, HIGH_WEIGHT_READ_RATIO, maxSize, sampleRowsLimit); + TabletSampler mediumHigh = + new TabletSampler(mediumHighRatio, MEDIUM_HIGH_WEIGHT_READ_RATIO, maxSize, sampleRowsLimit); + TabletSampler mediumLow = + new TabletSampler(mediumLowRatio, MEDIUM_LOW_WEIGHT_READ_RATIO, maxSize, sampleRowsLimit); + TabletSampler low = new TabletSampler(lowRatio, LOW_WEIGHT_READ_RATIO, maxSize, sampleRowsLimit); + + for (Tablet tablet : p.getBaseIndex().getTablets()) { + long rowCount = tablet.getFuzzyRowCount(); + if (rowCount <= 0) { + continue; + } + if (rowCount >= HIGH_WEIGHT_ROWS_THRESHOLD) { + high.addTabletStats(new TabletStats(tablet.getId(), partitionId, rowCount)); + } else if (rowCount >= MEDIUM_HIGH_WEIGHT_ROWS_THRESHOLD) { + mediumHigh.addTabletStats(new TabletStats(tablet.getId(), partitionId, rowCount)); + } else if (rowCount >= MEDIUM_LOW_WEIGHT_ROWS_THRESHOLD) { + mediumLow.addTabletStats(new TabletStats(tablet.getId(), partitionId, rowCount)); + } else { + low.addTabletStats(new TabletStats(tablet.getId(), partitionId, rowCount)); + } + } + + long totalRows = high.getTotalRows() + mediumHigh.getTotalRows() + mediumLow.getTotalRows() + + low.getTotalRows(); + long sampleRows = high.getSampleRows() + mediumHigh.getSampleRows() + mediumLow.getSampleRows() + + low.getSampleRows(); + + if (totalRows <= 0) { + continue; + } + + List highSampleTablets = high.sample(); + List mediumHighSampleTablets = mediumHigh.sample(); + List mediumLowSampleTablets = mediumLow.sample(); + List lowSampleTablets = low.sample(); + + long totalTablets = high.getTotalTablets() + mediumHigh.getTotalTablets() + mediumLow.getTotalTablets() + + low.getTotalTablets(); + 
long sampleTablets = highSampleTablets.size() + mediumHighSampleTablets.size() + + mediumLowSampleTablets.size() + lowSampleTablets.size(); + + partitionSampleMaps.put(partitionId, + new SampleInfo(null, null, + sampleTablets * 1.0 / totalTablets, sampleRows, totalRows, highSampleTablets, + mediumHighSampleTablets, mediumLowSampleTablets, lowSampleTablets)); + } + } + + public static PartitionSampler create(Table table, List partitions, Map properties) { + double highSampleRatio = Double.parseDouble(properties.getOrDefault(StatsConstants.HIGH_WEIGHT_SAMPLE_RATIO, + "0.5")); + double mediumHighRatio = + Double.parseDouble(properties.getOrDefault(StatsConstants.MEDIUM_HIGH_WEIGHT_SAMPLE_RATIO, + "0.45")); + double mediumLowRatio = + Double.parseDouble(properties.getOrDefault(StatsConstants.MEDIUM_LOW_WEIGHT_SAMPLE_RATIO, + "0.35")); + double lowRatio = Double.parseDouble(properties.getOrDefault(StatsConstants.LOW_WEIGHT_SAMPLE_RATIO, + "0.3")); + int maxSize = Integer.parseInt(properties.getOrDefault(StatsConstants.MAX_SAMPLE_TABLET_NUM, + "5000")); + long sampleRowLimit = Long.parseLong(properties.getOrDefault(StatsConstants.STATISTIC_SAMPLE_COLLECT_ROWS, + String.valueOf(Config.statistic_sample_collect_rows))); + + PartitionSampler sampler = new PartitionSampler(highSampleRatio, mediumHighRatio, mediumLowRatio, lowRatio, + maxSize, sampleRowLimit); + sampler.classifyPartitions(table, partitions); + return sampler; + } +} diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/base/PrimitiveTypeColumnStats.java b/fe/fe-core/src/main/java/com/starrocks/statistic/base/PrimitiveTypeColumnStats.java index b26c29fc5465e6..80d2b0bdbd4b48 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/base/PrimitiveTypeColumnStats.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/base/PrimitiveTypeColumnStats.java @@ -15,6 +15,7 @@ package com.starrocks.statistic.base; import com.starrocks.catalog.Type; +import com.starrocks.statistic.sample.SampleInfo; 
public class PrimitiveTypeColumnStats extends ColumnStats { public PrimitiveTypeColumnStats(String columnName, Type columnType) { @@ -31,7 +32,22 @@ public String getFullDateSize() { } @Override - public String getFullMax() { + public String getSampleDateSize(SampleInfo info) { + if (columnType.getPrimitiveType().isCharFamily()) { + return "IFNULL(SUM(CHAR_LENGTH(" + getQuotedColumnName() + ")) * " + + info.getTotalRowCount() + "/ COUNT(*), 0)"; + } + long typeSize = columnType.getTypeSize(); + return typeSize + " * " + info.getTotalRowCount(); + } + + @Override + public String getSampleNullCount(SampleInfo info) { + return getFullNullCount() + " * " + info.getTotalRowCount() + " / COUNT(*)"; + } + + @Override + public String getMax() { String fn = "MAX"; if (columnType.getPrimitiveType().isCharFamily()) { fn = fn + "(LEFT(" + getQuotedColumnName() + ", 200))"; @@ -43,7 +59,7 @@ public String getFullMax() { } @Override - public String getFullMin() { + public String getMin() { String fn = "MIN"; if (columnType.getPrimitiveType().isCharFamily()) { fn = fn + "(LEFT(" + getQuotedColumnName() + ", 200))"; @@ -55,7 +71,7 @@ public String getFullMin() { } @Override - public String getFullNDV() { + public String getNDV() { return "hex(hll_serialize(IFNULL(hll_raw(" + getQuotedColumnName() + "), hll_empty())))"; } } diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/base/SubFieldColumnStats.java b/fe/fe-core/src/main/java/com/starrocks/statistic/base/SubFieldColumnStats.java index 257f9e622ddcfa..5be18b97f54c4b 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/base/SubFieldColumnStats.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/base/SubFieldColumnStats.java @@ -15,6 +15,7 @@ package com.starrocks.statistic.base; import com.starrocks.catalog.Type; +import com.starrocks.statistic.sample.SampleInfo; import java.util.List; import java.util.stream.Collectors; @@ -45,13 +46,13 @@ public String getQuotedColumnName() { } @Override - public 
String getFullMax() { - return columnStats.getFullMax(); + public String getMax() { + return columnStats.getMax(); } @Override - public String getFullMin() { - return columnStats.getFullMin(); + public String getMin() { + return columnStats.getMin(); } @Override @@ -60,7 +61,17 @@ public String getFullDateSize() { } @Override - public String getFullNDV() { - return columnStats.getFullNDV(); + public String getNDV() { + return columnStats.getNDV(); + } + + @Override + public String getSampleDateSize(SampleInfo info) { + return columnStats.getSampleDateSize(info); + } + + @Override + public String getSampleNullCount(SampleInfo info) { + return columnStats.getSampleNullCount(info); } } diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/base/TabletSampler.java b/fe/fe-core/src/main/java/com/starrocks/statistic/base/TabletSampler.java new file mode 100644 index 00000000000000..b0610126927f32 --- /dev/null +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/base/TabletSampler.java @@ -0,0 +1,71 @@ +// Copyright 2021-present StarRocks, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package com.starrocks.statistic.base; + +import com.google.common.collect.Lists; +import com.starrocks.statistic.sample.TabletStats; + +import java.util.List; +import java.util.stream.Collectors; + +public class TabletSampler { + private final double tabletsSampleRatio; + + private final double tabletReadRatio; + + private final int maxSize; + + private final List tablets = Lists.newArrayList(); + + private final long sampleRowsLimit; + + private long totalRows; + + public TabletSampler(double tabletsSampleRatio, double tabletReadRatio, int maxSize, long sampleRowsLimit) { + this.tabletsSampleRatio = tabletsSampleRatio; + this.tabletReadRatio = tabletReadRatio; + this.maxSize = maxSize; + this.sampleRowsLimit = sampleRowsLimit; + } + + public void addTabletStats(TabletStats tablet) { + this.tablets.add(tablet); + this.totalRows += tablet.getRowCount(); + } + + public List sample() { + int number = (int) Math.ceil(tablets.size() * tabletsSampleRatio); + number = Math.min(number, maxSize); + // sort by row count in descending order + return tablets.stream().sorted((o1, o2) -> Long.compare(o2.getRowCount(), o1.getRowCount())) + .limit(number).collect(Collectors.toList()); + } + + public long getTotalRows() { + return totalRows; + } + + public long getSampleRows() { + return (long) Math.min(sampleRowsLimit, Math.max(totalRows * tabletReadRatio, 1L)); + } + + public long getTotalTablets() { + return tablets.size(); + } + + public void clear() { + + } +} diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/FullQueryJob.java b/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/FullQueryJob.java index 9ee3bbc16cdfab..5af307a0d1dabd 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/FullQueryJob.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/FullQueryJob.java @@ -20,8 +20,7 @@ import com.starrocks.catalog.Table; import com.starrocks.qe.ConnectContext; import com.starrocks.statistic.base.ColumnStats; -import 
com.starrocks.thrift.TStatisticData; -import org.apache.commons.lang.StringEscapeUtils; +import org.apache.velocity.VelocityContext; import java.util.List; import java.util.stream.Collectors; @@ -35,31 +34,7 @@ protected FullQueryJob(ConnectContext context, Database db, } @Override - public void queryStatistics() { - queryFullData(); - } - - private void queryFullData() { - String tableName = StringEscapeUtils.escapeSql(db.getOriginName() + "." + table.getName()); - - List metaSQL = buildBatchFullQuerySQL(columnStats); - for (String sql : metaSQL) { - // execute sql - List dataList = executeStatisticsQuery(sql, context); - - for (TStatisticData data : dataList) { - Partition partition = table.getPartition(data.getPartitionId()); - if (partition == null) { - continue; - } - String partitionName = StringEscapeUtils.escapeSql(partition.getName()); - sqlBuffer.add(createInsertValueSQL(data, tableName, partitionName)); - rowsBuffer.add(createInsertValueExpr(data, tableName, partitionName)); - } - } - } - - private List buildBatchFullQuerySQL(List queryColumns) { + protected List buildQuerySQL() { List metaSQL = Lists.newArrayList(); for (Long partitionId : partitionIdList) { Partition partition = table.getPartition(partitionId); @@ -68,9 +43,14 @@ private List buildBatchFullQuerySQL(List queryColumns) { continue; } - for (ColumnStats columnStat : queryColumns) { - String sql = StatisticSQLs.buildFullSQL(db, table, partition, columnStat, - StatisticSQLs.BATCH_FULL_STATISTIC_TEMPLATE); + for (ColumnStats columnStat : columnStats) { + VelocityContext context = StatisticSQLs.buildBaseContext(db, table, partition, columnStat); + context.put("dataSize", columnStat.getFullDateSize()); + context.put("countNullFunction", columnStat.getFullNullCount()); + context.put("hllFunction", columnStat.getNDV()); + context.put("maxFunction", columnStat.getMax()); + context.put("minFunction", columnStat.getMin()); + String sql = StatisticSQLs.build(context, 
StatisticSQLs.BATCH_FULL_STATISTIC_TEMPLATE); metaSQL.add(sql); } } diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/HyperQueryJob.java b/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/HyperQueryJob.java index e56f76dc0d4200..ac4e7f6e9d9a05 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/HyperQueryJob.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/HyperQueryJob.java @@ -23,6 +23,7 @@ import com.starrocks.catalog.Function; import com.starrocks.catalog.FunctionSet; import com.starrocks.catalog.OlapTable; +import com.starrocks.catalog.Partition; import com.starrocks.catalog.Table; import com.starrocks.catalog.Type; import com.starrocks.qe.ConnectContext; @@ -30,7 +31,7 @@ import com.starrocks.statistic.StatisticExecutor; import com.starrocks.statistic.base.ColumnClassifier; import com.starrocks.statistic.base.ColumnStats; -import com.starrocks.statistic.sample.SampleInfo; +import com.starrocks.statistic.base.PartitionSampler; import com.starrocks.thrift.TStatisticData; import org.apache.commons.lang.StringEscapeUtils; import org.apache.logging.log4j.LogManager; @@ -70,7 +71,28 @@ protected HyperQueryJob(ConnectContext context, Database db, Table table, List sqlList = buildQuerySQL(); + for (String sql : sqlList) { + // execute sql + List dataList = executeStatisticsQuery(sql, context); + + for (TStatisticData data : dataList) { + Partition partition = table.getPartition(data.getPartitionId()); + if (partition == null) { + continue; + } + String partitionName = StringEscapeUtils.escapeSql(partition.getName()); + sqlBuffer.add(createInsertValueSQL(data, tableName, partitionName)); + rowsBuffer.add(createInsertValueExpr(data, tableName, partitionName)); + } + } + } + + protected List buildQuerySQL() { + return Collections.emptyList(); + } public List> getStatisticsData() { List> r = rowsBuffer; @@ -203,10 +225,27 @@ public String toString() { public static List createFullQueryJobs(ConnectContext context, 
Database db, Table table, List columnNames, List columnTypes, List partitionIdList, int batchLimit) { + ColumnClassifier classifier = ColumnClassifier.of(columnNames, columnTypes, table); + + List supportedStats = classifier.getColumnStats(); + List dataCollectColumns = + supportedStats.stream().filter(ColumnStats::supportData).collect(Collectors.toList()); - ColumnClassifier classifier = ColumnClassifier.of(columnNames, columnTypes, table, new SampleInfo()); - int splitSize = Math.max(1, batchLimit / columnNames.size()); + List> pids = Lists.partition(partitionIdList, batchLimit); + List jobs = Lists.newArrayList(); + for (List pid : pids) { + if (!dataCollectColumns.isEmpty()) { + jobs.add(new FullQueryJob(context, db, table, dataCollectColumns, pid)); + } + } + return jobs; + } + public static List createSampleQueryJobs(ConnectContext context, Database db, Table table, + List columnNames, List columnTypes, + List partitionIdList, int batchLimit, + PartitionSampler sampler) { + ColumnClassifier classifier = ColumnClassifier.of(columnNames, columnTypes, table); List supportedStats = classifier.getColumnStats(); List metaCollectColumns = @@ -214,14 +253,14 @@ public static List createFullQueryJobs(ConnectContext context, Da List dataCollectColumns = supportedStats.stream().filter(c -> !c.supportMeta() && c.supportData()).collect(Collectors.toList()); - List> pids = Lists.partition(partitionIdList, splitSize); + List> pids = Lists.partition(partitionIdList, batchLimit); List jobs = Lists.newArrayList(); for (List pid : pids) { if (!metaCollectColumns.isEmpty()) { - jobs.add(new MetaQueryJob(context, db, table, metaCollectColumns, pid)); + jobs.add(new MetaQueryJob(context, db, table, metaCollectColumns, pid, sampler)); } if (!dataCollectColumns.isEmpty()) { - jobs.add(new FullQueryJob(context, db, table, dataCollectColumns, pid)); + jobs.add(new SampleQueryJob(context, db, table, dataCollectColumns, pid, sampler)); } } return jobs; diff --git 
a/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/MetaQueryJob.java b/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/MetaQueryJob.java index 145c8b71789d41..f9f908876f8f8c 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/MetaQueryJob.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/MetaQueryJob.java @@ -16,47 +16,48 @@ import com.google.common.collect.Lists; import com.google.common.collect.Maps; -import com.starrocks.analysis.Expr; import com.starrocks.catalog.Database; import com.starrocks.catalog.Partition; import com.starrocks.catalog.Table; import com.starrocks.qe.ConnectContext; import com.starrocks.statistic.base.ColumnStats; +import com.starrocks.statistic.base.PartitionSampler; import com.starrocks.thrift.TStatisticData; import org.apache.commons.lang.StringEscapeUtils; +import org.apache.velocity.VelocityContext; -import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +/* + * Split sample statistics query: + * 1. max/min/count(1) to meta query + * 2. 
length/count null/ndv to data query + */ public class MetaQueryJob extends HyperQueryJob { // column_partition -> rows index, for find row private final Map rowsIndex = Maps.newHashMap(); - - private final List tempSqlBuffer = Lists.newArrayList(); - private final List> tempRowsBuffer = Lists.newArrayList(); + private final List tempRowsBuffer = Lists.newArrayList(); + private final PartitionSampler sampler; protected MetaQueryJob(ConnectContext context, Database db, Table table, List columnStats, - List partitionIdList) { + List partitionIdList, PartitionSampler sampler) { super(context, db, table, columnStats, partitionIdList); + this.sampler = sampler; } @Override public void queryStatistics() { - tempSqlBuffer.clear(); tempRowsBuffer.clear(); - queryMeta(columnStats); - queryDataNDV(columnStats); + queryMetaMetric(columnStats); + queryDataMetric(columnStats); - tempSqlBuffer.clear(); tempRowsBuffer.clear(); } - private void queryMeta(List queryColumns) { - String tableName = StringEscapeUtils.escapeSql(db.getOriginName() + "." 
+ table.getName()); - + private void queryMetaMetric(List queryColumns) { List metaSQL = buildBatchMetaQuerySQL(queryColumns); for (String sql : metaSQL) { // execute sql @@ -66,13 +67,10 @@ private void queryMeta(List queryColumns) { if (partition == null) { continue; } - String partitionName = StringEscapeUtils.escapeSql(partition.getName()); // init rowsIndex.put(data.getColumnName() + "_" + data.getPartitionId(), tempRowsBuffer.size()); - - tempSqlBuffer.add(createInsertValueSQL(data, tableName, partitionName)); - tempRowsBuffer.add(createInsertValueExpr(data, tableName, partitionName)); + tempRowsBuffer.add(data); } } } @@ -87,8 +85,10 @@ private List buildBatchMetaQuerySQL(List queryColumns) { } for (ColumnStats columnStat : queryColumns) { - String sql = StatisticSQLs.buildFullSQL(db, table, partition, columnStat, - StatisticSQLs.BATCH_META_STATISTIC_TEMPLATE); + VelocityContext context = StatisticSQLs.buildBaseContext(db, table, partition, columnStat); + context.put("maxFunction", columnStat.getMax()); + context.put("minFunction", columnStat.getMin()); + String sql = StatisticSQLs.build(context, StatisticSQLs.BATCH_META_STATISTIC_TEMPLATE); metaSQL.add(sql); } } @@ -98,7 +98,9 @@ private List buildBatchMetaQuerySQL(List queryColumns) { return l.stream().map(sql -> String.join(" UNION ALL ", sql)).collect(Collectors.toList()); } - private void queryDataNDV(List queryColumns) { + private void queryDataMetric(List queryColumns) { + String tableName = StringEscapeUtils.escapeSql(db.getOriginName() + "." 
+ table.getName()); + List metaSQL = buildBatchNDVQuerySQL(queryColumns); for (String sql : metaSQL) { // execute sql @@ -112,17 +114,24 @@ private void queryDataNDV(List queryColumns) { if (!rowsIndex.containsKey(key)) { continue; } + String partitionName = StringEscapeUtils.escapeSql(partition.getName()); int index = rowsIndex.get(key); - tempRowsBuffer.get(index).set(8, hllDeserialize(data.getHll())); // real hll - - sqlBuffer.add(tempSqlBuffer.get(index)); - rowsBuffer.add(tempRowsBuffer.get(index)); + TStatisticData tempData = tempRowsBuffer.get(index); + tempData.setNullCount(data.getNullCount()); // real null count + tempData.setDataSize(data.getDataSize()); // real data size + tempData.setHll(data.getHll()); // real hll + sqlBuffer.add(createInsertValueSQL(tempData, tableName, partitionName)); + rowsBuffer.add(createInsertValueExpr(tempData, tableName, partitionName)); } } } private List buildBatchNDVQuerySQL(List queryColumns) { + int parts = Math.max(1, context.getSessionVariable().getStatisticCollectParallelism()); + List> partColumns = Lists.partition(queryColumns, parts); + pipelineDop = partColumns.size() < parts ? 
parts / partColumns.size() : 1; + List metaSQL = Lists.newArrayList(); for (Long partitionId : partitionIdList) { Partition partition = table.getPartition(partitionId); @@ -130,26 +139,12 @@ private List buildBatchNDVQuerySQL(List queryColumns) { // statistics job doesn't lock DB, partition may be dropped, skip it continue; } - - for (ColumnStats columnStat : queryColumns) { - if (!rowsIndex.containsKey(columnStat.getColumnNameStr() + "_" + partition.getId())) { - continue; - } - - String sql = StatisticSQLs.buildFullSQL(db, table, partition, columnStat, - StatisticSQLs.BATCH_NDV_STATISTIC_TEMPLATE); + for (List part : partColumns) { + String sql = StatisticSQLs.buildSampleSQL(db, table, partition, part, sampler, + StatisticSQLs.BATCH_DATA_STATISTIC_SELECT_TEMPLATE); metaSQL.add(sql); } } - - if (metaSQL.isEmpty()) { - return Collections.emptyList(); - } - - int parts = Math.max(1, context.getSessionVariable().getStatisticCollectParallelism()); - List> l = Lists.partition(metaSQL, parts); - pipelineDop = l.size() < parts ? parts / l.size() : 1; - return l.stream().map(sql -> String.join(" UNION ALL ", sql)).collect(Collectors.toList()); - + return metaSQL; } } diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/SampleQueryJob.java b/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/SampleQueryJob.java new file mode 100644 index 00000000000000..0ec108207f4f33 --- /dev/null +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/SampleQueryJob.java @@ -0,0 +1,58 @@ +// Copyright 2021-present StarRocks, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.starrocks.statistic.hyper; + +import com.google.common.collect.Lists; +import com.starrocks.catalog.Database; +import com.starrocks.catalog.Partition; +import com.starrocks.catalog.Table; +import com.starrocks.qe.ConnectContext; +import com.starrocks.statistic.base.ColumnStats; +import com.starrocks.statistic.base.PartitionSampler; + +import java.util.List; + +public class SampleQueryJob extends HyperQueryJob { + private final PartitionSampler sampler; + + protected SampleQueryJob(ConnectContext context, Database db, + Table table, + List columnStats, + List partitionIdList, PartitionSampler sampler) { + super(context, db, table, columnStats, partitionIdList); + this.sampler = sampler; + } + + @Override + protected List buildQuerySQL() { + int parts = Math.max(1, context.getSessionVariable().getStatisticCollectParallelism()); + + List> partColumns = Lists.partition(columnStats, parts); + List sampleSQLs = Lists.newArrayList(); + for (Long partitionId : partitionIdList) { + Partition partition = table.getPartition(partitionId); + if (partition == null) { + // statistics job doesn't lock DB, partition may be dropped, skip it + continue; + } + for (List stats : partColumns) { + String sql = StatisticSQLs.buildSampleSQL(db, table, partition, stats, sampler, + StatisticSQLs.BATCH_SAMPLE_STATISTIC_SELECT_TEMPLATE); + sampleSQLs.add(sql); + } + } + return sampleSQLs; + } +} diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/StatisticSQLs.java b/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/StatisticSQLs.java index 
270e126ec9cc66..2d6fa9529e2a93 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/StatisticSQLs.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/hyper/StatisticSQLs.java @@ -14,15 +14,23 @@ package com.starrocks.statistic.hyper; +import com.google.common.collect.Lists; import com.starrocks.catalog.Database; import com.starrocks.catalog.Partition; import com.starrocks.catalog.Table; +import com.starrocks.common.Config; import com.starrocks.statistic.StatsConstants; import com.starrocks.statistic.base.ColumnStats; +import com.starrocks.statistic.base.PartitionSampler; +import com.starrocks.statistic.sample.SampleInfo; +import com.starrocks.statistic.sample.TabletStats; import org.apache.velocity.VelocityContext; import org.apache.velocity.app.VelocityEngine; import java.io.StringWriter; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; public class StatisticSQLs { private static final VelocityEngine DEFAULT_VELOCITY_ENGINE; @@ -62,24 +70,46 @@ public class StatisticSQLs { ", cast($partitionId as BIGINT)" + // BIGINT, partition_id ", '$columnNameStr'" + // VARCHAR, column_name ", cast(COUNT(*) as BIGINT)" + // BIGINT, row_count - ", cast($dataSize as BIGINT)" + // BIGINT, data_size + ", cast(0 as BIGINT)" + // BIGINT, data_size ", '00'" + // VARBINARY, ndv - ", cast($countNullFunction as BIGINT)" + // BIGINT, null_count + ", cast(0 as BIGINT)" + // BIGINT, null_count ", $maxFunction" + // VARCHAR, max ", $minFunction " + // VARCHAR, min " FROM `$dbName`.`$tableName` partitions(`$partitionName`) [_META_]"; - public static final String BATCH_NDV_STATISTIC_TEMPLATE = "SELECT cast($version as INT)" + + public static final String BATCH_DATA_STATISTIC_TEMPLATE = "SELECT cast($version as INT)" + ", cast($partitionId as BIGINT)" + // BIGINT, partition_id ", '$columnNameStr'" + // VARCHAR, column_name ", cast(0 as BIGINT)" + // BIGINT, row_count - ", cast(0 as BIGINT)" + // BIGINT, data_size + ", cast($dataSize 
as BIGINT)" + // BIGINT, data_size ", $hllFunction" + // VARBINARY, ndv - ", cast(0 as BIGINT)" + // BIGINT, null_count + ", cast($countNullFunction as BIGINT)" + // BIGINT, null_count ", ''" + // VARCHAR, max ", '' " + // VARCHAR, min " FROM `$dbName`.`$tableName` partitions(`$partitionName`)"; + public static final String BATCH_DATA_STATISTIC_SELECT_TEMPLATE = "SELECT cast($version as INT)" + + ", cast($partitionId as BIGINT)" + // BIGINT, partition_id + ", '$columnNameStr'" + // VARCHAR, column_name + ", cast(0 as BIGINT)" + // BIGINT, row_count + ", cast($dataSize as BIGINT)" + // BIGINT, data_size + ", $hllFunction" + // VARBINARY, ndv + ", cast($countNullFunction as BIGINT)" + // BIGINT, null_count + ", ''" + // VARCHAR, max + ", '' " + // VARCHAR, min + " FROM base_cte_table "; + + public static final String BATCH_SAMPLE_STATISTIC_SELECT_TEMPLATE = "SELECT cast($version as INT)" + + ", cast($partitionId as BIGINT)" + // BIGINT + ", '$columnNameStr'" + // VARCHAR + ", cast($rowCount as BIGINT)" + // BIGINT + ", cast($dataSize as BIGINT)" + // BIGINT + ", $hllFunction" + // VARBINARY + ", cast($countNullFunction as BIGINT)" + // BIGINT + ", $maxFunction" + // VARCHAR + ", $minFunction " + // VARCHAR + " FROM base_cte_table "; + public static String build(VelocityContext context, String template) { StringWriter sw = new StringWriter(); DEFAULT_VELOCITY_ENGINE.evaluate(context, sw, "", template); @@ -87,9 +117,7 @@ public static String build(VelocityContext context, String template) { } public static String buildFullSQL(Database db, Table table, Partition p, ColumnStats stats, String template) { - StringBuilder builder = new StringBuilder(); VelocityContext context = new VelocityContext(); - String columnNameStr = stats.getColumnNameStr(); String quoteColumnName = stats.getQuotedColumnName(); context.put("version", StatsConstants.STATISTIC_BATCH_VERSION); @@ -101,10 +129,82 @@ public static String buildFullSQL(Database db, Table table, Partition p, ColumnS 
context.put("tableName", table.getName()); context.put("quoteColumnName", quoteColumnName); context.put("countNullFunction", stats.getFullNullCount()); - context.put("hllFunction", stats.getFullNDV()); - context.put("maxFunction", stats.getFullMax()); - context.put("minFunction", stats.getFullMin()); - builder.append(StatisticSQLs.build(context, template)); - return builder.toString(); + context.put("hllFunction", stats.getNDV()); + context.put("maxFunction", stats.getMax()); + context.put("minFunction", stats.getMin()); + return StatisticSQLs.build(context, template); + } + + public static VelocityContext buildBaseContext(Database db, Table table, Partition p, ColumnStats stats) { + VelocityContext context = new VelocityContext(); + String columnNameStr = stats.getColumnNameStr(); + String quoteColumnName = stats.getQuotedColumnName(); + context.put("version", StatsConstants.STATISTIC_BATCH_VERSION); + context.put("partitionId", p.getId()); + context.put("columnNameStr", columnNameStr); + context.put("partitionName", p.getName()); + context.put("dbName", db.getOriginName()); + context.put("tableName", table.getName()); + context.put("quoteColumnName", quoteColumnName); + return context; + } + + public static String buildSampleSQL(Database db, Table table, Partition p, List stats, + PartitionSampler sampler, String template) { + if (p == null || !p.hasData()) { + return null; + } + String tableName = "`" + db.getOriginName() + "`.`" + table.getName() + "`"; + + SampleInfo info = sampler.getSampleInfo(p.getId()); + List groupSQLs = Lists.newArrayList(); + StringBuilder sqlBuilder = new StringBuilder(); + groupSQLs.add(generateRatioTable(tableName, sampler.getSampleRowsLimit(), info.getHighWeightTablets(), + sampler.getHighRatio(), "t_high")); + groupSQLs.add(generateRatioTable(tableName, sampler.getSampleRowsLimit(), info.getMediumHighWeightTablets(), + sampler.getMediumHighRatio(), "t_medium_high")); + groupSQLs.add(generateRatioTable(tableName, 
sampler.getSampleRowsLimit(), info.getMediumLowWeightTablets(), + sampler.getMediumLowRatio(), "t_medium_low")); + groupSQLs.add(generateRatioTable(tableName, sampler.getSampleRowsLimit(), info.getLowWeightTablets(), + sampler.getLowRatio(), "t_low")); + if (groupSQLs.stream().allMatch(Objects::isNull)) { + groupSQLs.add("SELECT * FROM " + tableName + " LIMIT " + Config.statistic_sample_collect_rows); + } + + sqlBuilder.append("with base_cte_table as ("); + sqlBuilder.append(groupSQLs.stream().filter(Objects::nonNull).collect(Collectors.joining(" UNION ALL "))); + sqlBuilder.append(") "); + + groupSQLs.clear(); + + VelocityContext context = new VelocityContext(); + for (ColumnStats stat : stats) { + String columnNameStr = stat.getColumnNameStr(); + context.put("version", StatsConstants.STATISTIC_BATCH_VERSION); + context.put("columnNameStr", columnNameStr); + context.put("rowCount", info.getTotalRowCount()); + context.put("dataSize", stat.getSampleDateSize(info)); + context.put("hllFunction", stat.getNDV()); + context.put("countNullFunction", stat.getSampleNullCount(info)); + context.put("maxFunction", stat.getMax()); + context.put("minFunction", stat.getMin()); + groupSQLs.add(StatisticSQLs.build(context, template)); + } + sqlBuilder.append(String.join(" UNION ALL ", groupSQLs)); + return sqlBuilder.toString(); + } + + private static String generateRatioTable(String table, long limit, + List tablets, double ratio, String alias) { + if (tablets.isEmpty()) { + return null; + } + return String.format(" SELECT * FROM (SELECT * " + + " FROM %s tablet(%s) " + + " WHERE rand() <= %f " + + " LIMIT %d) %s", + table, + tablets.stream().map(t -> String.valueOf(t.getTabletId())).collect(Collectors.joining(", ")), + ratio, limit, alias); } } diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/sample/SampleInfo.java b/fe/fe-core/src/main/java/com/starrocks/statistic/sample/SampleInfo.java index 10c579c8ba57bd..709be9a8a487e5 100644 --- 
a/fe/fe-core/src/main/java/com/starrocks/statistic/sample/SampleInfo.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/sample/SampleInfo.java @@ -25,11 +25,6 @@ import static com.starrocks.statistic.StatsConstants.STATISTICS_DB_NAME; public class SampleInfo { - - private final String dbName; - - private final String tableName; - private final double tabletSampleRatio; private final long sampleRowCount; @@ -47,8 +42,6 @@ public class SampleInfo { private final List lowWeightTablets; public SampleInfo() { - this.dbName = null; - this.tableName = null; this.tabletSampleRatio = 1; this.sampleRowCount = 1; this.totalRowCount = 1; @@ -65,8 +58,6 @@ public SampleInfo(String dbName, String tableName, double tabletSampleRatio, List mediumHighWeightTablets, List mediumLowWeightTablets, List lowWeightTablets) { - this.dbName = dbName; - this.tableName = tableName; this.tabletSampleRatio = tabletSampleRatio; this.sampleRowCount = sampleRowCount; this.totalRowCount = totalRowCount; @@ -77,20 +68,36 @@ public SampleInfo(String dbName, String tableName, double tabletSampleRatio, this.lowWeightTablets = lowWeightTablets; } - public double getTabletSampleRatio() { - return tabletSampleRatio; + public long getTotalRowCount() { + return totalRowCount; } public double getRowSampleRatio() { return rowSampleRatio; } - public long getSampleRowCount() { - return sampleRowCount; + public List getHighWeightTablets() { + return highWeightTablets; } - public long getTotalRowCount() { - return totalRowCount; + public List getMediumHighWeightTablets() { + return mediumHighWeightTablets; + } + + public List getMediumLowWeightTablets() { + return mediumLowWeightTablets; + } + + public List getLowWeightTablets() { + return lowWeightTablets; + } + + public double getTabletSampleRatio() { + return tabletSampleRatio; + } + + public long getSampleRowCount() { + return sampleRowCount; } public int getMaxSampleTabletNum() { diff --git 
a/fe/fe-core/src/main/java/com/starrocks/statistic/sample/TabletStats.java b/fe/fe-core/src/main/java/com/starrocks/statistic/sample/TabletStats.java index 2d65a333219f3a..7920669abded44 100644 --- a/fe/fe-core/src/main/java/com/starrocks/statistic/sample/TabletStats.java +++ b/fe/fe-core/src/main/java/com/starrocks/statistic/sample/TabletStats.java @@ -17,12 +17,10 @@ public class TabletStats { private final long tabletId; - private final long partitionId; private final long rowCount; public TabletStats(long tabletId, long partitionId, long rowCount) { this.tabletId = tabletId; - this.partitionId = partitionId; this.rowCount = rowCount; } @@ -30,10 +28,6 @@ public long getTabletId() { return tabletId; } - public long getPartitionId() { - return partitionId; - } - public long getRowCount() { return rowCount; }