Skip to content

Commit

Permalink
Update sample statistics
Browse files Browse the repository at this point in the history
Signed-off-by: Seaven <[email protected]>
  • Loading branch information
Seaven committed Nov 11, 2024
1 parent 94f4ee9 commit 1f75bcf
Show file tree
Hide file tree
Showing 17 changed files with 609 additions and 184 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import com.starrocks.sql.ast.QueryStatement;
import com.starrocks.sql.ast.StatementBase;
import com.starrocks.sql.ast.ValuesRelation;
import com.starrocks.statistic.base.PartitionSampler;
import com.starrocks.statistic.hyper.HyperQueryJob;
import org.apache.commons.lang.StringUtils;
import org.apache.logging.log4j.LogManager;
Expand All @@ -47,7 +48,7 @@ public class HyperStatisticsCollectJob extends StatisticsCollectJob {

private final List<Long> partitionIdList;

private final int batchLimit;
private final int batchRowsLimit;
private final List<String> sqlBuffer = Lists.newArrayList();
private final List<List<Expr>> rowsBuffer = Lists.newArrayList();

Expand All @@ -57,15 +58,15 @@ public HyperStatisticsCollectJob(Database db, Table table, List<Long> partitionI
super(db, table, columns, type, scheduleType, properties);
this.partitionIdList = partitionIdList;
// hll serialize to hex, about 32kb
this.batchLimit = (int) Math.max(1, Config.statistic_full_collect_buffer / 33 / 1024);
this.batchRowsLimit = (int) Math.max(1, Config.statistic_full_collect_buffer / 33 / 1024);
}

public HyperStatisticsCollectJob(Database db, Table table, List<Long> partitionIdList, List<String> columnNames,
List<Type> columnTypes, StatsConstants.AnalyzeType type,
StatsConstants.ScheduleType scheduleType, Map<String, String> properties) {
super(db, table, columnNames, columnTypes, type, scheduleType, properties);
this.partitionIdList = partitionIdList;
this.batchLimit = (int) Math.max(1, Config.statistic_full_collect_buffer / 33 / 1024);
this.batchRowsLimit = (int) Math.max(1, Config.statistic_full_collect_buffer / 33 / 1024);
}

@Override
Expand All @@ -76,8 +77,16 @@ public void collect(ConnectContext context, AnalyzeStatus analyzeStatus) throws
context.getSessionVariable().setEnableAnalyzePhasePruneColumns(true);
context.getSessionVariable().setPipelineDop(context.getSessionVariable().getStatisticCollectParallelism());

List<HyperQueryJob> queryJobs = HyperQueryJob.createFullQueryJobs(context, db, table, columnNames,
columnTypes, partitionIdList, batchLimit);
int splitSize = Math.max(1, batchRowsLimit / columnNames.size());
List<HyperQueryJob> queryJobs;
if (type == StatsConstants.AnalyzeType.FULL) {
queryJobs = HyperQueryJob.createFullQueryJobs(context, db, table, columnNames, columnTypes,
partitionIdList, splitSize);
} else {
PartitionSampler sampler = PartitionSampler.create(table, partitionIdList, properties);
queryJobs = HyperQueryJob.createSampleQueryJobs(context, db, table, columnNames, columnTypes,
partitionIdList, splitSize, sampler);
}

long queryTotals = 0;
long queryFailures = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ public static List<StatisticsCollectJob> buildStatisticsCollectJob(NativeAnalyze
return Collections.emptyList();
}
createJob(statsJobs, nativeAnalyzeJob, db, GlobalStateMgr.getCurrentState().getLocalMetastore()
.getTable(db.getId(), nativeAnalyzeJob.getTableId()),
.getTable(db.getId(), nativeAnalyzeJob.getTableId()),
nativeAnalyzeJob.getColumns(), nativeAnalyzeJob.getColumnTypes());
}

Expand All @@ -108,8 +108,13 @@ public static StatisticsCollectJob buildStatisticsCollectJob(Database db, Table

LOG.debug("statistics job work on table: {}, type: {}", table.getName(), analyzeType.name());
if (analyzeType.equals(StatsConstants.AnalyzeType.SAMPLE)) {
return new SampleStatisticsCollectJob(db, table, columnNames, columnTypes,
StatsConstants.AnalyzeType.SAMPLE, scheduleType, properties);
if (Config.statistic_use_meta_statistics) {
return new HyperStatisticsCollectJob(db, table, partitionIdList, columnNames, columnTypes,
StatsConstants.AnalyzeType.SAMPLE, scheduleType, properties);
} else {
return new SampleStatisticsCollectJob(db, table, columnNames, columnTypes,
StatsConstants.AnalyzeType.SAMPLE, scheduleType, properties);
}
} else if (analyzeType.equals(StatsConstants.AnalyzeType.HISTOGRAM)) {
return new HistogramStatisticsCollectJob(db, table, columnNames, columnTypes, scheduleType, properties);
} else {
Expand Down Expand Up @@ -196,7 +201,8 @@ public static StatisticsCollectJob buildExternalStatisticsCollectJob(String cata
StatsConstants.AnalyzeType analyzeType,
StatsConstants.ScheduleType scheduleType,
Map<String, String> properties) {
List<Type> columnTypes = columnNames.stream().map(col -> table.getColumn(col).getType()).collect(Collectors.toList());
List<Type> columnTypes =
columnNames.stream().map(col -> table.getColumn(col).getType()).collect(Collectors.toList());
return buildExternalStatisticsCollectJob(catalogName, db, table, partitionNames, columnNames, columnTypes,
analyzeType, scheduleType, properties);
}
Expand Down Expand Up @@ -300,7 +306,8 @@ private static void createExternalAnalyzeJob(List<StatisticsCollectJob> allTable
if (basicStatsMeta != null) {
// check table row count
List<ConnectorTableColumnStats> columnStatisticList =
GlobalStateMgr.getCurrentState().getStatisticStorage().getConnectorTableStatisticsSync(table, columnNames);
GlobalStateMgr.getCurrentState().getStatisticStorage()
.getConnectorTableStatisticsSync(table, columnNames);
List<ConnectorTableColumnStats> validColumnStatistics = columnStatisticList.stream().
filter(columnStatistic -> !columnStatistic.isUnknown()).collect(Collectors.toList());

Expand Down Expand Up @@ -459,7 +466,7 @@ private static void createJob(List<StatisticsCollectJob> allTableJobMap, NativeA
job.getAnalyzeType() == StatsConstants.AnalyzeType.HISTOGRAM ?
Config.statistic_auto_collect_histogram_interval :
(sumDataSize > Config.statistic_auto_collect_small_table_size ?
Config.statistic_auto_collect_large_table_interval :
Config.statistic_auto_collect_large_table_interval :
Config.statistic_auto_collect_small_table_interval);

long timeInterval = job.getProperties().get(StatsConstants.STATISTIC_AUTO_COLLECT_INTERVAL) != null ?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import com.starrocks.catalog.StructType;
import com.starrocks.catalog.Table;
import com.starrocks.catalog.Type;
import com.starrocks.statistic.sample.SampleInfo;

import java.util.List;

Expand All @@ -29,28 +28,20 @@ public class ColumnClassifier {

private final List<ColumnStats> unSupportStats = Lists.newArrayList();

public static ColumnClassifier of(List<String> columnNames, List<Type> columnTypes, Table table,
SampleInfo sampleInfo) {
public static ColumnClassifier of(List<String> columnNames, List<Type> columnTypes, Table table) {
ColumnClassifier columnClassifier = new ColumnClassifier();
columnClassifier.classifyColumnStats(columnNames, columnTypes, table, sampleInfo);
columnClassifier.classifyColumnStats(columnNames, columnTypes, table);
return columnClassifier;
}

private void classifyColumnStats(List<String> columnNames, List<Type> columnTypes, Table table,
SampleInfo sampleInfo) {
boolean onlyOneDistributionCol = table.getDistributionColumnNames().size() == 1;
private void classifyColumnStats(List<String> columnNames, List<Type> columnTypes, Table table) {
for (int i = 0; i < columnNames.size(); i++) {
String columnName = columnNames.get(i);
Type columnType = columnTypes.get(i);

if (table.getColumn(columnName) != null) {
if (columnType.canStatistic()) {
if (onlyOneDistributionCol && table.getDistributionColumnNames().contains(columnName)) {
columnStats.add(new DistributionColumnStats(columnName, columnType, sampleInfo));
onlyOneDistributionCol = false;
} else {
columnStats.add(new PrimitiveTypeColumnStats(columnName, columnType));
}
columnStats.add(new PrimitiveTypeColumnStats(columnName, columnType));
} else {
unSupportStats.add(new ComplexTypeColumnStats(columnName, columnType));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package com.starrocks.statistic.base;

import com.starrocks.catalog.Type;
import com.starrocks.statistic.sample.SampleInfo;
import org.apache.commons.lang.StringEscapeUtils;

public abstract class ColumnStats {
Expand Down Expand Up @@ -48,20 +49,20 @@ public String getQuotedColumnName() {
return "`" + columnName + "`";
}

public abstract String getFullMax();
public abstract String getMax();

public abstract String getFullMin();

/**
 * Returns the SQL aggregate expression for the table's full (non-sampled)
 * row count.
 */
public String getFullRowCount() {
    return "COUNT(*)";
}
public abstract String getMin();

public abstract String getFullDateSize();

/**
 * Builds the SQL expression for this column's NULL count over the whole
 * table: total row count minus the count of non-null values (SQL COUNT(col)
 * skips NULLs).
 */
public String getFullNullCount() {
    String col = getQuotedColumnName();
    return String.format("COUNT(*) - COUNT(%s)", col);
}

public abstract String getFullNDV();
public abstract String getNDV();

public abstract String getSampleDateSize(SampleInfo info);

public abstract String getSampleNullCount(SampleInfo info);

}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package com.starrocks.statistic.base;

import com.starrocks.catalog.Type;
import com.starrocks.statistic.sample.SampleInfo;

public class ComplexTypeColumnStats extends ColumnStats {

Expand All @@ -28,12 +29,22 @@ public String getFullDateSize() {
}

@Override
public String getFullMax() {
/**
 * Returns a SQL expression estimating the column's total data size from a
 * sample: the fixed per-value type size multiplied by the table's total row
 * count.
 *
 * NOTE(review): the name says "DateSize" but this computes a data size —
 * presumably a typo for getSampleDataSize carried over from the declaring
 * abstract method; confirm against ColumnStats before renaming.
 */
public String getSampleDateSize(SampleInfo info) {
    return columnType.getTypeSize() + " * " + info.getTotalRowCount();
}

// Complex-typed columns contribute a constant "0" as their sampled NULL
// count — the literal is emitted directly into the statistics SQL.
@Override
public String getSampleNullCount(SampleInfo info) {
    return "0";
}

// Max is not computed for complex types; emit an empty SQL string literal
// ('') as a placeholder value.
@Override
public String getMax() {
    return "''";
}

@Override
public String getFullMin() {
public String getMin() {
return "''";
}

Expand All @@ -43,7 +54,7 @@ public String getFullNullCount() {
}

@Override
public String getFullNDV() {
public String getNDV() {
return "00";
}
}

This file was deleted.

Loading

0 comments on commit 1f75bcf

Please sign in to comment.