Skip to content

Commit

Permalink
[ML] Fix data drift calculating inaccurate p value when range is not …
Browse files Browse the repository at this point in the history
…of uniform distribution (elastic#168757)
  • Loading branch information
qn895 authored and hop-dev committed Oct 18, 2023
1 parent d9ff342 commit f3facd7
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,6 @@ export const DataDriftPage: FC<Props> = ({ initialSettings }) => {
label={comparisonIndexPatternLabel}
randomSampler={randomSamplerProd}
reload={forceRefresh}
brushSelectionUpdateHandler={brushSelectionUpdate}
documentCountStats={documentStatsProd.documentCountStats}
documentCountStatsSplit={documentStatsProd.documentCountStatsCompare}
isBrushCleared={isBrushCleared}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ export interface DocumentCountContentProps
| 'interval'
| 'chartPointsSplitLabel'
> {
brushSelectionUpdateHandler: BrushSelectionUpdateHandler;
brushSelectionUpdateHandler?: BrushSelectionUpdateHandler;
documentCountStats?: DocumentCountStats;
documentCountStatsSplit?: DocumentCountStats;
documentCountStatsSplitLabel?: string;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import { isDefined } from '@kbn/ml-is-defined';
import { computeChi2PValue, type Histogram } from '@kbn/ml-chi2test';
import { mapAndFlattenFilters } from '@kbn/data-plugin/public';

import type { AggregationsRangeBucketKeys } from '@elastic/elasticsearch/lib/api/types';
import { createMergedEsQuery } from '../index_data_visualizer/utils/saved_search_utils';
import { useDataVisualizerKibana } from '../kibana_context';

Expand Down Expand Up @@ -378,6 +379,7 @@ const fetchComparisonDriftedData = async ({
fields,
baselineResponseAggs,
baseRequest,
baselineRequest,
randomSamplerWrapper,
signal,
}: {
Expand All @@ -387,10 +389,19 @@ const fetchComparisonDriftedData = async ({
randomSamplerWrapper: RandomSamplerWrapper;
signal: AbortSignal;
baselineResponseAggs: object;
baselineRequest: EsRequestParams;
}) => {
const driftedRequest = { ...baseRequest };

const driftedRequestAggs: Record<string, estypes.AggregationsAggregationContainer> = {};

// Since aggregation is not able to split the values into distinct 5% intervals,
// this breaks our assumption of uniform distributed fractions in the`ks_test`.
// So, to fix this in the general case, we need to run an additional ranges agg to get the doc count for the ranges
// that we get from the percentiles aggregation
// and use it in the bucket_count_ks_test
const rangesRequestAggs: Record<string, estypes.AggregationsAggregationContainer> = {};

for (const { field, type } of fields) {
if (
isPopulatedObject(baselineResponseAggs, [`${field}_percentiles`]) &&
Expand All @@ -410,19 +421,16 @@ const fetchComparisonDriftedData = async ({
ranges.push({ from: percentiles[idx - 1], to: val });
}
});
// add range and bucket_count_ks_test to the request
driftedRequestAggs[`${field}_ranges`] = {
const rangeAggs = {
range: {
field,
ranges,
},
};
driftedRequestAggs[`${field}_ks_test`] = {
bucket_count_ks_test: {
buckets_path: `${field}_ranges>_count`,
alternative: ['two_sided'],
},
};
// add range and bucket_count_ks_test to the request
rangesRequestAggs[`${field}_ranges`] = rangeAggs;
driftedRequestAggs[`${field}_ranges`] = rangeAggs;

// add stats aggregation to the request
driftedRequestAggs[`${field}_stats`] = {
stats: {
Expand All @@ -441,13 +449,66 @@ const fetchComparisonDriftedData = async ({
}
}

// Compute fractions based on results of ranges
const rangesResp = await dataSearch(
{
...baselineRequest,
body: { ...baselineRequest.body, aggs: randomSamplerWrapper.wrap(rangesRequestAggs) },
},
signal
);

const fieldsWithNoOverlap = new Set<string>();
for (const { field } of fields) {
if (rangesResp.aggregations[`${field}_ranges`]) {
const buckets = rangesResp.aggregations[`${field}_ranges`]
.buckets as AggregationsRangeBucketKeys[];

if (buckets) {
const totalSumOfAllBuckets = buckets.reduce((acc, bucket) => acc + bucket.doc_count, 0);

const fractions = buckets.map((bucket) => ({
...bucket,
fraction: bucket.doc_count / totalSumOfAllBuckets,
}));

if (totalSumOfAllBuckets > 0) {
driftedRequestAggs[`${field}_ks_test`] = {
bucket_count_ks_test: {
buckets_path: `${field}_ranges>_count`,
alternative: ['two_sided'],
...(totalSumOfAllBuckets > 0
? { fractions: fractions.map((bucket) => Number(bucket.fraction.toFixed(3))) }
: {}),
},
};
} else {
// If all doc_counts are 0, that means there's no overlap whatsoever
// in which case we don't need to make the ks test agg, because it defaults to astronomically small value
fieldsWithNoOverlap.add(field);
}
}
}
}

const driftedResp = await dataSearch(
{
...driftedRequest,
body: { ...driftedRequest.body, aggs: randomSamplerWrapper.wrap(driftedRequestAggs) },
},
signal
);

fieldsWithNoOverlap.forEach((field) => {
if (driftedResp.aggregations) {
driftedResp.aggregations[`${field}_ks_test`] = {
// Setting -Infinity to represent astronomically small number
// which would be represented as < 0.000001 in table
two_sided: -Infinity,
};
}
});

return driftedResp;
};

Expand Down Expand Up @@ -678,7 +739,7 @@ export const useFetchDataComparisonResult = (

setResult({ data: undefined, status: FETCH_STATUS.LOADING, error: undefined });

// Place holder for when there might be difference data views in the future
// Placeholder for when there might be difference data views in the future
const referenceIndex = initialSettings
? initialSettings.reference
: currentDataView?.getIndexPattern();
Expand Down Expand Up @@ -802,6 +863,7 @@ export const useFetchDataComparisonResult = (
fetchComparisonDriftedData({
dataSearch,
baseRequest: driftedRequest,
baselineRequest,
baselineResponseAggs,
fields: chunkedFields,
randomSamplerWrapper: prodRandomSamplerWrapper,
Expand Down

0 comments on commit f3facd7

Please sign in to comment.