From 1d74b8fa0278e58b96010b4ebf9d187379d0c411 Mon Sep 17 00:00:00 2001
From: Nazar Kovtun
Date: Thu, 12 Sep 2024 18:40:10 +0300
Subject: [PATCH] HCK-7920: implemented variant subtype resolve logic on REing from instance

---
 reverse_engineering/api.js                   | 17 ++--
 .../helpers/columnsREHelper.js               |  8 ++
 reverse_engineering/helpers/utils.js         | 30 +++++++
 .../variantPropertiesSubTypeResolveHelper.js | 90 +++++++++++++++++++
 4 files changed, 140 insertions(+), 5 deletions(-)
 create mode 100644 reverse_engineering/helpers/variantPropertiesSubTypeResolveHelper.js

diff --git a/reverse_engineering/api.js b/reverse_engineering/api.js
index 881b14f8..244f767d 100644
--- a/reverse_engineering/api.js
+++ b/reverse_engineering/api.js
@@ -22,6 +22,7 @@ const { parseDDLStatements } = require('./parseDDLStatements');
 const { isSupportUnityCatalog } = require('./helpers/databricksHelper');
 const unityTagsHelper = require('./helpers/unityTagsHelper');
 const { adaptJsonSchema } = require('./adaptJsonSchema');
+const { getVariantColumnsWithResolvedSubType } = require('./helpers/variantPropertiesSubTypeResolveHelper');
 
 const DEFAULT_DATABRICKS_CATALOG_NAME = 'hive_metastore';
 
@@ -265,12 +266,14 @@ module.exports = {
 						logger,
 					);
 
-					const columnsOfTypeString = (tableData.properties || []).filter(
-						property => property.mode === 'string',
+					const columnsPotentiallyContainingJSON = (tableData.properties || []).filter(
+						property => property.mode === 'string' || property.mode === 'var',
+					);
+					const hasPotentiallyContainingJSONColumns = !dependencies.lodash.isEmpty(
+						columnsPotentiallyContainingJSON,
 					);
-					const hasColumnsOfTypeString = !dependencies.lodash.isEmpty(columnsOfTypeString);
 					let documents = [];
-					if (hasColumnsOfTypeString) {
+					if (hasPotentiallyContainingJSONColumns) {
 						progress({
 							message: 'Start getting documents from table',
 							containerName: 'databases',
@@ -280,10 +283,14 @@ module.exports = {
 							connectionInfo: connectionData,
 							dbName,
 							tableName: table.name,
-							fields: columnsOfTypeString,
+							fields: columnsPotentiallyContainingJSON,
 							recordSamplingSettings: data.recordSamplingSettings,
 							logger,
 						});
+						tableData.schema = getVariantColumnsWithResolvedSubType({
+							propertiesSchema: tableData.schema,
+							documents,
+						});
 						progress({
 							message: 'Documents retrieved successfully',
 							containerName: 'databases',
diff --git a/reverse_engineering/helpers/columnsREHelper.js b/reverse_engineering/helpers/columnsREHelper.js
index 9cf098ee..1d4172f8 100644
--- a/reverse_engineering/helpers/columnsREHelper.js
+++ b/reverse_engineering/helpers/columnsREHelper.js
@@ -73,6 +73,14 @@ const handleType = typeContainer => {
 			...handleSubtype(typeContainer.val, 'map'),
 		};
 	}
+	if (typeContainer.type === 'variant') {
+		return {
+			type: 'document',
+			childType: 'variant',
+			variantType: 'JSON',
+			mode: 'var',
+		};
+	}
 	switch (typeContainer.type) {
 		case 'tinyint':
 		case 'smallint':
diff --git a/reverse_engineering/helpers/utils.js b/reverse_engineering/helpers/utils.js
index f68e6536..37ff84e1 100644
--- a/reverse_engineering/helpers/utils.js
+++ b/reverse_engineering/helpers/utils.js
@@ -98,6 +98,35 @@ const getTemplateDocByJsonSchema = schema => {
 	}, {});
 };
 
+/**
+ * Returns the value that occurs most often in the list.
+ * Ties are resolved in favor of the value that was encountered first.
+ *
+ * @param {Array<number | string | boolean>} list
+ * @returns {number | string | boolean | undefined} `undefined` when the list is empty
+ */
+const getMostFrequentValueInList = (list = []) => {
+	// Map keeps the original value type and is safe for keys like "constructor".
+	const itemsToFrequencyMap = new Map();
+
+	list.forEach(item => {
+		itemsToFrequencyMap.set(item, (itemsToFrequencyMap.get(item) ?? 0) + 1);
+	});
+
+	let mostFrequentItemValue;
+	let mostFrequentItemFrequency = 0;
+
+	itemsToFrequencyMap.forEach((frequency, item) => {
+		// Strict comparison keeps the first encountered value on ties.
+		if (frequency > mostFrequentItemFrequency) {
+			mostFrequentItemValue = item;
+			mostFrequentItemFrequency = frequency;
+		}
+	});
+
+	return mostFrequentItemValue;
+};
+
 module.exports = {
 	prepareNamesForInsertionIntoScalaCode,
 	splitTableAndViewNames,
@@ -110,4 +139,5 @@ module.exports = {
 	isSupportGettingListOfViews,
 	removeParentheses,
 	getTemplateDocByJsonSchema,
+	getMostFrequentValueInList,
 };
diff --git a/reverse_engineering/helpers/variantPropertiesSubTypeResolveHelper.js b/reverse_engineering/helpers/variantPropertiesSubTypeResolveHelper.js
new file mode 100644
index 00000000..15e3ad33
--- /dev/null
+++ b/reverse_engineering/helpers/variantPropertiesSubTypeResolveHelper.js
@@ -0,0 +1,90 @@
+const { getMostFrequentValueInList } = require('./utils');
+
+/**
+ * @typedef { (string | number | boolean) } Primitive
+ */
+
+/**
+ * Resolves the subtype of every variant column (mode "var") from the sampled documents.
+ * Columns of any other mode are returned untouched.
+ *
+ * @param {{
+ * propertiesSchema: object,
+ * documents: object[]
+ * }} param
+ * @returns {object}
+ */
+const getVariantColumnsWithResolvedSubType = ({ propertiesSchema, documents = [] }) => {
+	if (!propertiesSchema) {
+		return propertiesSchema;
+	}
+
+	const propertiesEntriesWithUpdatedSubtypes = Object.entries(propertiesSchema).map(([propertyName, propertyData]) =>
+		getVariantColumnWithResolvedSubType({ propertyName, propertyData, documents }),
+	);
+
+	return Object.fromEntries(propertiesEntriesWithUpdatedSubtypes);
+};
+
+/**
+ * Sets "subtype" of a variant column to the most frequent JSON type among its sampled values.
+ *
+ * @param {{
+ * propertyName: string,
+ * propertyData: object,
+ * documents: object[]
+ * }} param
+ * @returns {[string, object]}
+ */
+const getVariantColumnWithResolvedSubType = ({ propertyName, propertyData, documents }) => {
+	if (propertyData?.mode !== 'var') {
+		return [propertyName, propertyData];
+	}
+
+	const parsedDocumentsRecordsTypes = documents
+		.map(document => document?.[propertyName])
+		.map(getParsedVariantRecord)
+		.map(getDocumentRecordType);
+	const mostFrequentType = getMostFrequentValueInList(parsedDocumentsRecordsTypes);
+
+	// Nothing was sampled (e.g. an empty table): keep the column as it is.
+	if (mostFrequentType === undefined) {
+		return [propertyName, propertyData];
+	}
+
+	return [propertyName, { ...propertyData, subtype: mostFrequentType }];
+};
+
+/**
+ * Parses a sampled variant value; a value that is not valid JSON is treated as an empty object.
+ *
+ * @param {string} record
+ * @returns {Primitive | object | Array}
+ */
+const getParsedVariantRecord = record => {
+	try {
+		return JSON.parse(record);
+	} catch {
+		return {};
+	}
+};
+
+/**
+ * @param {Primitive | object | Array} parsedRecord
+ * @returns {string} "array", "null" or the `typeof` result of the record
+ */
+const getDocumentRecordType = parsedRecord => {
+	if (Array.isArray(parsedRecord)) {
+		return 'array';
+	}
+
+	if (parsedRecord === null) {
+		return 'null';
+	}
+
+	return typeof parsedRecord;
+};
+
+module.exports = {
+	getVariantColumnsWithResolvedSubType,
+};