diff --git a/ecllibrary/std/DataPatterns/Profile.ecl b/ecllibrary/std/DataPatterns/Profile.ecl index 5129e246e3f..760f5ecf31b 100644 --- a/ecllibrary/std/DataPatterns/Profile.ecl +++ b/ecllibrary/std/DataPatterns/Profile.ecl @@ -202,8 +202,10 @@ EXPORT Profile(inFile, #UNIQUENAME(fullName); // The full name of an attribute #UNIQUENAME(needsDelim); // Boolean indicating whether we need to insert a delimiter somewhere #UNIQUENAME(namePos); // Contains character offset information, for parsing delimited strings + #UNIQUENAME(namePos2); // Contains character offset information, for parsing delimited strings #UNIQUENAME(numValue); // Extracted numeric value from a string #UNIQUENAME(nameValue); // Extracted string value from a string + #UNIQUENAME(nameValue2); // Extracted string value from a string IMPORT Std; @@ -278,7 +280,9 @@ EXPORT Profile(inFile, %ungroupedInFile% ); - // Slim the dataset if the caller provided an explicit set of attributes + // Slim the dataset if the caller provided an explicit set of attributes; + // note that explicit attributes within a top-level child dataset will + // cause the entire top-level child dataset to be retained #UNIQUENAME(workingInFile); LOCAL %workingInFile% := #IF(%trimmedFieldList% = '') @@ -290,14 +294,23 @@ EXPORT Profile(inFile, { #SET(needsDelim, 0) #SET(namePos, 1) + #SET(nameValue2, '') #LOOP #SET(temp, REGEXFIND('^([^,]+)', %trimmedFieldList%[%namePos%..], 1)) #IF(%'temp'% != '') - #IF(%needsDelim% = 1) , #END + #SET(nameValue, REGEXFIND('^([^\\.]+)', %'temp'%, 1)) + #IF(NOT REGEXFIND('\\b' + %'nameValue'% + '\\b', %'nameValue2'%)) + #IF(%'nameValue2'% != '') + #APPEND(nameValue2, ',') + #END + #APPEND(nameValue2, %'nameValue'%) - TYPEOF(%sampledData%.%temp%) %temp% := %temp% + #IF(%needsDelim% = 1) , #END + + TYPEOF(%sampledData%.%nameValue%) %nameValue% := %nameValue% - #SET(needsDelim, 1) + #SET(needsDelim, 1) + #END #SET(namePos, %namePos% + LENGTH(%'temp'%) + 1) #ELSE #BREAK @@ -377,14 +390,14 @@ EXPORT Profile(inFile, // Define the record layout that will be used by the inner _Inner_Profile() call LOCAL ModeRec := RECORD - STRING value; + UTF8 value; UNSIGNED4 rec_count; END; LOCAL PatternCountRec := RECORD STRING data_pattern; UNSIGNED4 rec_count; - STRING example; + UTF8 example; END; LOCAL CorrelationRec := RECORD @@ -514,6 +527,67 @@ EXPORT Profile(inFile, #UNIQUENAME(_MakeAttr); LOCAL %_MakeAttr%(STRING attr) := REGEXREPLACE('\\.', attr, '_'); + // Determine if a UTF-8 string really contains UTF-8 characters + #UNIQUENAME(IsUTF8); + LOCAL BOOLEAN %IsUTF8%(UTF8 str) := EMBED(C++) + if (lenStr == 0) + return false; + + const unsigned char* bytes = reinterpret_cast(str); + const unsigned char* endPtr = bytes + lenStr; + + while (bytes < endPtr) + { + if (bytes[0] == 0x09 || bytes[0] == 0x0A || bytes[0] == 0x0D || (0x20 <= bytes[0] && bytes[0] <= 0x7E)) + { + // ASCII; continue scan + bytes += 1; + } + else if ((0xC2 <= bytes[0] && bytes[0] <= 0xDF) && (0x80 <= bytes[1] && bytes[1] <= 0xBF)) + { + // Valid non-overlong 2-byte + return true; + } + else if (bytes[0] == 0xE0 && (0xA0 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF)) + { + // Valid excluding overlongs + return true; + } + else if (((0xE1 <= bytes[0] && bytes[0] <= 0xEC) || bytes[0] == 0xEE || bytes[0] == 0xEF) && (0x80 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF)) + { + // Valid straight 3-byte + return true; + } + else if (bytes[0] == 0xED && (0x80 <= bytes[1] && bytes[1] <= 0x9F) && (0x80 <= bytes[2] && bytes[2] <= 0xBF)) + { + // Valid excluding surrogates + return true; + } + else if (bytes[0] == 0xF0 && (0x90 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF)) + { + // Valid planes 1-3 + return true; + } + else if ((0xF1 <= bytes[0] && bytes[0] <= 0xF3) && (0x80 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF)) + { + // Valid planes 4-15 + return true; + } + else if (bytes[0] == 0xF4 && (0x80 <= bytes[1] && bytes[1] <= 0x8F) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF)) + { + // Valid plane 16 + return true; + } + else + { + // Invalid; abort + return false; + } + } + + return false; + ENDEMBED; + // Pattern mapping a STRING datatype #UNIQUENAME(_MapAllStr); LOCAL STRING %_MapAllStr%(STRING s) := EMBED(C++) @@ -539,9 +613,9 @@ EXPORT Profile(inFile, // Pattern mapping a UNICODE datatype; using regex due to the complexity // of the character set #UNIQUENAME(_MapUpperCharUni); - LOCAL %_MapUpperCharUni%(UNICODE s) := REGEXREPLACE(u'[[:upper:]]', s, u'A'); + LOCAL %_MapUpperCharUni%(UNICODE s) := REGEXREPLACE(u'\\p{Uppercase_Letter}', s, u'A'); #UNIQUENAME(_MapLowerCharUni); - LOCAL %_MapLowerCharUni%(UNICODE s) := REGEXREPLACE(u'[[:lower:]]', s, u'a'); + LOCAL %_MapLowerCharUni%(UNICODE s) := REGEXREPLACE(u'[[\\p{Lowercase_Letter}][\\p{Titlecase_Letter}][\\p{Modifier_Letter}][\\p{Other_Letter}]]', s, u'a'); #UNIQUENAME(_MapDigitUni); LOCAL %_MapDigitUni%(UNICODE s) := REGEXREPLACE(u'[1-9]', s, u'9'); // Leave '0' as-is and replace with '9' later #UNIQUENAME(_MapAllUni); @@ -616,7 +690,7 @@ EXPORT Profile(inFile, #UNIQUENAME(DataPattern_t); LOCAL %DataPattern_t% := #EXPAND('STRING' + %'foundMaxPatternLen'%); #UNIQUENAME(StringValue_t); - LOCAL %StringValue_t% := #EXPAND('STRING' + %'foundMaxPatternLen'%); + LOCAL %StringValue_t% := #EXPAND('UTF8_' + %'foundMaxPatternLen'%); // Create a dataset containing pattern information, string length, and // booleans indicating filled and numeric datatypes for each processed @@ -640,6 +714,7 @@ EXPORT Profile(inFile, UNSIGNED4 data_length; BOOLEAN is_filled; BOOLEAN is_number; + BOOLEAN is_unicode; END; #UNIQUENAME(dataInfo); @@ -684,9 +759,9 @@ EXPORT Profile(inFile, #ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)|(boolean)', %'@type'%)) (%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%) #ELSEIF(REGEXFIND('string', %'@type'%)) - %_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%)) + %_TrimmedUni%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%)) #ELSE - %_TrimmedStr%((%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)) + %_TrimmedUni%((%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)) #END, UNSIGNED4 value_count := COUNT(GROUP), %DataPattern_t% data_pattern := @@ -744,6 +819,14 @@ EXPORT Profile(inFile, TRUE #ELSE FALSE + #END, + BOOLEAN is_unicode := + #IF(%_IsSetType%(%'@type'%)) + FALSE + #ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%)) + %IsUTF8%((UTF8)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)) + #ELSE + FALSE #END }, _inFile.#EXPAND(%'namePrefix'% + %'@name'%), @@ -834,11 +917,12 @@ EXPORT Profile(inFile, given_attribute_type, data_pattern, data_length, + is_unicode, %DataTypeEnum% type_flag := %BestTypeFlag%(TRIM(data_pattern), given_attribute_type), UNSIGNED4 min_data_length := 0 // will be populated within %attributesWithTypeFlagsSummary% }, - attribute, given_attribute_type, data_pattern, data_length, + attribute, given_attribute_type, data_pattern, data_length, is_unicode, MERGE ); @@ -860,6 +944,7 @@ EXPORT Profile(inFile, RECORDOF(%attributeTypePatterns%), SELF.data_length := MAX(LEFT.data_length, RIGHT.data_length), SELF.min_data_length := %MinNotZero%(LEFT.data_length, RIGHT.data_length), + SELF.is_unicode := LEFT.is_unicode OR RIGHT.is_unicode, SELF.type_flag := IF(TRIM(RIGHT.attribute) != '', LEFT.type_flag & RIGHT.type_flag, LEFT.type_flag), SELF := LEFT ), @@ -868,6 +953,7 @@ EXPORT Profile(inFile, RECORDOF(%attributeTypePatterns%), SELF.data_length := MAX(RIGHT1.data_length, RIGHT2.data_length), SELF.min_data_length := %MinNotZero%(RIGHT1.data_length, RIGHT2.data_length), + SELF.is_unicode := RIGHT1.is_unicode OR RIGHT2.is_unicode, SELF.type_flag := RIGHT1.type_flag & RIGHT2.type_flag, SELF := RIGHT1 ), @@ -898,7 +984,8 @@ EXPORT Profile(inFile, (LEFT.type_flag & %DataTypeEnum%.SignedInteger) != 0 => 'integer' + %Len2Size%(LEFT.data_length), (LEFT.type_flag & %DataTypeEnum%.FloatingPoint) != 0 => 'real' + IF(LEFT.data_length < 8, '4', '8'), (LEFT.type_flag & %DataTypeEnum%.ExpNotation) != 0 => 'real8', - REGEXFIND('utf', LEFT.given_attribute_type) => LEFT.given_attribute_type, + REGEXFIND('utf', LEFT.given_attribute_type) AND LEFT.is_unicode => LEFT.given_attribute_type, + REGEXFIND('utf', LEFT.given_attribute_type) => 'string' + IF(LEFT.data_length > 0 AND (LEFT.data_length < (LEFT.min_data_length * 1000)), (STRING)LEFT.data_length, ''), REGEXREPLACE('\\d+$', TRIM(LEFT.given_attribute_type), '') + IF(LEFT.data_length > 0 AND (LEFT.data_length < (LEFT.min_data_length * 1000)), (STRING)LEFT.data_length, '') ), SELF := LEFT @@ -1051,7 +1138,7 @@ EXPORT Profile(inFile, TRANSFORM ( ModeRec, - SELF.value := LEFT.string_value, + SELF.value := (UTF8)LEFT.string_value, SELF.rec_count := LEFT.rec_count ), SMART @@ -1198,7 +1285,7 @@ EXPORT Profile(inFile, { attribute, data_pattern, - STRING example := string_value[..%foundMaxPatternLen%], + UTF8 example := string_value[..%foundMaxPatternLen%], UNSIGNED4 rec_count := SUM(GROUP, value_count) }, attribute, data_pattern, @@ -1577,6 +1664,26 @@ EXPORT Profile(inFile, #IF(%'dsNameValue'% != '') #SET(numValue, REGEXFIND('^(\\d+):', %'dsNameValue'%, 1)) #SET(nameValue, REGEXFIND(':([^:]+)$', %'dsNameValue'%, 1)) + // Extract a list of fields within this child dataset if necessary + #SET(explicitScalarFields, '') + #SET(needsDelim, 0) + #SET(namePos2, 1) + #LOOP + #SET(temp, REGEXFIND('^([^,]+)', %trimmedFieldList%[%namePos2%..], 1)) + #IF(%'temp'% != '') + #SET(nameValue2, REGEXFIND('^' + %'nameValue'% + '\\.([^,]+)', %'temp'%, 1)) + #IF(%'nameValue2'% != '') + #IF(%needsDelim% = 1) + #APPEND(explicitScalarFields, ',') + #END + #APPEND(explicitScalarFields, %'nameValue2'%) + #SET(needsDelim, 1) + #END + #SET(namePos2, %namePos2% + LENGTH(%'temp'%) + 1) + #ELSE + #BREAK + #END + #END // The child dataset should have been extracted into its own // local attribute; reference it during our call to the inner // profile function macro @@ -1584,7 +1691,7 @@ EXPORT Profile(inFile, + _Inner_Profile ( GLOBAL(%temp%), - '', + %'explicitScalarFields'%, maxPatterns, maxPatternLen, %lowCardinalityThreshold%,