Skip to content

Commit

Permalink
HPCC-30559 Update DataPatterns.Profile to v1.9.3
Browse files Browse the repository at this point in the history
Changes include:

* Support UTF-8 strings in Mode values and example text patterns

* Security updates

* Better identify upper- and lower-case Unicode characters in text patterns

* Scan Unicode and UTF-8 strings to see if they can be represented with a STRING data type instead

Signed-off-by: Dan S. Camper <[email protected]>
  • Loading branch information
dcamper committed Oct 17, 2023
1 parent a05215b commit d9b9b17
Showing 1 changed file with 123 additions and 16 deletions.
139 changes: 123 additions & 16 deletions ecllibrary/std/DataPatterns/Profile.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,10 @@ EXPORT Profile(inFile,
#UNIQUENAME(fullName); // The full name of an attribute
#UNIQUENAME(needsDelim); // Boolean indicating whether we need to insert a delimiter somewhere
#UNIQUENAME(namePos); // Contains character offset information, for parsing delimited strings
#UNIQUENAME(namePos2); // Second character offset; used when re-scanning the field list for the fields of a child dataset
#UNIQUENAME(numValue); // Extracted numeric value from a string
#UNIQUENAME(nameValue); // Extracted string value from a string
#UNIQUENAME(nameValue2); // Second extracted string value; holds the list of top-level names already emitted, or a field name found within a child dataset

IMPORT Std;

Expand Down Expand Up @@ -278,7 +280,9 @@ EXPORT Profile(inFile,
%ungroupedInFile%
);

// Slim the dataset if the caller provided an explicit set of attributes
// Slim the dataset if the caller provided an explicit set of attributes;
// note that explicit attributes within a top-level child dataset will
// cause the entire top-level child dataset to be retained
#UNIQUENAME(workingInFile);
LOCAL %workingInFile% :=
#IF(%trimmedFieldList% = '')
Expand All @@ -290,14 +294,23 @@ EXPORT Profile(inFile,
{
#SET(needsDelim, 0)
#SET(namePos, 1)
#SET(nameValue2, '')
#LOOP
#SET(temp, REGEXFIND('^([^,]+)', %trimmedFieldList%[%namePos%..], 1))
#IF(%'temp'% != '')
#IF(%needsDelim% = 1) , #END
#SET(nameValue, REGEXFIND('^([^\\.]+)', %'temp'%, 1))
#IF(NOT REGEXFIND('\\b' + %'nameValue'% + '\\b', %'nameValue2'%))
#IF(%'nameValue2'% != '')
#APPEND(nameValue2, ',')
#END
#APPEND(nameValue2, %'nameValue'%)

TYPEOF(%sampledData%.%temp%) %temp% := %temp%
#IF(%needsDelim% = 1) , #END

TYPEOF(%sampledData%.%nameValue%) %nameValue% := %nameValue%

#SET(needsDelim, 1)
#SET(needsDelim, 1)
#END
#SET(namePos, %namePos% + LENGTH(%'temp'%) + 1)
#ELSE
#BREAK
Expand Down Expand Up @@ -377,14 +390,14 @@ EXPORT Profile(inFile,

// Define the record layout that will be used by the inner _Inner_Profile() call
LOCAL ModeRec := RECORD
STRING value;
UTF8 value;
UNSIGNED4 rec_count;
END;

LOCAL PatternCountRec := RECORD
STRING data_pattern;
UNSIGNED4 rec_count;
STRING example;
UTF8 example;
END;

LOCAL CorrelationRec := RECORD
Expand Down Expand Up @@ -514,6 +527,67 @@ EXPORT Profile(inFile,
#UNIQUENAME(_MakeAttr);
// Returns attr with every period replaced by an underscore
LOCAL %_MakeAttr%(STRING attr) := REGEXREPLACE('\\.', attr, '_');

// Determine if a UTF-8 string really contains UTF-8 characters; that is,
// whether at least one well-formed multi-byte sequence appears in it.
//
// @param   str     The UTF-8 string to scan
// @return  TRUE as soon as a well-formed multi-byte sequence is found;
//          FALSE if the string is empty, if it contains only single-byte
//          characters, or if a malformed byte is encountered before any
//          well-formed multi-byte sequence
#UNIQUENAME(IsUTF8);
LOCAL BOOLEAN %IsUTF8%(UTF8 str) := EMBED(C++)
    if (lenStr == 0)
        return false;
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str);
    // NOTE(review): endPtr treats lenStr as a byte count. If the runtime
    // passes a character count for UTF8 parameters, the scan still reaches
    // the first multi-byte sequence, because every character before it
    // occupies exactly one byte -- confirm against the ECL embed
    // parameter-passing rules.
    const unsigned char* endPtr = bytes + lenStr;
    // NOTE(review): the continuation-byte reads below (bytes[1]..bytes[3])
    // are not bounds-checked; this presumably relies on the buffer holding
    // well-formed UTF-8 -- confirm that callers guarantee it.
    while (bytes < endPtr)
    {
        if (bytes[0] <= 0x7F)
        {
            // Single-byte character (including control characters such as
            // VT or DEL); it says nothing about multi-byte content, so
            // continue the scan rather than aborting
            bytes += 1;
        }
        else if ((0xC2 <= bytes[0] && bytes[0] <= 0xDF) && (0x80 <= bytes[1] && bytes[1] <= 0xBF))
        {
            // Valid non-overlong 2-byte
            return true;
        }
        else if (bytes[0] == 0xE0 && (0xA0 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF))
        {
            // Valid excluding overlongs
            return true;
        }
        else if (((0xE1 <= bytes[0] && bytes[0] <= 0xEC) || bytes[0] == 0xEE || bytes[0] == 0xEF) && (0x80 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF))
        {
            // Valid straight 3-byte
            return true;
        }
        else if (bytes[0] == 0xED && (0x80 <= bytes[1] && bytes[1] <= 0x9F) && (0x80 <= bytes[2] && bytes[2] <= 0xBF))
        {
            // Valid excluding surrogates
            return true;
        }
        else if (bytes[0] == 0xF0 && (0x90 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF))
        {
            // Valid planes 1-3
            return true;
        }
        else if ((0xF1 <= bytes[0] && bytes[0] <= 0xF3) && (0x80 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF))
        {
            // Valid planes 4-15
            return true;
        }
        else if (bytes[0] == 0xF4 && (0x80 <= bytes[1] && bytes[1] <= 0x8F) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF))
        {
            // Valid plane 16
            return true;
        }
        else
        {
            // Invalid lead byte or continuation byte; abort
            return false;
        }
    }
    // Only single-byte characters were found
    return false;
ENDEMBED;

// Pattern mapping a STRING datatype
#UNIQUENAME(_MapAllStr);
LOCAL STRING %_MapAllStr%(STRING s) := EMBED(C++)
Expand All @@ -539,9 +613,9 @@ EXPORT Profile(inFile,
// Pattern mapping a UNICODE datatype; using regex due to the complexity
// of the character set
#UNIQUENAME(_MapUpperCharUni);
LOCAL %_MapUpperCharUni%(UNICODE s) := REGEXREPLACE(u'[[:upper:]]', s, u'A');
LOCAL %_MapUpperCharUni%(UNICODE s) := REGEXREPLACE(u'\\p{Uppercase_Letter}', s, u'A');
#UNIQUENAME(_MapLowerCharUni);
LOCAL %_MapLowerCharUni%(UNICODE s) := REGEXREPLACE(u'[[:lower:]]', s, u'a');
LOCAL %_MapLowerCharUni%(UNICODE s) := REGEXREPLACE(u'[[\\p{Lowercase_Letter}][\\p{Titlecase_Letter}][\\p{Modifier_Letter}][\\p{Other_Letter}]]', s, u'a');
#UNIQUENAME(_MapDigitUni);
LOCAL %_MapDigitUni%(UNICODE s) := REGEXREPLACE(u'[1-9]', s, u'9'); // Leave '0' as-is and replace with '9' later
#UNIQUENAME(_MapAllUni);
Expand Down Expand Up @@ -616,7 +690,7 @@ EXPORT Profile(inFile,
#UNIQUENAME(DataPattern_t);
LOCAL %DataPattern_t% := #EXPAND('STRING' + %'foundMaxPatternLen'%);
#UNIQUENAME(StringValue_t);
LOCAL %StringValue_t% := #EXPAND('STRING' + %'foundMaxPatternLen'%);
LOCAL %StringValue_t% := #EXPAND('UTF8_' + %'foundMaxPatternLen'%);

// Create a dataset containing pattern information, string length, and
// booleans indicating filled and numeric datatypes for each processed
Expand All @@ -640,6 +714,7 @@ EXPORT Profile(inFile,
UNSIGNED4 data_length;
BOOLEAN is_filled;
BOOLEAN is_number;
BOOLEAN is_unicode;
END;

#UNIQUENAME(dataInfo);
Expand Down Expand Up @@ -684,9 +759,9 @@ EXPORT Profile(inFile,
#ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)|(boolean)', %'@type'%))
(%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)
#ELSEIF(REGEXFIND('string', %'@type'%))
%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
%_TrimmedUni%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
#ELSE
%_TrimmedStr%((%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
%_TrimmedUni%((%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
#END,
UNSIGNED4 value_count := COUNT(GROUP),
%DataPattern_t% data_pattern :=
Expand Down Expand Up @@ -744,6 +819,14 @@ EXPORT Profile(inFile,
TRUE
#ELSE
FALSE
#END,
BOOLEAN is_unicode :=
#IF(%_IsSetType%(%'@type'%))
FALSE
#ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
%IsUTF8%((UTF8)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
#ELSE
FALSE
#END
},
_inFile.#EXPAND(%'namePrefix'% + %'@name'%),
Expand Down Expand Up @@ -834,11 +917,12 @@ EXPORT Profile(inFile,
given_attribute_type,
data_pattern,
data_length,
is_unicode,
%DataTypeEnum% type_flag := %BestTypeFlag%(TRIM(data_pattern), given_attribute_type),
UNSIGNED4 min_data_length := 0 // will be populated within %attributesWithTypeFlagsSummary%

},
attribute, given_attribute_type, data_pattern, data_length,
attribute, given_attribute_type, data_pattern, data_length, is_unicode,
MERGE
);

Expand All @@ -860,6 +944,7 @@ EXPORT Profile(inFile,
RECORDOF(%attributeTypePatterns%),
SELF.data_length := MAX(LEFT.data_length, RIGHT.data_length),
SELF.min_data_length := %MinNotZero%(LEFT.data_length, RIGHT.data_length),
SELF.is_unicode := LEFT.is_unicode OR RIGHT.is_unicode,
SELF.type_flag := IF(TRIM(RIGHT.attribute) != '', LEFT.type_flag & RIGHT.type_flag, LEFT.type_flag),
SELF := LEFT
),
Expand All @@ -868,6 +953,7 @@ EXPORT Profile(inFile,
RECORDOF(%attributeTypePatterns%),
SELF.data_length := MAX(RIGHT1.data_length, RIGHT2.data_length),
SELF.min_data_length := %MinNotZero%(RIGHT1.data_length, RIGHT2.data_length),
SELF.is_unicode := RIGHT1.is_unicode OR RIGHT2.is_unicode,
SELF.type_flag := RIGHT1.type_flag & RIGHT2.type_flag,
SELF := RIGHT1
),
Expand Down Expand Up @@ -898,7 +984,8 @@ EXPORT Profile(inFile,
(LEFT.type_flag & %DataTypeEnum%.SignedInteger) != 0 => 'integer' + %Len2Size%(LEFT.data_length),
(LEFT.type_flag & %DataTypeEnum%.FloatingPoint) != 0 => 'real' + IF(LEFT.data_length < 8, '4', '8'),
(LEFT.type_flag & %DataTypeEnum%.ExpNotation) != 0 => 'real8',
REGEXFIND('utf', LEFT.given_attribute_type) => LEFT.given_attribute_type,
REGEXFIND('utf', LEFT.given_attribute_type) AND LEFT.is_unicode => LEFT.given_attribute_type,
REGEXFIND('utf', LEFT.given_attribute_type) => 'string' + IF(LEFT.data_length > 0 AND (LEFT.data_length < (LEFT.min_data_length * 1000)), (STRING)LEFT.data_length, ''),
REGEXREPLACE('\\d+$', TRIM(LEFT.given_attribute_type), '') + IF(LEFT.data_length > 0 AND (LEFT.data_length < (LEFT.min_data_length * 1000)), (STRING)LEFT.data_length, '')
),
SELF := LEFT
Expand Down Expand Up @@ -1051,7 +1138,7 @@ EXPORT Profile(inFile,
TRANSFORM
(
ModeRec,
SELF.value := LEFT.string_value,
SELF.value := (UTF8)LEFT.string_value,
SELF.rec_count := LEFT.rec_count
),
SMART
Expand Down Expand Up @@ -1198,7 +1285,7 @@ EXPORT Profile(inFile,
{
attribute,
data_pattern,
STRING example := string_value[..%foundMaxPatternLen%],
UTF8 example := string_value[..%foundMaxPatternLen%],
UNSIGNED4 rec_count := SUM(GROUP, value_count)
},
attribute, data_pattern,
Expand Down Expand Up @@ -1577,14 +1664,34 @@ EXPORT Profile(inFile,
#IF(%'dsNameValue'% != '')
#SET(numValue, REGEXFIND('^(\\d+):', %'dsNameValue'%, 1))
#SET(nameValue, REGEXFIND(':([^:]+)$', %'dsNameValue'%, 1))
// Extract a list of fields within this child dataset if necessary
#SET(explicitScalarFields, '')
#SET(needsDelim, 0)
#SET(namePos2, 1)
#LOOP
#SET(temp, REGEXFIND('^([^,]+)', %trimmedFieldList%[%namePos2%..], 1))
#IF(%'temp'% != '')
#SET(nameValue2, REGEXFIND('^' + %'nameValue'% + '\\.([^,]+)', %'temp'%, 1))
#IF(%'nameValue2'% != '')
#IF(%needsDelim% = 1)
#APPEND(explicitScalarFields, ',')
#END
#APPEND(explicitScalarFields, %'nameValue2'%)
#SET(needsDelim, 1)
#END
#SET(namePos2, %namePos2% + LENGTH(%'temp'%) + 1)
#ELSE
#BREAK
#END
#END
// The child dataset should have been extracted into its own
// local attribute; reference it during our call to the inner
// profile function macro
#SET(temp, #MANGLE(%'nameValue'%))
+ _Inner_Profile
(
GLOBAL(%temp%),
'',
%'explicitScalarFields'%,
maxPatterns,
maxPatternLen,
%lowCardinalityThreshold%,
Expand Down

0 comments on commit d9b9b17

Please sign in to comment.