-
Notifications
You must be signed in to change notification settings - Fork 57
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DO NOT MERGE: Make binary string encoding configurable, add support for base64 #770
base: master
Are you sure you want to change the base?
Changes from 2 commits
776e52d
0e3e5f1
0ea5339
8d89112
93b8280
b54555c
3519904
b838014
c98252e
f0100f1
252d4c5
f9b5b80
5ffbc0b
d1be60a
30468e7
ecaec0f
334767d
9656537
985b396
c2162b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,17 +24,14 @@ | |
import java.time.ZoneOffset; | ||
import java.time.ZonedDateTime; | ||
import java.time.format.DateTimeParseException; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Optional; | ||
import java.util.Set; | ||
import java.util.*; | ||
import java.util.function.Supplier; | ||
import net.snowflake.client.jdbc.internal.google.common.collect.Sets; | ||
import net.snowflake.client.jdbc.internal.snowflake.common.core.SnowflakeDateTimeFormat; | ||
import net.snowflake.client.jdbc.internal.snowflake.common.util.Power10; | ||
import net.snowflake.ingest.streaming.internal.serialization.ByteArraySerializer; | ||
import net.snowflake.ingest.streaming.internal.serialization.ZonedDateTimeSerializer; | ||
import net.snowflake.ingest.utils.Constants; | ||
import net.snowflake.ingest.utils.ErrorCode; | ||
import net.snowflake.ingest.utils.SFException; | ||
import org.apache.commons.codec.DecoderException; | ||
|
@@ -615,7 +612,7 @@ static int validateAndParseDate(String columnName, Object input, long insertRowI | |
* @return Validated array | ||
*/ | ||
static byte[] validateAndParseBinary( | ||
String columnName, Object input, Optional<Integer> maxLengthOptional, long insertRowIndex) { | ||
String columnName, Object input, Optional<Integer> maxLengthOptional, long insertRowIndex, Constants.BinaryStringEncoding binaryStringEncoding) { | ||
byte[] output; | ||
if (input instanceof byte[]) { | ||
// byte[] is a mutable object, we need to create a defensive copy to protect against | ||
|
@@ -625,12 +622,30 @@ static byte[] validateAndParseBinary( | |
output = new byte[originalInputArray.length]; | ||
System.arraycopy(originalInputArray, 0, output, 0, originalInputArray.length); | ||
} else if (input instanceof String) { | ||
try { | ||
String stringInput = ((String) input).trim(); | ||
output = Hex.decodeHex(stringInput); | ||
} catch (DecoderException e) { | ||
if(binaryStringEncoding == Constants.BinaryStringEncoding.BASE64) { | ||
try { | ||
String stringInput = ((String) input).trim(); | ||
// Remove double quotes if present | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should not strip surrounding quotes. We don't do it for hex and neither does it work in Snowflake Worksheets. |
||
if (stringInput.length() >= 2 && stringInput.startsWith("\"") && stringInput.endsWith("\"")) { | ||
stringInput = stringInput.substring(1, stringInput.length() - 1); | ||
} | ||
Base64.Decoder decoder = Base64.getDecoder(); | ||
output = decoder.decode(stringInput); | ||
} catch (IllegalArgumentException e) { | ||
throw valueFormatNotAllowedException( | ||
columnName, "BINARY", "Not a valid base64 string", insertRowIndex); | ||
} | ||
} else if (binaryStringEncoding == Constants.BinaryStringEncoding.HEX) { | ||
try { | ||
String stringInput = ((String) input).trim(); | ||
output = Hex.decodeHex(stringInput); | ||
} catch (DecoderException e) { | ||
throw valueFormatNotAllowedException( | ||
columnName, "BINARY", "Not a valid hex string", insertRowIndex); | ||
} | ||
} else { | ||
throw valueFormatNotAllowedException( | ||
columnName, "BINARY", "Not a valid hex string", insertRowIndex); | ||
columnName, "BINARY", "Unsupported binary string format " + binaryStringEncoding.name(), insertRowIndex); | ||
} | ||
} else { | ||
throw typeNotAllowedException( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,6 +39,9 @@ public class ParameterProvider { | |
public static final String BDEC_PARQUET_COMPRESSION_ALGORITHM = | ||
"BDEC_PARQUET_COMPRESSION_ALGORITHM".toLowerCase(); | ||
|
||
public static final String BINARY_STRING_ENCODING = | ||
"BINARY_STRING_ENCODING".toLowerCase(); | ||
|
||
// Default values | ||
public static final long BUFFER_FLUSH_CHECK_INTERVAL_IN_MILLIS_DEFAULT = 100; | ||
public static final long INSERT_THROTTLE_INTERVAL_IN_MILLIS_DEFAULT = 1000; | ||
|
@@ -64,6 +67,9 @@ public class ParameterProvider { | |
public static final Constants.BdecParquetCompression BDEC_PARQUET_COMPRESSION_ALGORITHM_DEFAULT = | ||
Constants.BdecParquetCompression.GZIP; | ||
|
||
public static final Constants.BinaryStringEncoding BINARY_STRING_ENCODING_DEFAULT = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add tests to |
||
Constants.BinaryStringEncoding.HEX; | ||
|
||
/* Parameter that enables using internal Parquet buffers for buffering of rows before serializing. | ||
It reduces memory consumption compared to using Java Objects for buffering.*/ | ||
public static final boolean ENABLE_PARQUET_INTERNAL_BUFFERING_DEFAULT = false; | ||
|
@@ -188,6 +194,13 @@ private void setParameterMap(Map<String, Object> parameterOverrides, Properties | |
BDEC_PARQUET_COMPRESSION_ALGORITHM_DEFAULT, | ||
parameterOverrides, | ||
props); | ||
|
||
this.updateValue( | ||
BINARY_STRING_ENCODING, | ||
BINARY_STRING_ENCODING_DEFAULT, | ||
parameterOverrides, | ||
props); | ||
|
||
} | ||
|
||
/** @return Longest interval in milliseconds between buffer flushes */ | ||
|
@@ -407,6 +420,18 @@ public Constants.BdecParquetCompression getBdecParquetCompressionAlgorithm() { | |
return Constants.BdecParquetCompression.fromName((String) val); | ||
} | ||
|
||
/** @return binary string encoding */ | ||
public Constants.BinaryStringEncoding getBinaryStringEncoding() { | ||
Object val = | ||
this.parameterMap.getOrDefault( | ||
BINARY_STRING_ENCODING, BINARY_STRING_ENCODING_DEFAULT); | ||
if (val instanceof Constants.BinaryStringEncoding) { | ||
return (Constants.BinaryStringEncoding) val; | ||
} | ||
return Constants.BinaryStringEncoding.fromName((String) val); | ||
} | ||
|
||
|
||
@Override | ||
public String toString() { | ||
return "ParameterProvider{" + "parameterMap=" + parameterMap + '}'; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,8 @@ | |
import java.util.Map; | ||
import java.util.Optional; | ||
import java.util.TimeZone; | ||
|
||
import net.snowflake.ingest.utils.Constants; | ||
import net.snowflake.ingest.utils.ErrorCode; | ||
import net.snowflake.ingest.utils.SFException; | ||
import org.apache.commons.codec.DecoderException; | ||
|
@@ -888,76 +890,76 @@ public void testValidateAndParseBinary() throws DecoderException { | |
assertArrayEquals( | ||
"honk".getBytes(StandardCharsets.UTF_8), | ||
validateAndParseBinary( | ||
"COL", "honk".getBytes(StandardCharsets.UTF_8), Optional.empty(), 0)); | ||
"COL", "honk".getBytes(StandardCharsets.UTF_8), Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need the same test coverage for base64, as we have for hex. |
||
|
||
assertArrayEquals( | ||
new byte[] {-1, 0, 1}, | ||
validateAndParseBinary("COL", new byte[] {-1, 0, 1}, Optional.empty(), 0)); | ||
validateAndParseBinary("COL", new byte[] {-1, 0, 1}, Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
assertArrayEquals( | ||
Hex.decodeHex("1234567890abcdef"), // pragma: allowlist secret NOT A SECRET | ||
validateAndParseBinary( | ||
"COL", | ||
"1234567890abcdef", // pragma: allowlist secret NOT A SECRET | ||
Optional.empty(), | ||
0)); // pragma: allowlist secret NOT A SECRET | ||
0, Constants.BinaryStringEncoding.HEX)); // pragma: allowlist secret NOT A SECRET | ||
assertArrayEquals( | ||
Hex.decodeHex("1234567890abcdef"), // pragma: allowlist secret NOT A SECRET | ||
validateAndParseBinary( | ||
"COL", | ||
" 1234567890abcdef \t\n", | ||
Optional.empty(), | ||
0)); // pragma: allowlist secret NOT A SECRET | ||
0, Constants.BinaryStringEncoding.HEX)); // pragma: allowlist secret NOT A SECRET | ||
|
||
assertArrayEquals( | ||
maxAllowedArray, validateAndParseBinary("COL", maxAllowedArray, Optional.empty(), 0)); | ||
maxAllowedArray, validateAndParseBinary("COL", maxAllowedArray, Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
assertArrayEquals( | ||
maxAllowedArrayMinusOne, | ||
validateAndParseBinary("COL", maxAllowedArrayMinusOne, Optional.empty(), 0)); | ||
validateAndParseBinary("COL", maxAllowedArrayMinusOne, Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
|
||
// Too large arrays should be rejected | ||
expectError( | ||
ErrorCode.INVALID_VALUE_ROW, | ||
() -> validateAndParseBinary("COL", new byte[1], Optional.of(0), 0)); | ||
() -> validateAndParseBinary("COL", new byte[1], Optional.of(0), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_VALUE_ROW, | ||
() -> validateAndParseBinary("COL", new byte[BYTES_8_MB + 1], Optional.empty(), 0)); | ||
() -> validateAndParseBinary("COL", new byte[BYTES_8_MB + 1], Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_VALUE_ROW, | ||
() -> validateAndParseBinary("COL", new byte[8], Optional.of(7), 0)); | ||
() -> validateAndParseBinary("COL", new byte[8], Optional.of(7), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_VALUE_ROW, | ||
() -> validateAndParseBinary("COL", "aabb", Optional.of(1), 0)); | ||
() -> validateAndParseBinary("COL", "aabb", Optional.of(1), 0, Constants.BinaryStringEncoding.HEX)); | ||
|
||
// unsupported data types should fail | ||
expectError( | ||
ErrorCode.INVALID_VALUE_ROW, | ||
() -> validateAndParseBinary("COL", "000", Optional.empty(), 0)); | ||
() -> validateAndParseBinary("COL", "000", Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_VALUE_ROW, | ||
() -> validateAndParseBinary("COL", "abcg", Optional.empty(), 0)); | ||
() -> validateAndParseBinary("COL", "abcg", Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_VALUE_ROW, () -> validateAndParseBinary("COL", "c", Optional.empty(), 0)); | ||
ErrorCode.INVALID_VALUE_ROW, () -> validateAndParseBinary("COL", "c", Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_FORMAT_ROW, | ||
() -> | ||
validateAndParseBinary( | ||
"COL", Arrays.asList((byte) 1, (byte) 2, (byte) 3), Optional.empty(), 0)); | ||
"COL", Arrays.asList((byte) 1, (byte) 2, (byte) 3), Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_FORMAT_ROW, () -> validateAndParseBinary("COL", 1, Optional.empty(), 0)); | ||
ErrorCode.INVALID_FORMAT_ROW, () -> validateAndParseBinary("COL", 1, Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_FORMAT_ROW, () -> validateAndParseBinary("COL", 12, Optional.empty(), 0)); | ||
ErrorCode.INVALID_FORMAT_ROW, () -> validateAndParseBinary("COL", 12, Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_FORMAT_ROW, | ||
() -> validateAndParseBinary("COL", 1.5, Optional.empty(), 0)); | ||
() -> validateAndParseBinary("COL", 1.5, Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_FORMAT_ROW, | ||
() -> validateAndParseBinary("COL", BigInteger.ONE, Optional.empty(), 0)); | ||
() -> validateAndParseBinary("COL", BigInteger.ONE, Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_FORMAT_ROW, | ||
() -> validateAndParseBinary("COL", false, Optional.empty(), 0)); | ||
() -> validateAndParseBinary("COL", false, Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectError( | ||
ErrorCode.INVALID_FORMAT_ROW, | ||
() -> validateAndParseBinary("COL", new Object(), Optional.empty(), 0)); | ||
() -> validateAndParseBinary("COL", new Object(), Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
} | ||
|
||
@Test | ||
|
@@ -1179,19 +1181,19 @@ public void testExceptionMessages() { | |
"The given row cannot be converted to the internal format: Object of type java.lang.Object" | ||
+ " cannot be ingested into Snowflake column COL of type BINARY, rowIndex:0. Allowed" | ||
+ " Java types: byte[], String", | ||
() -> validateAndParseBinary("COL", new Object(), Optional.empty(), 0)); | ||
() -> validateAndParseBinary("COL", new Object(), Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectErrorCodeAndMessage( | ||
ErrorCode.INVALID_VALUE_ROW, | ||
"The given row cannot be converted to the internal format due to invalid value: Value" | ||
+ " cannot be ingested into Snowflake column COL of type BINARY, rowIndex:0, reason:" | ||
+ " Binary too long: length=2 maxLength=1", | ||
() -> validateAndParseBinary("COL", new byte[] {1, 2}, Optional.of(1), 0)); | ||
() -> validateAndParseBinary("COL", new byte[] {1, 2}, Optional.of(1), 0, Constants.BinaryStringEncoding.HEX)); | ||
expectErrorCodeAndMessage( | ||
ErrorCode.INVALID_VALUE_ROW, | ||
"The given row cannot be converted to the internal format due to invalid value: Value" | ||
+ " cannot be ingested into Snowflake column COL of type BINARY, rowIndex:0, reason:" | ||
+ " Not a valid hex string", | ||
() -> validateAndParseBinary("COL", "ghi", Optional.empty(), 0)); | ||
() -> validateAndParseBinary("COL", "ghi", Optional.empty(), 0, Constants.BinaryStringEncoding.HEX)); | ||
|
||
// VARIANT | ||
expectErrorCodeAndMessage( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no wildcard imports.