SNOW-902709 Limit the max allowed number of chunks in blob #580
@@ -452,7 +452,53 @@ ChannelsStatusResponse getChannelsStatus(
    * @param blobs list of uploaded blobs
    */
   void registerBlobs(List<BlobMetadata> blobs) {
-    this.registerBlobs(blobs, 0);
+    for (List<BlobMetadata> blobBatch : partitionBlobListForRegistrationRequest(blobs)) {
+      this.registerBlobs(blobBatch, 0);
+    }
   }
 
+  /**
+   * Partition the collection of blobs into sub-lists, so that the total number of chunks in each
+   * sublist does not exceed the max allowed number of chunks in one registration request.
+   */
+  List<List<BlobMetadata>> partitionBlobListForRegistrationRequest(List<BlobMetadata> blobs) {
+    List<List<BlobMetadata>> result = new ArrayList<>();
+    List<BlobMetadata> currentBatch = new ArrayList<>();
+    int chunksInCurrentBatch = 0;
+    int maxChunksInBlobAndRegistrationRequest =
+        parameterProvider.getMaxChunksInBlobAndRegistrationRequest();
+
+    for (BlobMetadata blob : blobs) {
+      if (blob.getChunks().size() > maxChunksInBlobAndRegistrationRequest) {
+        throw new SFException(
+            ErrorCode.INTERNAL_ERROR,
+            String.format(
+                "Incorrectly generated blob detected - number of chunks in the blob is larger than"
+                    + " the max allowed number of chunks. Please report this bug to Snowflake."
+                    + " bdec=%s chunkCount=%d maxAllowedChunkCount=%d",
+                blob.getPath(), blob.getChunks().size(), maxChunksInBlobAndRegistrationRequest));
+      }
+
+      if (chunksInCurrentBatch + blob.getChunks().size() > maxChunksInBlobAndRegistrationRequest) {
+        // Newly added BDEC file would exceed the max number of chunks in a single registration
+        // request. We put chunks collected so far into the result list and create a new batch with
+        // the current blob
+        result.add(currentBatch);
[Reviewer] nit: This probably can't happen as long as …
[Author] Good point, I added validation that …
+        currentBatch = new ArrayList<>();
+        currentBatch.add(blob);
+        chunksInCurrentBatch = blob.getChunks().size();
+      } else {
+        // Newly added BDEC can be added to the current batch because it does not exceed the max
+        // number of chunks in a single registration request, yet.
+        currentBatch.add(blob);
+        chunksInCurrentBatch += blob.getChunks().size();
+      }
+    }
+
+    if (!currentBatch.isEmpty()) {
+      result.add(currentBatch);
+    }
+    return result;
+  }
+
   /**
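For quick reference, here is a minimal standalone sketch of the same greedy batching idea. It is not the SDK code: the class BlobBatchingSketch, the SimpleBlob stand-in, and the sample paths are hypothetical; SimpleBlob.chunkCount plays the role of BlobMetadata.getChunks().size(), and the limit is passed in directly instead of coming from ParameterProvider.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class BlobBatchingSketch {

  /** Hypothetical stand-in for BlobMetadata; only the chunk count matters for batching. */
  static class SimpleBlob {
    final String path;
    final int chunkCount;

    SimpleBlob(String path, int chunkCount) {
      this.path = path;
      this.chunkCount = chunkCount;
    }
  }

  /**
   * Greedy batching: start a new batch whenever adding the next blob would push the running
   * chunk total past the per-request limit. A single blob over the limit is rejected outright.
   */
  static List<List<SimpleBlob>> partition(List<SimpleBlob> blobs, int maxChunksPerRequest) {
    List<List<SimpleBlob>> result = new ArrayList<>();
    List<SimpleBlob> currentBatch = new ArrayList<>();
    int chunksInCurrentBatch = 0;

    for (SimpleBlob blob : blobs) {
      if (blob.chunkCount > maxChunksPerRequest) {
        throw new IllegalArgumentException("Blob " + blob.path + " exceeds the per-request limit");
      }
      if (chunksInCurrentBatch + blob.chunkCount > maxChunksPerRequest) {
        // Close the current batch and start a new one for this blob
        result.add(currentBatch);
        currentBatch = new ArrayList<>();
        chunksInCurrentBatch = 0;
      }
      currentBatch.add(blob);
      chunksInCurrentBatch += blob.chunkCount;
    }
    if (!currentBatch.isEmpty()) {
      result.add(currentBatch);
    }
    return result;
  }

  public static void main(String[] args) {
    List<SimpleBlob> blobs =
        Arrays.asList(
            new SimpleBlob("a.bdec", 2), new SimpleBlob("b.bdec", 2), new SimpleBlob("c.bdec", 3));
    // With a limit of 5 chunks per request, a and b fit together (4 chunks); adding c (3 more)
    // would exceed 5, so c starts a new batch. The result is [[a, b], [c]].
    System.out.println(partition(blobs, 5).size()); // prints 2
  }
}

The real method differs only in that it reads the limit from ParameterProvider and reports an over-sized blob as an SFException with ErrorCode.INTERNAL_ERROR, as in the diff above.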
@@ -0,0 +1,98 @@
package net.snowflake.ingest.streaming.internal;

import static net.snowflake.ingest.utils.Constants.ROLE;

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Collections;
import java.util.Map;
import java.util.Properties;
import net.snowflake.ingest.TestUtils;
import net.snowflake.ingest.streaming.OpenChannelRequest;
import net.snowflake.ingest.streaming.SnowflakeStreamingIngestChannel;
import net.snowflake.ingest.streaming.SnowflakeStreamingIngestClient;
import net.snowflake.ingest.streaming.SnowflakeStreamingIngestClientFactory;
import net.snowflake.ingest.utils.Constants;
import net.snowflake.ingest.utils.ParameterProvider;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
/**
 * Verifies that ingestion works when we ingest into a large number of tables from the same client
 * and blobs and registration requests have to be cut, so they don't contain a large number of
 * chunks
 */
public class ManyTablesIT {

  private static final int TABLES_COUNT = 20;
  private static final int TOTAL_ROWS_COUNT = 200_000;
  private String dbName;
  private SnowflakeStreamingIngestClient client;
  private Connection connection;
  private SnowflakeStreamingIngestChannel[] channels;
  private String[] offsetTokensPerChannel;

  @Before
  public void setUp() throws Exception {
    Properties props = TestUtils.getProperties(Constants.BdecVersion.THREE, false);
    props.put(ParameterProvider.MAX_CHUNKS_IN_BLOB_AND_REGISTRATION_REQUEST, 2);
    if (props.getProperty(ROLE).equals("DEFAULT_ROLE")) {
      props.setProperty(ROLE, "ACCOUNTADMIN");
    }
    client = SnowflakeStreamingIngestClientFactory.builder("client1").setProperties(props).build();
    connection = TestUtils.getConnection(true);
    dbName = String.format("sdk_it_many_tables_db_%d", System.nanoTime());

    channels = new SnowflakeStreamingIngestChannel[TABLES_COUNT];
    offsetTokensPerChannel = new String[TABLES_COUNT];
    connection.createStatement().execute(String.format("create database %s;", dbName));

    String[] tableNames = new String[TABLES_COUNT];
    for (int i = 0; i < tableNames.length; i++) {
      tableNames[i] = String.format("table_%d", i);
      connection.createStatement().execute(String.format("create table table_%d(c int);", i));
      channels[i] =
          client.openChannel(
              OpenChannelRequest.builder(String.format("channel-%d", i))
                  .setDBName(dbName)
                  .setSchemaName("public")
                  .setTableName(tableNames[i])
                  .setOnErrorOption(OpenChannelRequest.OnErrorOption.ABORT)
                  .build());
    }
  }

  @After
  public void tearDown() throws Exception {
    connection.createStatement().execute(String.format("drop database %s;", dbName));
    client.close();
    connection.close();
  }

  @Test
  public void testIngestionIntoManyTables() throws InterruptedException, SQLException {
    for (int i = 0; i < TOTAL_ROWS_COUNT; i++) {
      Map<String, Object> row = Collections.singletonMap("c", i);
      String offset = String.valueOf(i);
      int channelId = i % channels.length;
      channels[channelId].insertRow(row, offset);
      offsetTokensPerChannel[channelId] = offset;
    }

    for (int i = 0; i < channels.length; i++) {
      TestUtils.waitForOffset(channels[i], offsetTokensPerChannel[i]);
    }

    int totalRowsCount = 0;
    ResultSet rs =
        connection
            .createStatement()
            .executeQuery(String.format("show tables in database %s;", dbName));
    while (rs.next()) {
      totalRowsCount += rs.getInt("rows");
    }
    Assert.assertEquals(TOTAL_ROWS_COUNT, totalRowsCount);
  }
}
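As a usage note, the limit the test lowers is an ordinary client parameter. Below is a hypothetical client setup (connection properties omitted, class and client names illustrative) that caps chunks per blob and per registration request; the property key and factory call are the same ones the test above uses.

import java.util.Properties;
import net.snowflake.ingest.streaming.SnowflakeStreamingIngestClient;
import net.snowflake.ingest.streaming.SnowflakeStreamingIngestClientFactory;
import net.snowflake.ingest.utils.ParameterProvider;

public class ChunkCapClientSketch {
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    // Account, user, key and other connection properties omitted; fill in as for any client.
    // Cap how many chunks may end up in one BDEC blob / one blob registration request.
    props.put(ParameterProvider.MAX_CHUNKS_IN_BLOB_AND_REGISTRATION_REQUEST, 2);

    SnowflakeStreamingIngestClient client =
        SnowflakeStreamingIngestClientFactory.builder("chunk_cap_client")
            .setProperties(props)
            .build();
    try {
      // Open channels and ingest as usual; blobs and registration requests are now
      // partitioned so that none carries more chunks than the configured limit.
    } finally {
      client.close();
    }
  }
}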
[Reviewer] Could the logic here be combined with snowflake-ingest-java/src/main/java/net/snowflake/ingest/streaming/internal/FlushService.java, line 407 (commit 3a3cbc8)?
[Author] It could, but the split there can be triggered in the middle of a chunk, which makes sense when we split based on chunk/blob size, but not for the newly added check. The added check only cares about the number of chunks in a blob, so I put the break just before a new chunk is about to start.
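To make that distinction concrete, here is a hypothetical sketch of a chunk-count-based cut in a blob-assembly loop. It is not the actual FlushService code: the method name, the generic chunk type, and the helper class are illustrative; it only shows why cutting before the next chunk starts guarantees a chunk is never split across blobs.

import java.util.ArrayList;
import java.util.List;

public class BlobCutSketch {

  /**
   * Hypothetical sketch: group pending chunks into blobs, cutting a blob just before a new chunk
   * would exceed maxChunksInBlob. Because the cut happens before the next chunk is added, no
   * chunk is ever split across two blobs.
   */
  static <T> List<List<T>> groupChunksIntoBlobs(List<T> pendingChunks, int maxChunksInBlob) {
    List<List<T>> blobs = new ArrayList<>();
    List<T> currentBlob = new ArrayList<>();
    for (T chunk : pendingChunks) {
      if (currentBlob.size() >= maxChunksInBlob) {
        blobs.add(currentBlob); // cut the blob before starting another chunk
        currentBlob = new ArrayList<>();
      }
      currentBlob.add(chunk);
    }
    if (!currentBlob.isEmpty()) {
      blobs.add(currentBlob);
    }
    return blobs;
  }
}

A size-based split, by contrast, can fire while a chunk is still being assembled, which is why the two checks are kept separate in the PR.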