-
Notifications
You must be signed in to change notification settings - Fork 304
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
HPCC-30457 ParquetIO.Read() improperly combines hThor and Thor files #17892
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -227,13 +227,48 @@ arrow::Status ParquetHelper::openReadFile() | |
StringBuffer filename; | ||
StringBuffer path; | ||
splitFilename(location.c_str(), nullptr, &path, &filename, nullptr, false); | ||
Owned<IDirectoryIterator> itr = createDirectoryIterator(path.str(), filename.append("*.parquet")); | ||
Owned<IDirectoryIterator> fileItr; | ||
|
||
// hThor and Thor files that were written with parquet have diffent naming schemes | ||
// Thor files are tagged with a worker ID while hThor files are single non-tagged files | ||
if (activityCtx->numSlaves() > 1) | ||
{ | ||
// Executing on Thor: Check for single files that were written by hThor or from external source | ||
Owned<IDirectoryIterator> singleItr = createDirectoryIterator(path.str(), filename.append(".parquet")); | ||
if (!singleItr || !singleItr->first()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. minor: createDirectoryIterator() never returns null, so no need to check. |
||
{ | ||
// Check for partitioned files which will have indexes prefixed to the filename | ||
Owned<IDirectoryIterator> multItr = createDirectoryIterator(path.str(), filename.insert(filename.length() - 8, '*')); | ||
fileItr = multItr; | ||
} | ||
else | ||
{ | ||
// There was a single file so check for partitioned files | ||
Owned<IDirectoryIterator> multItr = createDirectoryIterator(path.str(), filename.insert(filename.length() - 8, "[0-9]*")); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think this does what you expect. "[0-9]*" is not treated as a regular expression it is a filename wildcard containing '*' or '?'. I think you will need to check for filename* and then post-process. Or use "filename*.parquet"? |
||
if (!multItr || !multItr->isValid()) | ||
fileItr = singleItr; // If there aren't any partitioned files read the single file with Thor | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not unconditionally assign multiItr, neither of them contain any values |
||
else | ||
fileItr = multItr; | ||
} | ||
} | ||
else | ||
{ | ||
// When reading on hThor check for single file and only if there isn't a single file check for partitioned files to read | ||
Owned<IDirectoryIterator> singleItr = createDirectoryIterator(path.str(), filename.append(".parquet")); | ||
if (!singleItr || !singleItr->first()) | ||
{ | ||
Owned<IDirectoryIterator> multItr = createDirectoryIterator(path.str(), filename.insert(filename.length() - 8, '*')); | ||
fileItr = multItr; | ||
} | ||
else | ||
fileItr = singleItr; | ||
} | ||
|
||
auto reader_properties = parquet::ReaderProperties(pool); | ||
auto arrow_reader_props = parquet::ArrowReaderProperties(); | ||
ForEach (*itr) | ||
ForEach (*fileItr) | ||
{ | ||
IFile &file = itr->query(); | ||
IFile &file = fileItr->query(); | ||
parquet::arrow::FileReaderBuilder reader_builder; | ||
reportIfFailure(reader_builder.OpenFile(file.queryFilename(), false, reader_properties)); | ||
reader_builder.memory_pool(pool); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
typo: different