-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from jagedn/master
This is an initial upgrade of the plugin using an open source library (carpet) to read/write parquet files avoiding the complexity of using low level library. Carpet can read parquet files as Map but also you can provide a java Record class as projection so the library read only field presents in the Record Main changes: - upgrade to nextflow 24 - use carpet lib - refactor splitParquet - some validation use cases
- Loading branch information
Showing
32 changed files
with
204 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,8 @@ | ||
package nextflow.parquet | ||
|
||
import nextflow.plugin.extension.Factory | ||
import nextflow.plugin.extension.Function | ||
|
||
import java.nio.file.Path | ||
|
||
import groovy.transform.CompileStatic | ||
|
@@ -12,18 +15,16 @@ import nextflow.extension.CH | |
import nextflow.extension.DataflowHelper | ||
import nextflow.plugin.extension.Operator | ||
import nextflow.plugin.extension.PluginExtensionPoint | ||
import org.apache.hadoop.conf.Configuration | ||
import org.apache.hadoop.fs.Path as HadoopPath | ||
import org.apache.parquet.example.data.Group | ||
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter | ||
import org.apache.parquet.hadoop.ParquetFileReader | ||
import org.apache.parquet.hadoop.util.HadoopInputFile | ||
import org.apache.parquet.io.ColumnIOFactory | ||
|
||
import com.jerolba.carpet.CarpetReader | ||
import com.jerolba.carpet.CarpetWriter | ||
import org.apache.parquet.hadoop.ParquetFileWriter | ||
|
||
/** | ||
* Implements extensions for reading and writing Parquet files. | ||
* | ||
* @author Ben Sherman <[email protected]> | ||
* @author Jorge Aguilera <[email protected]> | ||
*/ | ||
@Slf4j | ||
@CompileStatic | ||
|
@@ -37,14 +38,14 @@ class ParquetExtension extends PluginExtensionPoint { | |
} | ||
|
||
/** | ||
* Load each Parquet file in a source channel, emitting each row as a separate item. | ||
* Load a Parquet file emitting each row as a separate item. | ||
* | ||
* @param path | ||
*/ | ||
@Operator | ||
DataflowWriteChannel splitParquet(DataflowReadChannel source) { | ||
DataflowWriteChannel splitParquet(DataflowReadChannel source, Map params=[:]) { | ||
final target = CH.create() | ||
final splitter = new ParquetSplitter(target) | ||
final splitter = new ParquetSplitter(target, params) | ||
|
||
final onNext = { it -> splitter.apply(it) } | ||
final onComplete = { target << Channel.STOP } | ||
|
@@ -55,66 +56,40 @@ class ParquetExtension extends PluginExtensionPoint { | |
|
||
class ParquetSplitter { | ||
private DataflowWriteChannel target | ||
private Class<Record> clazz | ||
|
||
ParquetSplitter(DataflowWriteChannel target) { | ||
ParquetSplitter(DataflowWriteChannel target, Map params) { | ||
this.target = target | ||
if( params.record ) { | ||
if (!(params.record instanceof Class<Record>)) { | ||
throw new IllegalArgumentException("A Record.class is required. Class provided $params.record") | ||
} | ||
this.clazz = params.record as Class<Record> | ||
} | ||
} | ||
|
||
void apply(Object source) { | ||
try { | ||
log.debug "Start reading $source, with projection ${clazz ?: 'raw'}" | ||
// create parquet reader | ||
final reader = ParquetFileReader.open(HadoopInputFile.fromPath(toHadoopPath(source), new Configuration())) | ||
final schema = reader.getFooter().getFileMetaData().getSchema() | ||
final fields = schema.getFields() | ||
|
||
// read each row from parquet file | ||
def pages = null | ||
try { | ||
while( (pages=reader.readNextRowGroup()) != null ) { | ||
final rows = pages.getRowCount() | ||
final columnIO = new ColumnIOFactory().getColumnIO(schema) | ||
final recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema)) | ||
|
||
for( int i = 0; i < rows; i++ ) | ||
target << fetchRow(recordReader.read()) | ||
} | ||
} | ||
finally { | ||
reader.close() | ||
final reader = new CarpetReader(toFile(source), clazz ?: Map) | ||
for (def record : reader) { | ||
target << record | ||
} | ||
} | ||
catch( IOException e ) { | ||
throw new IllegalStateException("Error while reading Parquet file - cause: ${e.message ?: e}", e) | ||
} | ||
} | ||
|
||
private HadoopPath toHadoopPath(Object source) { | ||
if( source instanceof String ) | ||
new HadoopPath((String)source) | ||
else if( source instanceof Path ) | ||
new HadoopPath(((Path)source).toUriString()) | ||
else | ||
throw new IllegalArgumentException("Invalid input for splitParquet operator: ${source}") | ||
} | ||
|
||
private Map fetchRow(Group group) { | ||
def result = [:] | ||
|
||
final fieldCount = group.getType().getFieldCount() | ||
for( int field = 0; field < fieldCount; field++ ) { | ||
final valueCount = group.getFieldRepetitionCount(field) | ||
final fieldType = group.getType().getType(field) | ||
final fieldName = fieldType.getName() | ||
|
||
for( int index = 0; index < valueCount; index++ ) | ||
if( fieldType.isPrimitive() ) { | ||
println "${fieldName} ${group.getValueToString(field, index)}" | ||
result[fieldName] = group.getValueToString(field, index) | ||
} | ||
private File toFile(Object source){ | ||
return switch( source ){ | ||
case {it instanceof String}->Path.of(source as String).toFile() | ||
case {it instanceof Path} -> (source as Path).toFile() | ||
default->throw new IllegalArgumentException("Invalid input for splitParquet operator: ${source}") | ||
} | ||
|
||
return result | ||
} | ||
|
||
} | ||
|
||
/** | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,18 +17,26 @@ | |
package nextflow.parquet | ||
|
||
import groovy.transform.CompileStatic | ||
import groovy.util.logging.Slf4j | ||
import nextflow.plugin.BasePlugin | ||
import org.pf4j.PluginWrapper | ||
|
||
/** | ||
* Implements the nf-parquet plugin entry point | ||
* | ||
* @author Ben Sherman <[email protected]> | ||
* @author Jorge Aguilera <[email protected]> | ||
*/ | ||
@CompileStatic | ||
@Slf4j | ||
class ParquetPlugin extends BasePlugin { | ||
|
||
ParquetPlugin(PluginWrapper wrapper) { | ||
super(wrapper) | ||
initPlugin() | ||
} | ||
|
||
private void initPlugin(){ | ||
log.debug "${this.class.name} plugin initialized" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
98 changes: 98 additions & 0 deletions
98
plugins/nf-parquet/src/test/nextflow/parquet/PluginTest.groovy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
package nextflow.parquet | ||
|
||
import nextflow.Channel | ||
import nextflow.plugin.Plugins | ||
import nextflow.plugin.TestPluginDescriptorFinder | ||
import nextflow.plugin.TestPluginManager | ||
import nextflow.plugin.extension.PluginExtensionProvider | ||
import org.pf4j.PluginDescriptorFinder | ||
import spock.lang.Shared | ||
import test.Dsl2Spec | ||
import test.MockScriptRunner | ||
|
||
import java.nio.file.Files | ||
import java.nio.file.Path | ||
import java.util.jar.Manifest | ||
|
||
class PluginTest extends Dsl2Spec{ | ||
|
||
@Shared String pluginsMode | ||
|
||
def setup() { | ||
// reset previous instances | ||
PluginExtensionProvider.reset() | ||
// this need to be set *before* the plugin manager class is created | ||
pluginsMode = System.getProperty('pf4j.mode') | ||
System.setProperty('pf4j.mode', 'dev') | ||
// the plugin root should | ||
def root = Path.of('.').toAbsolutePath().normalize() | ||
def manager = new TestPluginManager(root){ | ||
@Override | ||
protected PluginDescriptorFinder createPluginDescriptorFinder() { | ||
return new TestPluginDescriptorFinder(){ | ||
@Override | ||
protected Manifest readManifestFromDirectory(Path pluginPath) { | ||
def manifestPath= getManifestPath(pluginPath) | ||
final input = Files.newInputStream(manifestPath) | ||
return new Manifest(input) | ||
} | ||
protected Path getManifestPath(Path pluginPath) { | ||
return pluginPath.resolve('build/resources/main/META-INF/MANIFEST.MF') | ||
} | ||
} | ||
} | ||
} | ||
Plugins.init(root, 'dev', manager) | ||
} | ||
|
||
def cleanup() { | ||
Plugins.stop() | ||
PluginExtensionProvider.reset() | ||
pluginsMode ? System.setProperty('pf4j.mode',pluginsMode) : System.clearProperty('pf4j.mode') | ||
} | ||
|
||
def 'should starts' () { | ||
when: | ||
def SCRIPT = ''' | ||
channel.of('hi!') | ||
''' | ||
and: | ||
def result = new MockScriptRunner([:]).setScript(SCRIPT).execute() | ||
then: | ||
result.val == 'hi!' | ||
result.val == Channel.STOP | ||
} | ||
|
||
def 'should parse a parquet file in raw mode'(){ | ||
when: | ||
def path = getClass().getResource('/test.parquet').toURI().path | ||
def SCRIPT = """ | ||
include {splitParquet} from 'plugin/nf-parquet' | ||
channel.fromPath("$path").splitParquet() | ||
""".toString() | ||
and: | ||
def result = new MockScriptRunner([:]).setScript(SCRIPT).execute() | ||
then: | ||
result.val == [id:1, name:"test2", sizell:10, value:0.010838246310055144, percentile:0.28001529169191186] | ||
result.val == Channel.STOP | ||
} | ||
|
||
def 'should parse a projection'(){ | ||
when: | ||
def path = getClass().getResource('/test.parquet').toURI().path | ||
def SCRIPT = """ | ||
include {splitParquet} from 'plugin/nf-parquet' | ||
record SingleRecord(long id, String name) { | ||
} | ||
channel.fromPath("$path").splitParquet( [record:SingleRecord] ) | ||
""".toString() | ||
and: | ||
def result = new MockScriptRunner([:]).setScript(SCRIPT).execute() | ||
then: | ||
result.val.id == 1 | ||
result.val == Channel.STOP | ||
} | ||
|
||
} |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
plugins { | ||
id "nf-parquet@${System.getenv("PARQUET_PLUGIN_VERSION") ?: "latest"}" | ||
} | ||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
include { splitParquet } from 'plugin/nf-parquet' | ||
|
||
record SingleRecord(long id, String name) { | ||
} | ||
|
||
channel.fromPath("test*.parquet").splitParquet( record: SingleRecord) | ||
| view |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
include { splitParquet } from 'plugin/nf-parquet' | ||
|
||
import myrecords.* | ||
|
||
channel.fromPath("presidents.parquet").splitParquet() | ||
| view |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
include { splitParquet } from 'plugin/nf-parquet' | ||
|
||
import myrecords.* | ||
|
||
channel.fromPath("*.parquet").splitParquet() | ||
| view |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
package myrecords; | ||
|
||
record Address(String street, String zip, String city) { } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
package myrecords; | ||
record CustomRecord(long id, String name, int sizell, double value, double percentile) { | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
package myrecords; | ||
|
||
record Job(String company, String position, int years){ } |
Oops, something went wrong.