Skip to content

Commit

Permalink
HPCC-30416 Change Parquet Plugin ECL interface
Browse files Browse the repository at this point in the history
  • Loading branch information
jackdelv committed Oct 3, 2023
1 parent 8b3ec0a commit 0929b3e
Show file tree
Hide file tree
Showing 12 changed files with 48 additions and 45 deletions.
6 changes: 3 additions & 3 deletions plugins/parquet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ The Parquet Plugin offers the following main functions:
The Read function allows ECL programmers to create an ECL dataset from both regular and partitioned Parquet files. It leverages the Apache Arrow interface for Parquet to efficiently stream data from ECL to the plugin, ensuring optimized data transfer.

```
dataset := Read(layout, '/source/directory/data.parquet');
dataset := ParquetIO.Read(layout, '/source/directory/data.parquet');
```

#### 2. Writing Parquet Files

The Write function empowers ECL programmers to write ECL datasets to Parquet files. By leveraging the Parquet format's columnar storage capabilities, this function provides efficient compression and optimized storage for data.

```
Write(inDataset, '/output/directory/data.parquet');
ParquetIO.Write(inDataset, '/output/directory/data.parquet');
```

### Partitioned Files (Tabular Datasets)
Expand All @@ -51,7 +51,7 @@ Write(inDataset, '/output/directory/data.parquet');
The Read Partition function extends the Read functionality by enabling ECL programmers to read from partitioned Parquet files.

```
github_dataset := ReadPartition(layout, '/source/directory/partitioned_dataset');
github_dataset := ParquetIO.ReadPartition(layout, '/source/directory/partitioned_dataset');
```

#### 2. Writing Partitioned Files
Expand Down
4 changes: 2 additions & 2 deletions plugins/parquet/examples/blob_test.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ END;
#IF(0)
in_image_data := DATASET('~parquet::image', imageRecord, FLAT);
OUTPUT(in_image_data, NAMED('IN_IMAGE_DATA'));
PARQUET.Write(in_image_data, '/datadrive/dev/test_data/test_image.parquet');
ParquetIO.Write(in_image_data, '/datadrive/dev/test_data/test_image.parquet');

#END;

#IF(1)
out_image_data := Read({DATA image}, '/datadrive/dev/test_data/test_image.parquet');
out_image_data := ParquetIO.Read({DATA image}, '/datadrive/dev/test_data/test_image.parquet');
OUTPUT(out_image_data, NAMED('OUT_IMAGE_DATA'));
#END
6 changes: 3 additions & 3 deletions plugins/parquet/examples/create_partition.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ layout := RECORD
END;

#IF(0)
github_dataset := Read(layout, '/datadrive/dev/test_data/ghtorrent-2019-01-07.parquet');
github_dataset := ParquetIO.Read(layout, '/datadrive/dev/test_data/ghtorrent-2019-01-07.parquet');
Write(DISTRIBUTE(github_dataset, SKEW(.05)), '/datadrive/dev/test_data/hpcc_gh_partition/data.parquet');
#END

#IF(1)
github_dataset := ReadPartition(layout, '/datadrive/dev/test_data/hpcc_gh_partition');
OUTPUT(COUNT(github_dataset), NAMED('GITHUB_PARTITION'));
github_dataset := ParquetIO.ReadPartition(layout, '/datadrive/dev/test_data/hpcc_gh_partition');
OUTPUT(CHOOSEN(github_dataset, 10000), NAMED('GITHUB_PARTITION'));
#END
4 changes: 2 additions & 2 deletions plugins/parquet/examples/decimal_test.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ END;
decimal_data := DATASET([{152.25}, {125.56}], layout);

#IF(1)
Write(decimal_data, '/datadrive/dev/test_data/decimal.parquet');
ParquetIO.Write(decimal_data, '/datadrive/dev/test_data/decimal.parquet');
#END

#IF(1)
Read(layout, '/datadrive/dev/test_data/decimal.parquet');
ParquetIO.Read(layout, '/datadrive/dev/test_data/decimal.parquet');
#END
4 changes: 2 additions & 2 deletions plugins/parquet/examples/large_io.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ END;

#IF(0)
csv_data := DATASET('~parquet::large::ghtorrent-2019-02-04.csv', layout, CSV(HEADING(1)));
Write(csv_data, '/datadrive/dev/test_data/ghtorrent-2019-02-04.parquet');
ParquetIO.Write(csv_data, '/datadrive/dev/test_data/ghtorrent-2019-02-04.parquet');
#END

#IF(1)
parquet_data := Read(layout, '/datadrive/dev/test_data/hpcc_gh_partition/data.parquet');
parquet_data := ParquetIO.Read(layout, '/datadrive/dev/test_data/hpcc_gh_partition/data.parquet');
OUTPUT(COUNT(parquet_data), NAMED('ghtorrent_2019_01_07'));
#END
4 changes: 2 additions & 2 deletions plugins/parquet/examples/nested_io.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ nested_dataset := DATASET([{U'J\353ck', U'\353ackson', { {22, 2, ['James', 'Jona
{'Amy', U'Amy\353on', { {59, 1, ['Andy']}, 3.9, 59}}, {'Grace', U'Graceso\353', { {11, 3, ['Grayson', 'Gina', 'George']}, 7.9, 100}}], parentRec);

#IF(1)
Write(nested_dataset, '/datadrive/dev/test_data/nested.parquet');
ParquetIO.Write(nested_dataset, '/datadrive/dev/test_data/nested.parquet');
#END

#IF(1)
read_in := Read(parentRec, '/datadrive/dev/test_data/nested.parquet');
read_in := ParquetIO.Read(parentRec, '/datadrive/dev/test_data/nested.parquet');
OUTPUT(read_in, NAMED('NESTED_PARQUET_IO'));
#END
4 changes: 2 additions & 2 deletions plugins/parquet/examples/null_test.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ empty_record :=
end;
empty_dataset := dataset([{-2, ' ', '-2 ', 'NA ', 0, 0, ' ', ' ', 0, ' ', ' ', 0, 0, 0, ' ', ' ', 0, ' ', ' ', 0, 0, 0, ' ', ' ', 0, 0, 0, ' ', ' ', 0, 0, 0, 0, ' ', 'N', ' ', 20151105, 20151105, ' ', ' ', 0, 5, 0, '0', '0', '0', 0, 'N', 0, ' ', ' ', ' ', ' ', ' ', ' ', 0, 0, ' ', '0', '\123\000\000\000\123'}], empty_record);

Write(empty_dataset, '/datadrive/dev/test_data/empty_parquet.parquet');
ParquetIO.Write(empty_dataset, '/datadrive/dev/test_data/empty_parquet.parquet');

empty_data_in := Read(empty_record, '/datadrive/dev/test_data/empty_parquet.parquet');
empty_data_in := ParquetIO.Read(empty_record, '/datadrive/dev/test_data/empty_parquet.parquet');
OUTPUT(empty_data_in);
4 changes: 2 additions & 2 deletions plugins/parquet/examples/parquet_read_write.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ layout := RECORD
INTEGER commit_date;
END;

parquet_data := Read(layout, '/datadrive/dev/test_data/ghtorrent-2019-01-07.parquet');
parquet_data := ParquetIO.Read(layout, '/datadrive/dev/test_data/ghtorrent-2019-01-07.parquet');
part_a := CHOOSEN(parquet_data, 1000);
OUTPUT(CHOOSEN(DATASET('~parquet::large::ghtorrent-2019-02-04.csv', layout, CSV(HEADING(1))), 1000), NAMED('CSV_GHTORRENT'));
OUTPUT(part_a, NAMED('GHTORRENT'));

Write(part_a, '/datadrive/dev/test_data/github_partition/part_a.parquet');
ParquetIO.Write(part_a, '/datadrive/dev/test_data/github_partition/part_a.parquet');
2 changes: 1 addition & 1 deletion plugins/parquet/examples/read_file.ecl
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
IMPORT Parquet;

Parquet.Read({INTEGER n}, '/var/lib/HPCCSystems/mydropzone/sample.parquet');
ParquetIO.Read({INTEGER n}, '/var/lib/HPCCSystems/mydropzone/sample.parquet');
4 changes: 2 additions & 2 deletions plugins/parquet/examples/simple_io.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ simpleRec := RECORD
END;
simpleDataset := DATASET([{1, 2.4356, U'de\3531', 'Jack'}, {1, 2.4356, U'de\3531', 'Jack'}, {2, 4.8937, U'as\352df', 'John'}, {3, 1.8573, 'nj\351vf', 'Jane'}, {4, 9.1235, U'ds\354fg', 'Jill'}, {5, 6.3297, U'po\355gm', 'Jim'}], simpleRec);

Write(simpleDataset, '/datadrive/dev/test_data/simple.parquet');
ParquetIO.Write(simpleDataset, '/datadrive/dev/test_data/simple.parquet');

read_in := Read(simpleRec, '/datadrive/dev/test_data/simple.parquet');
read_in := ParquetIO.Read(simpleRec, '/datadrive/dev/test_data/simple.parquet');
OUTPUT(read_in, NAMED('SIMPLE_PARQUET_IO'));
2 changes: 1 addition & 1 deletion plugins/parquet/examples/write_file.ecl
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ ds := DATASET
DISTRIBUTED
);

Parquet.Write(ds, '/var/lib/HPCCSystems/mydropzone/sample.parquet');
ParquetIO.Write(ds, '/var/lib/HPCCSystems/mydropzone/sample.parquet');
49 changes: 26 additions & 23 deletions plugins/parquet/parquet.ecllib
Original file line number Diff line number Diff line change
Expand Up @@ -19,29 +19,32 @@ SHARED ParquetService := SERVICE : plugin('parquetembed')
BOOLEAN getEmbedContext():cpp, pure, namespace='parquetembed', entrypoint='getEmbedContext', prototype='IEmbedContext* getEmbedContext()';
END;

EXPORT Read(resultLayout, filePath) := FUNCTIONMACRO
STREAMED DATASET(resultLayout) _DoParquetRead() := EMBED(parquet : activity, option('read'), location(filePath))
ENDEMBED;
RETURN _DoParquetRead();
ENDMACRO;

EXPORT ReadPartition(resultLayout, basePath) := FUNCTIONMACRO
STREAMED DATASET(resultLayout) _DoParquetReadPartition() := EMBED(parquet: activity, option('readpartition'), location(basePath))
ENDEMBED;
RETURN _DoParquetReadPartition();
ENDMACRO;

EXPORT Write(outDS, filePath) := MACRO
_DoParquetWrite(STREAMED DATASET(RECORDOF(outDS)) _ds) := EMBED(parquet : activity, option('write'), destination(filePath))
ENDEMBED;
_doParquetWrite(outDS);
ENDMACRO;

EXPORT WritePartition(outDS, outRows = 100000, basePath) := MACRO
_DoParquetWritePartition(STREAMED DATASET(RECORDOF(outDS)) _ds) := EMBED(parquet : activity, option('writepartition'), destination(basePath), MaxRowSize(outRows))
ENDEMBED;
_DoParquetWritePartition(outDS)
ENDMACRO;
EXPORT ParquetIO := MODULE

EXPORT Read(resultLayout, filePath) := FUNCTIONMACRO
LOCAL STREAMED DATASET(resultLayout) _DoParquetRead() := EMBED(parquet : activity, option('read'), location(filePath))
ENDEMBED;
RETURN _DoParquetRead();
ENDMACRO;

EXPORT ReadPartition(resultLayout, basePath) := FUNCTIONMACRO
LOCAL STREAMED DATASET(resultLayout) _DoParquetReadPartition() := EMBED(parquet: activity, option('readpartition'), location(basePath))
ENDEMBED;
RETURN _DoParquetReadPartition();
ENDMACRO;

EXPORT Write(outDS, filePath) := MACRO
LOCAL _DoParquetWrite(STREAMED DATASET(RECORDOF(outDS)) _ds) := EMBED(parquet : activity, option('write'), destination(filePath))
ENDEMBED;
_doParquetWrite(outDS);
ENDMACRO;

EXPORT WritePartition(outDS, outRows = 100000, basePath) := MACRO
LOCAL _DoParquetWritePartition(STREAMED DATASET(RECORDOF(outDS)) _ds) := EMBED(parquet : activity, option('writepartition'), destination(basePath), MaxRowSize(outRows))
ENDEMBED;
_DoParquetWritePartition(outDS)
ENDMACRO;
END;

EXPORT getEmbedContext := ParquetService.getEmbedContext;
EXPORT BOOLEAN supportsImport := FALSE;
Expand Down

0 comments on commit 0929b3e

Please sign in to comment.