Skip to content

Commit

Permalink
HPCC-30776 Add support for generic file formats to ECL
Browse files Browse the repository at this point in the history
  • Loading branch information
dcamper committed Feb 9, 2024
1 parent 5f9b805 commit b0276d1
Show file tree
Hide file tree
Showing 12 changed files with 128 additions and 19 deletions.
2 changes: 2 additions & 0 deletions ecl/hql/hqlatoms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ IAtom * fieldsAtom;
IAtom * __fileposAtom;
IAtom * _signed_Atom;
IAtom * filenameAtom;
IAtom * fileTypeAtom;
IAtom * filepositionAtom;
IAtom * _files_Atom;
IAtom * filterAtom;
Expand Down Expand Up @@ -661,6 +662,7 @@ MODULE_INIT(INIT_PRIORITY_HQLATOM)
MAKEATOM(field);
MAKEATOM(fields);
MAKEATOM(filename);
MAKEATOM(fileType);
MAKEATOM(__filepos);
MAKESYSATOM(signed);
MAKEATOM(fileposition);
Expand Down
1 change: 1 addition & 0 deletions ecl/hql/hqlatoms.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ extern HQL_API IAtom * fewAtom;
extern HQL_API IAtom * fieldAtom;
extern HQL_API IAtom * fieldsAtom;
extern HQL_API IAtom * filenameAtom;
extern HQL_API IAtom * fileTypeAtom;
extern HQL_API IAtom * __fileposAtom;
extern HQL_API IAtom * _signed_Atom;
extern HQL_API IAtom * filepositionAtom;
Expand Down
3 changes: 2 additions & 1 deletion ecl/hql/hqlattr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,7 @@ unsigned getOperatorMetaFlags(node_operator op)
case no_thor:
case no_flat:
case no_pipe:
case no_filetype:
case no_joined:
case no_any:
case no_xml:
Expand Down Expand Up @@ -622,7 +623,7 @@ unsigned getOperatorMetaFlags(node_operator op)

case no_unused6:
case no_unused13: case no_unused14: case no_unused15:
case no_unused34: case no_unused35: case no_unused36: case no_unused37: case no_unused38:
case no_unused35: case no_unused36: case no_unused37: case no_unused38:
case no_unused40: case no_unused41: case no_unused42: case no_unused43: case no_unused44: case no_unused45: case no_unused46: case no_unused47: case no_unused48: case no_unused49:
case no_unused50: case no_unused52:
case no_unused80:
Expand Down
4 changes: 3 additions & 1 deletion ecl/hql/hqlexpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1604,6 +1604,7 @@ const char *getOpString(node_operator op)
case no_csv: return "CSV";
case no_xml: return "XML";
case no_json: return "JSON";
case no_filetype: return "TYPE";

case no_when: return "WHEN";
case no_priority: return "PRIORITY";
Expand Down Expand Up @@ -2020,7 +2021,7 @@ const char *getOpString(node_operator op)

case no_unused6:
case no_unused13: case no_unused14: case no_unused15:
case no_unused34: case no_unused35: case no_unused36: case no_unused37: case no_unused38:
case no_unused35: case no_unused36: case no_unused37: case no_unused38:
case no_unused40: case no_unused41: case no_unused42: case no_unused43: case no_unused44: case no_unused45: case no_unused46: case no_unused47: case no_unused48: case no_unused49:
case no_unused50: case no_unused52:
case no_unused80:
Expand Down Expand Up @@ -5095,6 +5096,7 @@ unsigned CHqlRealExpression::getCachedEclCRC()
case no_csv:
case no_xml:
case no_json:
case no_filetype:
case no_null:
if (thisType && (thisType->getTypeCode() == type_null))
thisType = nullptr;
Expand Down
2 changes: 1 addition & 1 deletion ecl/hql/hqlexpr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ enum node_operator : unsigned short {
no_unlikely,
no_inline,
no_nwaydistribute,
no_unused34,
no_filetype, // File format/type information (input or output)
no_unused35,
no_unused36,
no_unused37,
Expand Down
2 changes: 2 additions & 0 deletions ecl/hql/hqlgram.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,8 @@ class HqlGram : implements IErrorReceiver, public CInterface
void checkValidPipeRecord(const attribute & errpos, IHqlExpression * record, IHqlExpression * attrs, IHqlExpression * expr);
void checkValidLookupFlag(IHqlExpression * dataset, IHqlExpression * filename, attribute & atr);

void setPluggableModeExpr(attribute & targetAttr, attribute & pluginAttr, attribute & options);

void createAppendDictionaries(attribute & targetAttr, attribute & leftAttr, attribute & rightAttr, IAtom * kind);
void createAppendFiles(attribute & targetAttr, attribute & leftAttr, attribute & rightAttr, IAtom * kind);
IHqlExpression * createAppendFiles(attribute & filesAttr, IHqlExpression * _attrs);
Expand Down
11 changes: 10 additions & 1 deletion ecl/hql/hqlgram.y
Original file line number Diff line number Diff line change
Expand Up @@ -3566,6 +3566,11 @@ outputFlag
$3.unwindCommaList(args);
$$.setExpr(createExprAttribute(jsonAtom, args), $1);
}
| TYPE '(' UNKNOWN_ID attribs ')'
{
parser->setPluggableModeExpr($$, $3, $4);
$$.setPosition($1);
}
| UPDATE {
$$.setExpr(createComma(createAttribute(updateAtom), createAttribute(overwriteAtom)), $1);
}
Expand Down Expand Up @@ -10494,7 +10499,7 @@ mode
: FLAT { $$.setExpr(createValue(no_flat, makeNullType())); }
| CSV { $$.setExpr(createValue(no_csv, makeNullType())); }
| CSV '(' csvOptions ')'
{
{
HqlExprArray args;
$3.unwindCommaList(args);
$$.setExpr(createValue(no_csv, makeNullType(), args));
Expand Down Expand Up @@ -10542,6 +10547,10 @@ mode
$$.setExpr(createValue(no_json, makeNullType(), args));
}
| pipe
| TYPE '(' UNKNOWN_ID attribs ')'
{
parser->setPluggableModeExpr($$, $3, $4);
}
;

dsOption
Expand Down
30 changes: 30 additions & 0 deletions ecl/hql/hqlgram2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8992,6 +8992,36 @@ bool HqlGram::convertAllToAttribute(attribute &atr)
return true;
}

void HqlGram::setPluggableModeExpr(attribute & targetAttr, attribute & pluginAttr, attribute & options)
{
const char * fileFormatStr = str(pluginAttr.getId());
bool isSupportedFileType = false;

// TODO: Hardcoded tests for known filetype plugins; should be
// an iteration through a list of available types, checking for names
isSupportedFileType = strisame(fileFormatStr, "parquet");

if (isSupportedFileType)
{
// Do we just cite the name of the plugin, or perhaps something
// like a unique ID from the plugin's factory (assuming there is one)?
OwnedHqlExpr fileTypeExp = createExprAttribute(fileTypeAtom, createConstant(fileFormatStr));

// Create an option list with the filetype first (easy to find while debugging)
HqlExprArray args;
args.append(*LINK(fileTypeExp));
options.unwindCommaList(args);
targetAttr.setExpr(createValue(no_filetype, makeNullType(), args));
}
else
{
// ERR_UNKNOWN_TYPE is generic; create file type-specific error?
reportError(ERR_UNKNOWN_TYPE, pluginAttr, "Unknown plugin file type %s", fileFormatStr);
options.release();
// Set the target to something...
targetAttr.setExpr(createValue(no_thor, makeNullType()));
}
}

void HqlGram::checkValidRecordMode(IHqlExpression * dataset, attribute & atr, attribute & modeattr)
{
Expand Down
2 changes: 1 addition & 1 deletion ecl/hql/hqlir.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,6 @@ const char * getOperatorIRText(node_operator op)
EXPAND_CASE(no,unlikely);
EXPAND_CASE(no,inline);
EXPAND_CASE(no,nwaydistribute);
EXPAND_CASE(no,unused34);
EXPAND_CASE(no,unused35);
EXPAND_CASE(no,unused36);
EXPAND_CASE(no,unused37);
Expand Down Expand Up @@ -662,6 +661,7 @@ const char * getOperatorIRText(node_operator op)
EXPAND_CASE(no,getenv);
EXPAND_CASE(no,json);
EXPAND_CASE(no,matched_injoin);
EXPAND_CASE(no,filetype);
}

return "<unknown>";
Expand Down
1 change: 1 addition & 0 deletions ecl/hql/hqltrans.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2687,6 +2687,7 @@ bool onlyTransformOnce(IHqlExpression * expr)
case no_csv:
case no_xml:
case no_json:
case no_filetype:
case no_list:
return (expr->numChildren() == 0);
case no_select:
Expand Down
46 changes: 32 additions & 14 deletions ecl/hqlcpp/hqlsource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -632,7 +632,7 @@ static bool forceLegacyMapping(IHqlExpression * expr)
class SourceBuilder
{
public:
SourceBuilder(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr, bool canReadGenerically)
SourceBuilder(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr, bool canReadGenerically, bool forceReadGenerically)
: tableExpr(_tableExpr), newInputMapping(false), translator(_translator)
{
nameExpr.setown(foldHqlExpression(_nameExpr));
Expand Down Expand Up @@ -661,7 +661,7 @@ class SourceBuilder
isUnfilteredCount = false;
requiresOrderedMerge = false;
genericDiskReads = translator.queryOptions().genericDiskReads;
genericDiskRead = genericDiskReads && canReadGenerically;
genericDiskRead = (genericDiskReads && canReadGenerically) || forceReadGenerically;
rootSelfRow = NULL;
activityKind = TAKnone;

Expand Down Expand Up @@ -2847,8 +2847,8 @@ void SourceBuilder::gatherSteppingMeta(IHqlExpression * expr, SourceSteppingInfo
class DiskReadBuilderBase : public SourceBuilder
{
public:
DiskReadBuilderBase(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr, bool canReadGenerically)
: SourceBuilder(_translator, _tableExpr, _nameExpr, canReadGenerically), monitors(_tableExpr, _translator, 0, true, true)
DiskReadBuilderBase(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr, bool canReadGenerically, bool forceReadGenerically)
: SourceBuilder(_translator, _tableExpr, _nameExpr, canReadGenerically, forceReadGenerically), monitors(_tableExpr, _translator, 0, true, true)
{
fpos.setown(getFilepos(tableExpr, false));
lfpos.setown(getFilepos(tableExpr, true));
Expand Down Expand Up @@ -2912,7 +2912,17 @@ void DiskReadBuilderBase::buildMembers(IHqlExpression * expr)
if ((modeOp != no_thor) && (modeOp != no_flat))
{
StringBuffer format;
format.append(getOpString(modeOp)).toLowerCase();
if (modeOp != no_filetype)
{
format.append(getOpString(modeOp)).toLowerCase();
}
else
{
// Pluggable file type; cite the file type name
IHqlExpression * fileType = queryAttributeChild(mode, fileTypeAtom, 0);
getStringValue(format, fileType);
format.toLowerCase();
}
instance->startctx.addQuotedF("virtual const char * queryFormat() { return \"%s\"; }", format.str());
}
}
Expand Down Expand Up @@ -3096,9 +3106,10 @@ class DiskReadBuilder : public DiskReadBuilderBase
{
public:
DiskReadBuilder(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr)
: DiskReadBuilderBase(_translator, _tableExpr, _nameExpr, (_tableExpr->queryChild(2)->getOperator() != no_pipe))
: DiskReadBuilderBase(_translator, _tableExpr, _nameExpr, (_tableExpr->queryChild(2)->getOperator() != no_pipe), (_tableExpr->queryChild(2)->getOperator() == no_filetype))
{
extractCanMatch = (modeOp == no_thor) || (modeOp == no_flat) ||
(modeOp == no_filetype) ||
((modeOp == no_csv) && genericDiskRead);
}

Expand Down Expand Up @@ -3232,10 +3243,15 @@ void DiskReadBuilder::buildFormatOption(BuildCtx & ctx, IHqlExpression * name, I

void DiskReadBuilder::buildFormatOptions(BuildCtx & fixedCtx, BuildCtx & dynCtx, IHqlExpression * expr)
{
IHqlExpression * pluggableFileTypeAtom = expr->queryAttribute(fileTypeAtom); // null if pluggable file type not used

ForEachChild(i, expr)
{
IHqlExpression * cur = expr->queryChild(i);
if (cur->isAttribute())

// Skip if expression is a pluggable file type (we don't want it appearing as an option)
// or if it is not an attribute
if (cur != pluggableFileTypeAtom && cur->isAttribute())
{
OwnedHqlExpr name = createConstant(str(cur->queryName()));
if (cur->numChildren())
Expand Down Expand Up @@ -3398,7 +3414,7 @@ class DiskNormalizeBuilder : public DiskReadBuilderBase
{
public:
DiskNormalizeBuilder(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr)
: DiskReadBuilderBase(_translator, _tableExpr, _nameExpr, false)
: DiskReadBuilderBase(_translator, _tableExpr, _nameExpr, false, false)
{
}

Expand Down Expand Up @@ -3470,7 +3486,7 @@ class DiskAggregateBuilder : public DiskReadBuilderBase
{
public:
DiskAggregateBuilder(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr)
: DiskReadBuilderBase(_translator, _tableExpr, _nameExpr, false)
: DiskReadBuilderBase(_translator, _tableExpr, _nameExpr, false, false)
{
failedFilterValue.clear();
}
Expand Down Expand Up @@ -3538,7 +3554,7 @@ class DiskCountBuilder : public DiskReadBuilderBase
{
public:
DiskCountBuilder(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr, node_operator _aggOp)
: DiskReadBuilderBase(_translator, _tableExpr, _nameExpr, false)
: DiskReadBuilderBase(_translator, _tableExpr, _nameExpr, false, false)
{
aggOp = _aggOp;
isCompoundCount = true;
Expand Down Expand Up @@ -3637,7 +3653,7 @@ class DiskGroupAggregateBuilder : public DiskReadBuilderBase
{
public:
DiskGroupAggregateBuilder(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr)
: DiskReadBuilderBase(_translator, _tableExpr, _nameExpr, false)
: DiskReadBuilderBase(_translator, _tableExpr, _nameExpr, false, false)
{
failedFilterValue.clear();
}
Expand Down Expand Up @@ -3708,7 +3724,7 @@ class ChildBuilderBase : public SourceBuilder
{
public:
ChildBuilderBase(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr)
: SourceBuilder(_translator, _tableExpr, _nameExpr, false)
: SourceBuilder(_translator, _tableExpr, _nameExpr, false, false)
{
}

Expand Down Expand Up @@ -3983,7 +3999,7 @@ class IndexReadBuilderBase : public SourceBuilder
friend class MonitorRemovalTransformer;
public:
IndexReadBuilderBase(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr)
: SourceBuilder(_translator, _tableExpr, _nameExpr, false),
: SourceBuilder(_translator, _tableExpr, _nameExpr, false, false),
monitors(_tableExpr, _translator, -(int)numPayloadFields(_tableExpr), false, getHintBool(_tableExpr, createValueSetsAtom, _translator.queryOptions().createValueSets))
{
}
Expand Down Expand Up @@ -4874,6 +4890,7 @@ ABoundActivity * HqlCppTranslator::doBuildActivityTable(BuildCtx & ctx, IHqlExpr
case no_flat:
case no_pipe:
case no_csv:
case no_filetype:
return doBuildActivityDiskRead(ctx, expr);
case no_xml:
case no_json:
Expand All @@ -4889,7 +4906,7 @@ class FetchBuilder : public SourceBuilder
{
public:
FetchBuilder(HqlCppTranslator & _translator, IHqlExpression *_tableExpr, IHqlExpression *_nameExpr, IHqlExpression * _fetchExpr)
: SourceBuilder(_translator, _tableExpr, _nameExpr, false)
: SourceBuilder(_translator, _tableExpr, _nameExpr, false, false)
{
compoundExpr.set(_fetchExpr);
fetchExpr.set(queryFetch(_fetchExpr));
Expand Down Expand Up @@ -4959,6 +4976,7 @@ void FetchBuilder::buildMembers(IHqlExpression * expr)
}
case no_xml:
case no_json:
case no_filetype:
break;
default:
translator.buildFormatCrcFunction(instance->classctx, "getDiskFormatCrc", physicalRecord);
Expand Down
43 changes: 43 additions & 0 deletions ecl/regress/filetypeplugin.ecl
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*##############################################################################

HPCC SYSTEMS software Copyright (C) 2024 HPCC Systems®.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
############################################################################## */

// If option is FALSE or omitted, generic disk reads are off by default but
// the use of DATASET(,TYPE()) should enable generic disk reads for that
// one activity
// #OPTION('genericDiskReads', TRUE);

NamesLayout := RECORD
STRING20 surname;
STRING10 forename;
INTEGER2 age := 25;
END;

//------------------------------------------------------------------------------

namesTableParquet_1 := DATASET(DYNAMIC('~parquet_in::no_options'), NamesLayout, TYPE(PARQUET), OPT);
// OUTPUT(namesTableParquet_1, {namesTableParquet_1}, '~parquet_out::no_options', TYPE(PARQUET), OVERWRITE);
OUTPUT(namesTableParquet_1);

namesTableParquet_2 := DATASET(DYNAMIC('~parquet_in::with_options'), NamesLayout, TYPE(PARQUET : RANDOMFLAG, FOO(TRUE), BAR('BAZ')), OPT);
// OUTPUT(namesTableParquet_2, {namesTableParquet_2}, '~parquet_out::with_options', TYPE(PARQUET : BACKWARDS(TRUE)), OVERWRITE);
OUTPUT(namesTableParquet_2);

// Following is present just to see what the IR output looks like
testCSV_1 := DATASET(DYNAMIC('~csv_in::no_options'), NamesLayout, CSV, OPT);
OUTPUT(testCSV_1);
testCSV_2 := DATASET(DYNAMIC('~csv_in::with_options'), NamesLayout, CSV(HEADING(1)), OPT);
OUTPUT(testCSV_2);

0 comments on commit b0276d1

Please sign in to comment.