Merge pull request #12 from nKandel/master

Tensorflow 2 support added with custom node feature Reviewed-By: Roger Dev <[email protected]> Merged-by: Gavin Halliday <[email protected]>
hpcc-systems · Sep 15, 2023 · 41299e5 · 41299e5
2 parents 7a48af7 + 530ff40
commit 41299e5
Show file tree

Hide file tree

Showing 90 changed files with 9,251 additions and 262 deletions.
diff --git a/GNNI.ecl b/GNNI.ecl
diff --git a/Internal/Keras.ecl b/Internal/Keras.ecl
diff --git a/Internal/TensExtract.ecl b/Internal/TensExtract.ecl
@@ -2,7 +2,7 @@ IMPORT PYTHON3 as PYTHON;
 IMPORT $.^ AS GNN;
 IMPORT GNN.Tensor;
 IMPORT Std.System.Thorlib;
-
+IMPORT GNN.Utils;
 nodeId := Thorlib.node();
 nNodes := Thorlib.nodes();
 
@@ -21,7 +21,8 @@ MAX_SLICE := POWER(2, 24);
   * @see Tensor.AlignTensors
   */
 EXPORT DATASET(t_Tensor) TensExtract(DATASET(t_Tensor) tens, UNSIGNED pos,
-                                    UNSIGNED datcount) := FUNCTION
+                                    UNSIGNED datcount, INTEGER limitNodes=0) := FUNCTION
+
   // Python embed function to do most of the heavy lifting.
   STREAMED DATASET(t_Tensor) extract(STREAMED DATASET(t_Tensor) tens,
             UNSIGNED pos, UNSIGNED datcount, nodeid, nNodes, maxslice) := EMBED(Python: activity)
@@ -179,5 +180,14 @@ EXPORT DATASET(t_Tensor) TensExtract(DATASET(t_Tensor) tens, UNSIGNED pos,
       # END OF getResults()
     return getResults()
   ENDEMBED; // Extract
-  RETURN SORT(extract(tens, pos-1, datcount, nodeId, nNodes, MAX_SLICE), wi, sliceId, LOCAL);
-END;
+
+  effNodes := Utils.getEffNodesNumber(limitNodes);
+
+  extractedData0 := extract(tens, pos-1, datcount, nodeId, nNodes, MAX_SLICE);
+  extractedDataD := DISTRIBUTE(extractedData0, nodeId % effNodes); // ROUNDUP(Thorlib.nodes() / effNodes)
+
+  extractDataD1 := Project(NOCOMBINE(extractedDataD), TRANSFORM(RECORDOF(LEFT), SELF.nodeId:=nodeId, SELF:=LEFT));
+  extractedData := IF(limitNodes=0, extractedData0, extractDataD1);
+
+  RETURN SORT(extractedData, wi, sliceId, LOCAL);
+END;
diff --git a/OBTTests/ecl/ClassicTestModified.ecl b/OBTTests/ecl/ClassicTestModified.ecl
@@ -96,9 +96,13 @@ Test := PROJECT(TestSet, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFunc(LEFT.x[1
 TrainInd := NORMALIZE(Train, featureCount, TRANSFORM(TensData,
                             SELF.indexes := [LEFT.id, COUNTER],
                             SELF.value := LEFT.x[COUNTER]));
+
+OUTPUT(TrainInd, NAMED('TrainInd'));                            
 TrainDep := NORMALIZE(Train, 1, TRANSFORM(TensData,
                             SELF.indexes := [LEFT.id, COUNTER],
                             SELF.value := LEFT.y));
+OUTPUT(TrainDep, NAMED('TrainDep'));                            
+
 
 // Form a Tensor from the tensor data.  This packs the data into 'slices' that can contain dense
 // or sparse portions of the Tensor.  If the tensor is small, it will fit into a single slice.
@@ -108,6 +112,11 @@ TrainDep := NORMALIZE(Train, 1, TRANSFORM(TensData,
 TrainIndTensor:= Tensor.R4.MakeTensor([0, featureCount], TrainInd);
 TrainDepTensor := Tensor.R4.MakeTensor([0, 1], TrainDep);
 
+OUTPUT(TrainIndTensor, NAMED('TrainIndTensor'));                            
+OUTPUT(TrainDepTensor, NAMED('TrainDepTensor'));                            
+
+
+
 TestInd := NORMALIZE(test, featureCount, TRANSFORM(TensData,
                             SELF.indexes := [LEFT.id, COUNTER],
                             SELF.value := LEFT.x[COUNTER]));

diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ using Keras.  This includes Classical (Dense) Neural Networks as
 well as Convolutional and Recursive Networks (such as LSTM), or any combination
 of the above. 
 
-GNN currently supports both Tensorflow 1.x and Tensorflow 2.x versions. It also supports the use of
+GNN currently supports Tensorflow 2.x versions. It also supports the use of
 GPUs in conjunction with Tensorflow, with certain
 restrictions in the supported topology.  Specifically:
 - All servers in a cluster must have the same GPU configuration
@@ -50,8 +50,11 @@ The folder Test/HARTests
 contains tests that show how to create more sophisticated Convolutional and
 Recurrent networks.
 
+The folder Test/PretrainedModelTest
+contains tests that show how to use the pre-trained models.
+
 ## OTHER DOCUMENTATION
 Programmer Documentation is available at:
 [HPCC Machine Learning Library](http://hpccsystems.com/download/free-modules/machine-learning-library)
 A tutorial on installing and running GNN is available at:
-[Generalized Neural Network Blog](http://hpccsystems.com/blog/gnn-bundle)
+[Generalized Neural Network Blog](http://hpccsystems.com/blog/gnn-bundle)
diff --git a/Test/ClassicTest.ecl b/Test/ClassicTest.ecl
@@ -58,7 +58,8 @@ train0 := DATASET(trainCount, TRANSFORM(trainRec,
                       );
 // Be sure to compute Y in a second step.  Otherwise, the RANDOM() will be executed twice and the Y will be based
 // on different values than those assigned to X.  This is an ECL quirk that is not easy to fix.
-train := PROJECT(train0, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFunc(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+train := PROJECT(train0, TRANSFORM(
+  RECORDOF(LEFT), SELF.y := targetFunc(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
 OUTPUT(train, NAMED('trainData'));
 
 // Build the test data.  Same process as the training data.
@@ -72,7 +73,8 @@ test0 := DATASET(testCount, TRANSFORM(trainRec,
                       SELF.y := 0)
                       );
 
-test := PROJECT(test0, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFunc(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+test := PROJECT(test0, TRANSFORM(
+  RECORDOF(LEFT), SELF.y := targetFunc(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
 
 // Break the training and test data into X (independent) and Y (dependent) data sets.  Format as Tensor Data.
 trainX0 := NORMALIZE(train, featureCount, TRANSFORM(TensData,

diff --git a/Test/ClassificationTest.ecl b/Test/ClassificationTest.ecl
@@ -76,7 +76,8 @@ train0 := DATASET(trainCount, TRANSFORM(trainRec,
                       );
 // Be sure to compute Y in a second step.  Otherwise, the RANDOM() will be executed twice and the Y will be based
 // on different values than those assigned to X.  This is an ECL quirk that is not easy to fix.
-train := PROJECT(train0, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFunc(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+train := PROJECT(train0, TRANSFORM(
+  RECORDOF(LEFT), SELF.y := targetFunc(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
 OUTPUT(train, NAMED('trainData'));
 
 // Build the test data.  Same process as the training data.
@@ -90,7 +91,8 @@ test0 := DATASET(testCount, TRANSFORM(trainRec,
                       SELF.y := [])
                       );
 
-test := PROJECT(test0, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFunc(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+test := PROJECT(test0, TRANSFORM(
+  RECORDOF(LEFT), SELF.y := targetFunc(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
 
 // Break the training and test data into X (independent) and Y (dependent) data sets.
 // Format as NumericField data.
@@ -179,4 +181,6 @@ OUTPUT(metrics, NAMED('metrics'));
 preds := GNNI.PredictNF(mod2, testX);
 
 OUTPUT(testY, ALL, NAMED('testDat'));
-OUTPUT(preds, NAMED('predictions'));
+OUTPUT(preds, NAMED('predictions'));
+
+OUTPUT(IF(metrics[2].value>0.95, 'Pass', 'Fail'), NAMED('Accuracy'));
diff --git a/Test/ExtractTest.ecl b/Test/ExtractTest.ecl
@@ -4,7 +4,7 @@
 /**
   * Test the TensorExtract module
   */
-IMPORT Python;
+IMPORT Python3 as Python;
 IMPORT $.^ AS GNN;
 IMPORT GNN.Tensor;
 IMPORT GNN.Internal.Types AS iTypes;

diff --git a/Test/FuncModelTest.ecl b/Test/FuncModelTest.ecl
@@ -57,7 +57,8 @@ train0R := DATASET(trainCount, TRANSFORM(trainRecR,
                       );
 // Be sure to compute Y in a second step.  Otherwise, the RANDOM() will be executed twice and the Y will be based
 // on different values than those assigned to X.  This is an ECL quirk that is not easy to fix.
-trainR := PROJECT(train0R, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFuncR(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+trainR := PROJECT(train0R, TRANSFORM(
+  RECORDOF(LEFT), SELF.y := targetFuncR(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
 OUTPUT(trainR, NAMED('trainDataR'));
 
 // Build the test data.  Same process as the training data.
@@ -71,7 +72,8 @@ test0R := DATASET(testCount, TRANSFORM(trainRecR,
                       SELF.y := 0)
                       );
 
-testR := PROJECT(test0R, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFuncR(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+testR := PROJECT(test0R, TRANSFORM(
+  RECORDOF(LEFT), SELF.y := targetFuncR(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
 
 // Break the training and test data into X (independent) and Y (dependent) data sets.  Format as Tensor Data.
 trainX0R := NORMALIZE(trainR, featureCount, TRANSFORM(TensData,
@@ -132,7 +134,8 @@ train0C := DATASET(trainCount, TRANSFORM(trainRecC,
                       );
 // Be sure to compute Y in a second step.  Otherwise, the RANDOM() will be executed twice and the Y will be based
 // on different values than those assigned to X.  This is an ECL quirk that is not easy to fix.
-trainC := PROJECT(train0C, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFuncC(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+trainC := PROJECT(train0C, TRANSFORM(
+  RECORDOF(LEFT), SELF.y := targetFuncC(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
 OUTPUT(trainC, NAMED('trainDataC'));
 
 // Build the test data.  Same process as the training data.
@@ -146,7 +149,8 @@ test0C := DATASET(testCount, TRANSFORM(trainRecC,
                       SELF.y := [])
                       );
 
-testC := PROJECT(test0C, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFuncC(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+testC := PROJECT(test0C, TRANSFORM(
+  RECORDOF(LEFT), SELF.y := targetFuncC(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
 
 // Break the training and test data into X (independent) and Y (dependent) data sets.
 // Format as NumericField data.

diff --git a/Test/MultiModel.ecl b/Test/MultiModel.ecl
@@ -61,7 +61,9 @@ trainR0 := DATASET(trainCount, TRANSFORM(trainRecR,
                       );
 // Be sure to compute Y in a second step.  Otherwise, the RANDOM() will be executed twice and the Y will be based
 // on different values than those assigned to X.  This is an ECL quirk that is not easy to fix.
-trainR := PROJECT(trainR0, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFuncR(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+trainR := PROJECT(trainR0, TRANSFORM(RECORDOF(LEFT), 
+    SELF.y := targetFuncR(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), 
+    SELF := LEFT));
 OUTPUT(trainR, NAMED('trainDataR'));
 
 // Build the test data.  Same process as the training data.
@@ -75,7 +77,9 @@ testR0 := DATASET(testCount, TRANSFORM(trainRecR,
                       SELF.y := 0)
                       );
 
-testR := PROJECT(testR0, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFuncR(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+testR := PROJECT(testR0, TRANSFORM(RECORDOF(LEFT), 
+    SELF.y := targetFuncR(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), 
+    SELF := LEFT));
 
 // Break the training and test data into X (independent) and Y (dependent) data sets.  Format as Tensor Data.
 trainRX0 := NORMALIZE(trainR, featureCount, TRANSFORM(TensData,
@@ -210,7 +214,9 @@ trainC0 := DATASET(trainCount, TRANSFORM(trainRecC,
                       );
 // Be sure to compute Y in a second step.  Otherwise, the RANDOM() will be executed twice and the Y will be based
 // on different values than those assigned to X.  This is an ECL quirk that is not easy to fix.
-trainC := PROJECT(trainC0, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFuncC(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+trainC := PROJECT(trainC0, TRANSFORM(RECORDOF(LEFT), 
+    SELF.y := targetFuncC(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), 
+    SELF := LEFT));
 OUTPUT(trainC, NAMED('trainData'));
 
 // Build the test data.  Same process as the training data.
@@ -224,7 +230,9 @@ testC0 := DATASET(testCount, TRANSFORM(trainRecC,
                       SELF.y := [])
                       );
 
-testC := PROJECT(testC0, TRANSFORM(RECORDOF(LEFT), SELF.y := targetFuncC(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), SELF := LEFT));
+testC := PROJECT(testC0, TRANSFORM(RECORDOF(LEFT), 
+    SELF.y := targetFuncC(LEFT.x[1], LEFT.x[2], LEFT.x[3], LEFT.x[4], LEFT.x[5]), 
+    SELF := LEFT));
 
 // Break the training and test data into X (independent) and Y (dependent) data sets.
 // Format as NumericField data.

diff --git a/Test/PretrainedModelTest/ConvNeXtBase.ecl b/Test/PretrainedModelTest/ConvNeXtBase.ecl
@@ -0,0 +1,110 @@
+/*##############################################################################
+## HPCC SYSTEMS software Copyright (C) 2023 HPCC Systems.  All rights reserved.
+############################################################################## */
+/*
+About this test:
+  Test the usability of Pre-trained Model ConvNeXtBase.
+  Reference: https://www.tensorflow.org/api_docs/python/tf/keras/applications/convnext/ConvNeXtBase
+  Input shape = (224, 224, 3) 
+  Note: The outputs of convnext.preprocess_input are integers
+
+Results:
+
+class                   probability
+tusker	                9.285942077636719
+African_elephant	      8.67857837677002
+Indian_elephant	        3.142804145812988
+*/
+
+IMPORT Python3 AS Python;
+IMPORT $.^ AS GNN;
+IMPORT GNN.GNNI;
+IMPORT GNN.Tensor;
+IMPORT GNN.Internal AS int;
+IMPORT GNN.Internal.Types AS iTypes;
+IMPORT Std.System.Thorlib;
+IMPORT STD;
+
+kString := iTypes.kString; // Short local aliases for definitions from GNN.Internal.Types ...
+kStrType := iTypes.kStrType;
+t_Tensor := Tensor.R4.t_Tensor; // ... and from GNN.Tensor (R4 variant).
+TensData := Tensor.R4.TensData;
+
+// Load the test data, an image of an elephant, from the logical file '~le::elephant'.
+imageRecord := RECORD
+  STRING filename;
+  DATA   image;   
+       // The first 4 bytes contain the length of the image data.
+  UNSIGNED8  RecPos{virtual(fileposition)};
+END;
+
+imageData := DATASET('~le::elephant',imageRecord,FLAT);
+OUTPUT(imageData, NAMED('elephant'));
+
+result := (STRING)(imageData[1].image); // NOTE(review): 'result' is not referenced anywhere else in this file -- confirm it is still needed.
+// Decode raw image bytes, force 3-channel RGB, resize to 224x224, apply ConvNeXt preprocessing and return the flattened pixel values.
+SET OF INTEGER hexToNparry(DATA byte_array):= EMBED(Python)
+  from PIL import Image
+  import numpy as np
+  import io
+  try:
+    import tensorflow as tf # V2.x
+  except ImportError:  # only a failed import means "not found"; any other error propagates unchanged
+    raise AssertionError('tensorflow not found')  # explicit raise: a bare assert is stripped under python -O
+  bytes_data = bytes(byte_array)
+  image = Image.open(io.BytesIO(bytes_data)).convert('RGB')  # grayscale/palette/RGBA input would not flatten to 224*224*3 values
+  image = image.resize((224,224))
+  I_array = np.array(image)
+  I_array = tf.keras.applications.convnext.preprocess_input(I_array)
+  return I_array.flatten().tolist()
+ENDEMBED;
+
+valueRec := RECORD // One flattened value returned by hexToNparry per record.
+  INTEGER value;
+END;
+
+idValueRec := RECORD // Same value plus its zero-based position in the flattened array.
+  UNSIGNED8 id;
+  INTEGER value;
+END;
+
+imageNpArray := hexToNparry(imageData[1].image); // Only the first record of imageData is used.
+x1 := DATASET(imageNpArray, valueRec);
+x2 := PROJECT(x1, TRANSFORM(idValueRec, SELF.id := COUNTER - 1, SELF.value := LEFT.value)); // COUNTER is 1-based, so id starts at 0.
+x3 := PROJECT(x2, TRANSFORM(TensData, // Turn the flat zero-based id into 1-based indexes [1, axis1, axis2, axis3] of a 224x224x3 array.
+    SELF.indexes := [1, TRUNCATE(LEFT.id/(224*3)) + 1, TRUNCATE(LEFT.id/3)%224 + 1, LEFT.id%3 + 1],
+    SELF.value := LEFT.value));
+x := Tensor.R4.MakeTensor([0,224,224,3], x3); // NOTE(review): the leading 0 presumably marks a variable-size first dimension -- confirm against the Tensor module.
+
+// Load the model: a one-entry definition that instantiates ConvNeXtBase with the "imagenet" weights.
+s := GNNI.GetSession(1);
+ldef := ['''applications.convnext.ConvNeXtBase(weights = "imagenet")'''];
+mod := GNNI.DefineModel(s, ldef);
+
+// Predict on the image tensor, then unpack the resulting tensor into TensData cells.
+preds_tens := GNNI.Predict(mod, x);
+preds := Tensor.R4.GetData(preds_tens);
+
+predictRes := RECORD // One decoded prediction, as returned by decodePredictions.
+  STRING class;
+  REAL4 probability;
+END;
+// Decode predictions: rebuild a (1, 1000) score array from the tensor cells and return up to topK (class, score) pairs.
+DATASET(predictRes) decodePredictions(DATASET(TensData) preds, INTEGER topK = 3) := EMBED(Python)
+  try:
+    from tensorflow.keras.applications.convnext import decode_predictions
+  except ImportError:  # only a failed import means "not found"; any other error propagates unchanged
+    raise AssertionError('tensorflow not found')  # explicit raise: a bare assert is stripped under python -O
+  import numpy as np
+  predsNp = np.zeros((1, 1000))
+  for pred in preds:
+    predsNp[0, pred[0][1]-1] = pred[1]  # pred[0] holds the cell indexes, pred[1] the value; the second index is 1-based
+  res = decode_predictions(predsNp, top=topK)[0]
+  ret = []
+  for entry in res:  # iterate what was actually returned, so a topK above the entry count cannot raise IndexError
+    ret.append((entry[1], entry[2]))
+  return ret
+ENDEMBED;
+
+OUTPUT(decodePredictions(preds), NAMED('predictions'));