diff --git a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/DistributionOutlierPipeline.java b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/DistributionOutlierPipeline.java
index 79971ab91e..331c57975f 100644
--- a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/DistributionOutlierPipeline.java
+++ b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/DistributionOutlierPipeline.java
@@ -22,13 +22,8 @@
*
*
distanceOutOfELD: 0 -> inside of EDL, -1: -> No EDLs. >0 -> out of EDL
*
- *
* --datasetId=0057a720-17c9-4658-971e-9578f3577cf5 * --attempt=1 * --runner=SparkRunner *
- * --targetPath=/some/path/to/output/ *
- * --inputPath=/some/path/to/output/0057a720-17c9-4658-971e-9578f3577cf5/1/verbatim.avro
- *
- *
java -jar /data/pipelines-data/avro-tools-1.11.0.jar tojson distribution-00000-of-00005.avro
- * >5.json java -jar /data/pipelines-data/avro-tools-1.11.0.jar fromjson --schema-file schema.avsc
- * ./verbatim.json > verbatim.avro
+ *
Example: java au.org.ala.pipelines.beam.DistributionOutlierPipeline
+ * --config=/data/la-pipelines/config/la-pipelines.yaml --fsPath=/data
*/
@Slf4j
@NoArgsConstructor(access = AccessLevel.PRIVATE)
diff --git a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/DistributionOutlierTransform.java b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/DistributionOutlierTransform.java
index aa9f0c1e2f..5a8b9a5e00 100644
--- a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/DistributionOutlierTransform.java
+++ b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/DistributionOutlierTransform.java
@@ -104,6 +104,7 @@ public Iterable apply(
throw new RuntimeException(
"Expert distribution service throws a runtime error, please check logs");
} catch (Exception e) {
+ log.error(e.getMessage());
throw new RuntimeException("Runtime error: " + e.getMessage());
}
@@ -123,7 +124,6 @@ public MapElements flatToString() {
}
/**
- * Only can be used when EDL exists - which means the record can only in/out EDL Force to reset
* distanceOutOfEDL 0: inside edl, -1: no edl
*
* @param record
diff --git a/livingatlas/scripts/la-pipelines b/livingatlas/scripts/la-pipelines
index 7ddcdc56f3..ffb7e6f72e 100755
--- a/livingatlas/scripts/la-pipelines
+++ b/livingatlas/scripts/la-pipelines
@@ -84,6 +84,7 @@ dwca-avro --> interpret --> validate --> uuid --> sds --> image-load ... │
- 'image-load' pushes an export of multimedia AVRO files to the image service;
- 'image-sync' retrieves details of images stored in image service for indexing purposes;
- 'index' generates a AVRO records ready to be index to SOLR;
+- 'outlier' generates a distance to expert distribution layer;
- 'sample' use the sampling service to retrieve values for points for the layers in the spatial service;
- 'jackknife' adds an aggregated jackknife AVRO for all datasets. Requires samping-avro. After running a full 'index' is required.
- 'solr' reads index records in AVRO and submits them to SOLR;
@@ -110,6 +111,7 @@ Usage:
$CMD [options] sample (...|all) $WHERE
$CMD [options] clustering (all) $WHERE
$CMD [options] jackknife (all) $WHERE
+ $CMD [options] outlier (all) $WHERE
$CMD [options] solr (...|all) $WHERE
$CMD [options] solr-schema (...|all) $WHERE
$CMD [options] archive-list
@@ -679,6 +681,25 @@ function clustering () {
logStepEnd "Clustering" $dr $ltype $SECONDS
}
+function outlier () {
+ local dr="*" ltype="$TYPE"
+ CLASS=au.org.ala.pipelines.beam.DistributionOutlierPipeline
+
+ logStepStart "Outlier" $dr
+ SECONDS=0
+
+ SH_ARGS=outlier-sh-args.local
+ java-pipeline $ltype $SH_ARGS $CLASS $dr
+
+ SH_ARGS=outlier-sh-args.spark-embedded
+ spark-embed-pipeline $ltype $SH_ARGS $CLASS $dr
+
+ SH_ARGS=outlier-sh-args.spark-cluster
+ spark-cluster-pipeline $ltype $SH_ARGS $CLASS $dr "add-jackknife $dr"
+
+ logStepEnd "Outlier" $dr $ltype $SECONDS
+}
+
function solr () {
local dr="$1" ltype="$2"
CLASS=au.org.ala.pipelines.beam.IndexRecordToSolrPipeline