diff --git a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/DistributionOutlierPipeline.java b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/DistributionOutlierPipeline.java index 79971ab91e..331c57975f 100644 --- a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/DistributionOutlierPipeline.java +++ b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/DistributionOutlierPipeline.java @@ -22,13 +22,8 @@ * *

distanceOutOfELD: 0 -> inside of EDL, -1: -> No EDLs. >0 -> out of EDL * - *

* --datasetId=0057a720-17c9-4658-971e-9578f3577cf5 * --attempt=1 * --runner=SparkRunner * - * --targetPath=/some/path/to/output/ * - * --inputPath=/some/path/to/output/0057a720-17c9-4658-971e-9578f3577cf5/1/verbatim.avro - * - *

java -jar /data/pipelines-data/avro-tools-1.11.0.jar tojson distribution-00000-of-00005.avro - * >5.json java -jar /data/pipelines-data/avro-tools-1.11.0.jar fromjson --schema-file schema.avsc - * ./verbatim.json > verbatim.avro + *

Example: java au.org.ala.pipelines.beam.DistributionOutlierPipeline + * --config=/data/la-pipelines/config/la-pipelines.yaml --fsPath=/data */ @Slf4j @NoArgsConstructor(access = AccessLevel.PRIVATE) diff --git a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/DistributionOutlierTransform.java b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/DistributionOutlierTransform.java index aa9f0c1e2f..5a8b9a5e00 100644 --- a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/DistributionOutlierTransform.java +++ b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/transforms/DistributionOutlierTransform.java @@ -104,6 +104,7 @@ public Iterable apply( throw new RuntimeException( "Expert distribution service throws a runtime error, please check logs"); } catch (Exception e) { + log.error(e.getMessage()); throw new RuntimeException("Runtime error: " + e.getMessage()); } @@ -123,7 +124,6 @@ public MapElements flatToString() { } /** - * Only can be used when EDL exists - which means the record can only in/out EDL Force to reset * distanceOutOfEDL 0: inside edl, -1: no edl * * @param record diff --git a/livingatlas/scripts/la-pipelines b/livingatlas/scripts/la-pipelines index 7ddcdc56f3..ffb7e6f72e 100755 --- a/livingatlas/scripts/la-pipelines +++ b/livingatlas/scripts/la-pipelines @@ -84,6 +84,7 @@ dwca-avro --> interpret --> validate --> uuid --> sds --> image-load ... │ - 'image-load' pushes an export of multimedia AVRO files to the image service; - 'image-sync' retrieves details of images stored in image service for indexing purposes; - 'index' generates a AVRO records ready to be index to SOLR; +- 'outlier' generates a distance to expert distribution layer; - 'sample' use the sampling service to retrieve values for points for the layers in the spatial service; - 'jackknife' adds an aggregated jackknife AVRO for all datasets. Requires samping-avro. After running a full 'index' is required. - 'solr' reads index records in AVRO and submits them to SOLR; @@ -110,6 +111,7 @@ Usage: $CMD [options] sample (...|all) $WHERE $CMD [options] clustering (all) $WHERE $CMD [options] jackknife (all) $WHERE + $CMD [options] outlier (all) $WHERE $CMD [options] solr (...|all) $WHERE $CMD [options] solr-schema (...|all) $WHERE $CMD [options] archive-list @@ -679,6 +681,25 @@ function clustering () { logStepEnd "Clustering" $dr $ltype $SECONDS } +function outlier () { + local dr="*" ltype="$TYPE" + CLASS=au.org.ala.pipelines.beam.DistributionOutlierPipeline + + logStepStart "Outlier" $dr + SECONDS=0 + + SH_ARGS=outlier-sh-args.local + java-pipeline $ltype $SH_ARGS $CLASS $dr + + SH_ARGS=outlier-sh-args.spark-embedded + spark-embed-pipeline $ltype $SH_ARGS $CLASS $dr + + SH_ARGS=outlier-sh-args.spark-cluster + spark-cluster-pipeline $ltype $SH_ARGS $CLASS $dr "add-jackknife $dr" + + logStepEnd "Outlier" $dr $ltype $SECONDS +} + function solr () { local dr="$1" ltype="$2" CLASS=au.org.ala.pipelines.beam.IndexRecordToSolrPipeline