From 76a5792ccad6fa010e806e9d6d8b4cc0f1b1459f Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Wed, 2 Mar 2022 15:41:27 +0000 Subject: [PATCH] fix for #680 and a fix for missing properties when indexed generate with expert distribution extra EMR defaults --- livingatlas/configs/la-pipelines-emr.yaml | 145 +++++++++++++++++- .../beam/IndexRecordToSolrPipeline.java | 18 +-- .../scripts/update-solr-cluster-config.sh | 4 +- 3 files changed, 147 insertions(+), 20 deletions(-) diff --git a/livingatlas/configs/la-pipelines-emr.yaml b/livingatlas/configs/la-pipelines-emr.yaml index 0127db13dc..4bdd6a8522 100644 --- a/livingatlas/configs/la-pipelines-emr.yaml +++ b/livingatlas/configs/la-pipelines-emr.yaml @@ -30,11 +30,11 @@ fs: local: fsPath: /data hdfs: - fsPath: hdfs:// + fsPath: hdfs:/// dwca-avro: - inputPath: /mnt/dwca-tmp/{datasetId} - tempLocation: /mnt/dwca-tmp/{datasetId} + inputPath: /data/biocache-load/{datasetId} + tempLocation: /data/biocache-load/{datasetId} targetPath: hdfs:///pipelines-data interpret: @@ -118,4 +118,141 @@ alaNameMatch: timeoutSec: 70 retryConfig: maxAttempts: 5 - initialIntervalMillis: 5000 \ No newline at end of file + initialIntervalMillis: 5000 + + +### la-pipelines cli additional arguments, like JVM or spark command line arguments +interpret-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*] + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*] + spark-cluster: + conf: spark.default.parallelism=48 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G + +image-sync-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*] + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*] + spark-cluster: + conf: spark.default.parallelism=48 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G + +image-load-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*] + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*] + spark-cluster: + conf: spark.default.parallelism=48 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G + +uuid-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC + spark-cluster: + conf: spark.default.parallelism=48 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G + +sampling-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*] + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*] + spark-cluster: + conf: spark.default.parallelism=48 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G + +outlier-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*] + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*] + spark-cluster: + conf: spark.default.parallelism=48 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G + +sensitive-sh-args: + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC + spark-cluster: + jvm: -Xmx8g -XX:+UseG1GC + conf: spark.default.parallelism=48 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G + +sample-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC + +index-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC + spark-cluster: + conf: spark.default.parallelism=48 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G + +jackknife-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC + spark-cluster: + conf: spark.default.parallelism=48 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G + +clustering-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC + spark-cluster: + conf: spark.default.parallelism=48 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G + +solr-sh-args: + local: + jvm: -Xmx8g -XX:+UseG1GC + spark-embedded: + jvm: -Xmx8g -XX:+UseG1GC + spark-cluster: + conf: spark.default.parallelism=500 + num-executors: 8 + executor-cores: 8 + executor-memory: 18G + driver-memory: 2G diff --git a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/IndexRecordToSolrPipeline.java b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/IndexRecordToSolrPipeline.java index 0cf0c6fa10..753d38231b 100644 --- a/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/IndexRecordToSolrPipeline.java +++ b/livingatlas/pipelines/src/main/java/au/org/ala/pipelines/beam/IndexRecordToSolrPipeline.java @@ -542,6 +542,7 @@ public void processElement(ProcessContext c) { .setInts(indexRecord.getInts()) .setStrings(stringsToPersist) .setDoubles(doublesToPersist) + .setDynamicProperties(indexRecord.getDynamicProperties()) .build(); c.output(KV.of(indexRecord.getId(), ir)); @@ -563,26 +564,15 @@ public void processElement(ProcessContext c) { String id = e.getKey(); DistributionOutlierRecord outlierRecord = e.getValue().getValue(); - IndexRecord indexRecord = e.getValue().getKey(); - IndexRecord ouputIR = - IndexRecord.newBuilder() - .setId(indexRecord.getId()) - .setTaxonID(indexRecord.getTaxonID()) - .setLatLng(indexRecord.getLatLng()) - .setMultiValues(indexRecord.getMultiValues()) - .setDates(indexRecord.getDates()) - .setLongs(indexRecord.getLongs()) - .setBooleans(indexRecord.getBooleans()) - .setInts(indexRecord.getInts()) - .build(); + if (outlierRecord != null) { - ouputIR + indexRecord .getDoubles() .put(DISTANCE_FROM_EXPERT_DISTRIBUTION, outlierRecord.getDistanceOutOfEDL()); } - c.output(KV.of(id, ouputIR)); + c.output(KV.of(id, indexRecord)); } }; } diff --git a/livingatlas/solr/scripts/update-solr-cluster-config.sh b/livingatlas/solr/scripts/update-solr-cluster-config.sh index 1afb61f821..c7818c0a14 100755 --- a/livingatlas/solr/scripts/update-solr-cluster-config.sh +++ b/livingatlas/solr/scripts/update-solr-cluster-config.sh @@ -9,10 +9,10 @@ zip config.zip * #curl -X GET "http://localhost:8987/solr/admin/collections?action=DELETE&name=biocache3" #echo 'Deleting existing configset' -curl -X GET "http://localhost:8988/solr/admin/configs?action=DELETE&name=biocache&omitHeader=true" +curl -X GET "http://localhost:8983/solr/admin/configs?action=DELETE&name=biocache&omitHeader=true" echo 'Creating configset' -curl -X POST --header "Content-Type:application/octet-stream" --data-binary @config.zip "http://localhost:8988/solr/admin/configs?action=UPLOAD&name=biocache" +curl -X POST --header "Content-Type:application/octet-stream" --data-binary @config.zip "http://localhost:8983/solr/admin/configs?action=UPLOAD&name=biocache_dev" #echo 'Creating collection' #curl -X GET "http://localhost:8986/solr/admin/collections?action=CREATE&name=biocache&numShards=8&maxShardsPerNode=1&replicationFactor=1&collection.configName=biocache"