Skip to content

Commit

Permalink
fix for #680 and a fix for missing properties when indexed generate w…
Browse files Browse the repository at this point in the history
…ith expert distribution

extra EMR defaults
  • Loading branch information
djtfmartin committed Mar 2, 2022
1 parent 6678116 commit 76a5792
Show file tree
Hide file tree
Showing 3 changed files with 147 additions and 20 deletions.
145 changes: 141 additions & 4 deletions livingatlas/configs/la-pipelines-emr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ fs:
local:
fsPath: /data
hdfs:
fsPath: hdfs://
fsPath: hdfs:///

dwca-avro:
inputPath: /mnt/dwca-tmp/{datasetId}
tempLocation: /mnt/dwca-tmp/{datasetId}
inputPath: /data/biocache-load/{datasetId}
tempLocation: /data/biocache-load/{datasetId}
targetPath: hdfs:///pipelines-data

interpret:
Expand Down Expand Up @@ -118,4 +118,141 @@ alaNameMatch:
timeoutSec: 70
retryConfig:
maxAttempts: 5
initialIntervalMillis: 5000
initialIntervalMillis: 5000


### Additional la-pipelines CLI arguments, such as JVM options or Spark command-line arguments
interpret-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*]
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*]
spark-cluster:
conf: spark.default.parallelism=48
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G

image-sync-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*]
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*]
spark-cluster:
conf: spark.default.parallelism=48
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G

image-load-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*]
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*]
spark-cluster:
conf: spark.default.parallelism=48
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G

uuid-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC
spark-cluster:
conf: spark.default.parallelism=48
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G

sampling-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*]
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*]
spark-cluster:
conf: spark.default.parallelism=48
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G

outlier-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*]
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC -Dspark.master=local[*]
spark-cluster:
conf: spark.default.parallelism=48
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G

sensitive-sh-args:
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC
spark-cluster:
jvm: -Xmx8g -XX:+UseG1GC
conf: spark.default.parallelism=48
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G

sample-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC

index-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC
spark-cluster:
conf: spark.default.parallelism=48
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G

jackknife-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC
spark-cluster:
conf: spark.default.parallelism=48
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G

clustering-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC
spark-cluster:
conf: spark.default.parallelism=48
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G

solr-sh-args:
local:
jvm: -Xmx8g -XX:+UseG1GC
spark-embedded:
jvm: -Xmx8g -XX:+UseG1GC
spark-cluster:
conf: spark.default.parallelism=500
num-executors: 8
executor-cores: 8
executor-memory: 18G
driver-memory: 2G
Original file line number Diff line number Diff line change
Expand Up @@ -542,6 +542,7 @@ public void processElement(ProcessContext c) {
.setInts(indexRecord.getInts())
.setStrings(stringsToPersist)
.setDoubles(doublesToPersist)
.setDynamicProperties(indexRecord.getDynamicProperties())
.build();

c.output(KV.of(indexRecord.getId(), ir));
Expand All @@ -563,26 +564,15 @@ public void processElement(ProcessContext c) {
String id = e.getKey();

DistributionOutlierRecord outlierRecord = e.getValue().getValue();

IndexRecord indexRecord = e.getValue().getKey();
IndexRecord ouputIR =
IndexRecord.newBuilder()
.setId(indexRecord.getId())
.setTaxonID(indexRecord.getTaxonID())
.setLatLng(indexRecord.getLatLng())
.setMultiValues(indexRecord.getMultiValues())
.setDates(indexRecord.getDates())
.setLongs(indexRecord.getLongs())
.setBooleans(indexRecord.getBooleans())
.setInts(indexRecord.getInts())
.build();

if (outlierRecord != null) {
ouputIR
indexRecord
.getDoubles()
.put(DISTANCE_FROM_EXPERT_DISTRIBUTION, outlierRecord.getDistanceOutOfEDL());
}

c.output(KV.of(id, ouputIR));
c.output(KV.of(id, indexRecord));
}
};
}
Expand Down
4 changes: 2 additions & 2 deletions livingatlas/solr/scripts/update-solr-cluster-config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ zip config.zip *
#curl -X GET "http://localhost:8987/solr/admin/collections?action=DELETE&name=biocache3"

#echo 'Deleting existing configset'
curl -X GET "http://localhost:8988/solr/admin/configs?action=DELETE&name=biocache&omitHeader=true"
curl -X GET "http://localhost:8983/solr/admin/configs?action=DELETE&name=biocache&omitHeader=true"

echo 'Creating configset'
curl -X POST --header "Content-Type:application/octet-stream" --data-binary @config.zip "http://localhost:8988/solr/admin/configs?action=UPLOAD&name=biocache"
curl -X POST --header "Content-Type:application/octet-stream" --data-binary @config.zip "http://localhost:8983/solr/admin/configs?action=UPLOAD&name=biocache_dev"

#echo 'Creating collection'
#curl -X GET "http://localhost:8986/solr/admin/collections?action=CREATE&name=biocache&numShards=8&maxShardsPerNode=1&replicationFactor=1&collection.configName=biocache"
Expand Down

0 comments on commit 76a5792

Please sign in to comment.