Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LivingAtlas: Additional fields for SpeciesListPipeline (ARGA) #865

Open
wants to merge 23 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions livingatlas/configs/la-pipelines.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,8 @@ speciesLists:
maxDownloadAgeInMinutes: 1440
includeConservationStatus: true
includeInvasiveStatus: true
includePresentInCountry: false
includeTraits: false

# Sampling specific configuration
sampling:
Expand Down
2 changes: 2 additions & 0 deletions livingatlas/pipelines/src/main/docker/solr8.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ services:
- "start"
- "-cloud"
- "-f"
restart: on-failure
platform: linux/amd64
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
* <li>Links to species lists for records
* <li>stateProvince and country associated conservation status for the record
* <li>stateProvince and country associated invasive status for the record
* <li>optional `presentInCountry` flag for the record
* <li>optional species `trait` values for the record
* </ul>
*
* This pipeline is left for debug purposes only. Species lists are joined to the records in the
Expand Down Expand Up @@ -149,6 +151,8 @@ public KV<String, String> apply(KV<String, ALATaxonRecord> record) {

final boolean includeConservationStatus = options.getIncludeConservationStatus();
final boolean includeInvasiveStatus = options.getIncludeInvasiveStatus();
final boolean includePresentInCountry = options.getIncludePresentInCountry();
final boolean includeTraits = options.getIncludeTraits();

// join collections
return result.apply(
Expand All @@ -167,7 +171,11 @@ public void processElement(ProcessContext c) {
if (speciesLists != null) {
TaxonProfile.Builder builder =
SpeciesListUtils.createTaxonProfileBuilder(
speciesLists, includeConservationStatus, includeInvasiveStatus);
speciesLists,
includeConservationStatus,
includeInvasiveStatus,
includePresentInCountry,
includeTraits);
// output a link to each occurrence record we've matched by taxonID
for (String occurrenceID : occurrenceIDs) {
builder.setId(occurrenceID);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,9 @@ public static Map<String, TaxonProfile> generateTaxonProfileCollection(
alaTaxonRecord,
speciesListMap,
options.getIncludeConservationStatus(),
options.getIncludeInvasiveStatus()))
options.getIncludeInvasiveStatus(),
options.getIncludePresentInCountry(),
options.getIncludeTraits()))
.collect(Collectors.toList());

return profiles.stream()
Expand All @@ -152,15 +154,21 @@ static TaxonProfile convertToTaxonProfile(
ALATaxonRecord alaTaxonRecord,
Map<String, List<SpeciesListRecord>> speciesListMap,
boolean includeConservationStatus,
boolean includeInvasiveStatus) {
boolean includeInvasiveStatus,
boolean includePresentInCountry,
boolean includeTraits) {

Iterable<SpeciesListRecord> speciesLists =
speciesListMap.get(alaTaxonRecord.getTaxonConceptID());

if (speciesLists != null) {
TaxonProfile.Builder builder =
SpeciesListUtils.createTaxonProfileBuilder(
speciesLists, includeConservationStatus, includeInvasiveStatus);
speciesLists,
includeConservationStatus,
includeInvasiveStatus,
includePresentInCountry,
includeTraits);
builder.setId(alaTaxonRecord.getId());
return builder.build();
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,14 @@ public interface SpeciesLevelPipelineOptions extends InterpretationPipelineOptio
Boolean getIncludeInvasiveStatus();

void setIncludeInvasiveStatus(Boolean includeInvasiveStatus);

@Default.Boolean(false)
Boolean getIncludePresentInCountry();

void setIncludePresentInCountry(Boolean includePresentInCountry);

@Default.Boolean(false)
Boolean getIncludeTraits();

void setIncludeTraits(Boolean includeTraits);
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ public interface IndexFields {
String POINT_0_02 = "point-0.02";
String POINT_0_1 = "point-0.1";
String POINT_1 = "point-1";
String PRESENT_IN_COUNTRY = "presentInCountry";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wondering why we need this. Can't the data just provide countryCode?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the field name, so data will look like presentInCountry:Australia or presentInCountry:Italy. We could use the country code, I suppose (presentInCountry:AU), but the data comes from the region field in the species list, which uses the full name, so that would require an additional lookup.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've renamed the field to taxonPresentInCountry now.

String PROVENANCE = "provenance";
String TAXON_RANK = "taxonRank";
String RAW_STATE_CONSERVATION = "raw_stateConservation";
Expand All @@ -73,4 +74,7 @@ public interface IndexFields {
String GGBN_TERMS_LOAN = "http://data.ggbn.org/schemas/ggbn/terms/Loan";
String LOAN_DESTINATION_TERM = "http://data.ggbn.org/schemas/ggbn/terms/loanDestination";
String LOAN_IDENTIFIER_TERM = "http://data.ggbn.org/schemas/ggbn/terms/loanIdentifier";
String AUS_TRAITS_FIRE_RESPONSE = "fire_response";
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Convert from snake case to camel case.

String AUS_TRAITS_POST_FIRE_RECRUITMENT = "post_fire_recruitment";
String AUS_TRAITS_PHOTOSYNTHETIC_PATHWAY = "photosynthetic_pathway";
}
Original file line number Diff line number Diff line change
Expand Up @@ -805,6 +805,51 @@ private static void addSpeciesListInfo(
}
}
}

// index presentInCountry
if (tpr.getPresentInCountry() != null) {
indexRecord.getStrings().put(PRESENT_IN_COUNTRY, tpr.getPresentInCountry());
}

// taxon-level traits from speciesLists
Map<String, String> traits = tpr.getTraits();
for (Map.Entry<String, String> trait : traits.entrySet()) {
if (trait.getKey() != null) {
// save to a <Map> for dynamic-properties fallback
Map<String, String> traitMap = new HashMap<>();
traitMap.put(trait.getKey(), trait.getValue());
// check if traitName is declared as a value in @au.org.ala.pipelines.transforms.IndexFields
java.lang.reflect.Field[] fields = IndexFields.class.getDeclaredFields();
boolean isTraitInDeclaredFields = false;
// Check each <IndexFields> field value to see if it matches the current trait name
for (java.lang.reflect.Field f : fields) {
String strValue = null;
try {
strValue = (String) f.get(null);
} catch (IllegalAccessException e) {
// Don't throw an exception - log.warn and failover to next speciesList
log.warn(
"addSpeciesListInfo() - failed to get value for <IndexFields> field: "
+ f.getName()
+ ", with exception: "
+ e.getMessage());
}
if (strValue.equals(trait.getKey())) {
isTraitInDeclaredFields = true;
break;
}
}
// Dirty data has duplicate entries process via a Set first
Set<String> traitValuesSet = new HashSet<>(Arrays.asList(trait.getValue().split("\\|")));
List<String> traitValuesList = new ArrayList<>(traitValuesSet);
// Add to indexedRecord either as multivalues or dynamicProperties
if (isTraitInDeclaredFields) {
addIfNotEmpty(indexRecord, trait.getKey(), traitValuesList);
} else {
indexRecord.setDynamicProperties(traitMap);
}
}
}
}

private static MultimediaIndexRecord convertToMultimediaRecord(String uuid, Image image) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
package au.org.ala.pipelines.util;

import com.google.common.base.Strings;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.*;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just noticed this - IDE did this and it might break coding rules?

import lombok.AccessLevel;
import lombok.NoArgsConstructor;
import org.gbif.pipelines.io.avro.*;
Expand All @@ -12,19 +10,25 @@
@NoArgsConstructor(access = AccessLevel.PRIVATE)
public class SpeciesListUtils {

private static String LIST_COMMON_TRAIT = "COMMON_TRAIT";

/**
* Creates a reusable template (Builder) for a TaxonProfile based on the supplied species lists.
*/
public static TaxonProfile.Builder createTaxonProfileBuilder(
Iterable<SpeciesListRecord> speciesLists,
boolean includeConservationStatus,
boolean includeInvasiveStatus) {
boolean includeInvasiveStatus,
boolean includePresentInCountry,
boolean includeTraits) {

Iterator<SpeciesListRecord> iter = speciesLists.iterator();

List<String> speciesListIDs = new ArrayList<>();
List<ConservationStatus> conservationStatusList = new ArrayList<>();
List<InvasiveStatus> invasiveStatusList = new ArrayList<>();
String presentInCountryValue = null;
Map<String, String> traitsMap = new HashMap<>();

while (iter.hasNext()) {

Expand All @@ -48,6 +52,12 @@ public static TaxonProfile.Builder createTaxonProfileBuilder(
.setSpeciesListID(speciesListRecord.getSpeciesListID())
.setRegion(speciesListRecord.getRegion())
.build());
} else if (includePresentInCountry && speciesListRecord.getPresentInCountry() != null) {
presentInCountryValue = speciesListRecord.getPresentInCountry();
} else if (includeTraits
&& speciesListRecord.getListType().equals(LIST_COMMON_TRAIT)
&& speciesListRecord.getTraitName() != null) {
traitsMap.put(speciesListRecord.getTraitName(), speciesListRecord.getTraitValue());
}
}

Expand All @@ -56,6 +66,8 @@ public static TaxonProfile.Builder createTaxonProfileBuilder(
builder.setSpeciesListID(speciesListIDs);
builder.setConservationStatuses(conservationStatusList);
builder.setInvasiveStatuses(invasiveStatusList);
builder.setPresentInCountry(presentInCountryValue);
builder.setTraits(traitsMap);
return builder;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ public class SpeciesList {
boolean isAuthoritative;
boolean isInvasive;
boolean isThreatened;
String presentInCountry;

@JsonPOJOBuilder(withPrefix = "")
@JsonIgnoreProperties(ignoreUnknown = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ public static void run(SpeciesLevelPipelineOptions options) throws IOException {
int guidIdx = columnHeaders.indexOf("guid");
int statusIdx = columnHeaders.indexOf("status");
int sourceStatusIdx = columnHeaders.indexOf("sourceStatus");
int countIdx = columnHeaders.indexOf("count");
int traitNameIdx = columnHeaders.indexOf("traitName");
int traitValueIdx = columnHeaders.indexOf("traitValue");

String region = null;

Expand Down Expand Up @@ -164,16 +167,30 @@ public static void run(SpeciesLevelPipelineOptions options) throws IOException {

String status = statusIdx > 0 ? currentLine[statusIdx] : null;
String sourceStatus = sourceStatusIdx > 0 ? currentLine[sourceStatusIdx] : null;
String count = countIdx > 0 ? currentLine[countIdx] : null;
String traitName = traitNameIdx > 0 ? currentLine[traitNameIdx] : null;
String traitValue = traitValueIdx > 0 ? currentLine[traitValueIdx] : null;
// ARGA addition to set `presentInCountry` to the value specified in the list's
// `region` attribute, when list has type "OTHER", has region set and
// contains a `count` column (note: count not currently used)
String presentInCountry =
(list.getListType().equals("OTHER") && region != null && count != null)
? region
: null;

SpeciesListRecord speciesListRecord =
SpeciesListRecord.newBuilder()
.setTaxonID(taxonID)
.setSpeciesListID(list.getDataResourceUid())
.setStatus(status)
.setRegion(region)
.setListType(list.getListType())
.setIsInvasive(list.isInvasive())
.setIsThreatened(list.isThreatened())
.setSourceStatus(sourceStatus)
.setPresentInCountry(presentInCountry)
.setTraitName(traitName)
.setTraitValue(traitValue)
.build();
dataFileWriter.append(speciesListRecord);
taxaRead++;
Expand Down
6 changes: 6 additions & 0 deletions livingatlas/solr/conf/managed-schema
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,12 @@
<field name="stateConservation" type="string" docValues="true" multiValued="false" indexed="true" />
<field name="countryConservation" type="string" docValues="true" multiValued="false" indexed="true" />

<!-- ARGA fields -->
<field name="presentInCountry" type="string" docValues="true" multiValued="false" indexed="true" />
<field name="fire_response" type="string" docValues="true" multiValued="true" indexed="true" />
<field name="post_fire_recruitment" type="string" docValues="true" multiValued="true" indexed="true" />
<field name="photosynthetic_pathway" type="string" docValues="true" multiValued="true" indexed="true" />

<!-- Additional invasive fields -->
<field name="stateInvasive" type="string" docValues="true" multiValued="false" indexed="true" />
<field name="countryInvasive" type="string" docValues="true" multiValued="false" indexed="true" />
Expand Down
6 changes: 5 additions & 1 deletion sdks/models/src/main/avro/specific/species-list-record.avsc
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@
{"name": "speciesListID","type":"string"},
{"name": "isThreatened", "type": "boolean"},
{"name": "isInvasive", "type": "boolean"},
{"name": "listType", "type": ["null", "string"], "default": null },
{"name": "region", "type": ["null", "string"], "default": null },
{"name": "status", "type": ["null", "string"]},
{"name": "sourceStatus", "type": ["null", "string"]}
{"name": "sourceStatus", "type": ["null", "string"]},
{"name": "presentInCountry", "type": ["null", "string"], "default": null},
{"name": "traitName", "type": ["null", "string"], "default": null},
{"name": "traitValue", "type": ["null", "string"], "default": null}
]
}
4 changes: 3 additions & 1 deletion sdks/models/src/main/avro/specific/taxon-profile.avsc
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@
{"name": "id", "type": ["null", "string"]},
{"name": "speciesListID", "type": {"type" : "array", "items" : "string"}, "default" : []},
{"name": "conservationStatuses", "type": {"type" : "array", "items" : "ConservationStatus"}, "default" : []},
{"name": "invasiveStatuses", "type": {"type" : "array", "items" : "InvasiveStatus"}, "default" : []}
{"name": "invasiveStatuses", "type": {"type" : "array", "items" : "InvasiveStatus"}, "default" : []},
{"name": "presentInCountry", "type": ["null", "string"], "default" : null},
{"name": "traits", "type": {"type": "map", "values": "string"}, "default" : {}}
]
}
]