Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Added recordedByID to the ES query builder
  • Loading branch information
muttcg committed Mar 13, 2020
1 parent 267dbff commit 4b8d8b2
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 110 deletions.
Original file line number Diff line number Diff line change
@@ -1,24 +1,37 @@
package org.gbif.occurrence.download.file;

import java.net.URI;
import java.time.ZoneOffset;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.UUID;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.gbif.api.model.occurrence.Occurrence;
import org.gbif.api.model.occurrence.UserIdentifier;
import org.gbif.api.util.ClassificationUtils;
import org.gbif.api.vocabulary.Country;
import org.gbif.api.vocabulary.OccurrenceIssue;
import org.gbif.api.vocabulary.Rank;
import org.gbif.dwc.terms.*;
import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.GbifInternalTerm;
import org.gbif.dwc.terms.GbifTerm;
import org.gbif.dwc.terms.Term;
import org.gbif.occurrence.common.TermUtils;
import org.gbif.occurrence.common.download.DownloadUtils;
import org.gbif.occurrence.download.hive.DownloadTerms;

import java.net.URI;
import java.time.ZoneOffset;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import org.apache.commons.lang3.tuple.Pair;

import static org.gbif.occurrence.common.download.DownloadUtils.DELIMETERS_MATCH_PATTERN;

Expand All @@ -27,6 +40,10 @@
*/
public class OccurrenceMapReader {

private OccurrenceMapReader() {
// NOP
}

public static final Map<Rank, Term> rank2KeyTerm =
ImmutableMap.<Rank, Term>builder().put(Rank.KINGDOM, GbifTerm.kingdomKey).put(Rank.PHYLUM, GbifTerm.phylumKey)
.put(Rank.CLASS, GbifTerm.classKey).put(Rank.ORDER, GbifTerm.orderKey).put(Rank.FAMILY, GbifTerm.familyKey)
Expand Down Expand Up @@ -58,6 +75,7 @@ public static Map<String, String> buildInterpretedOccurrenceMap(Occurrence occur
interpretedOccurrence.put(GbifTerm.typifiedName.simpleName(), occurrence.getTypifiedName());
interpretedOccurrence.put(GbifTerm.lastParsed.simpleName(), getSimpleValue(occurrence.getLastParsed()));
interpretedOccurrence.put(GbifTerm.lastInterpreted.simpleName(), getSimpleValue(occurrence.getLastInterpreted()));
interpretedOccurrence.put(GbifTerm.recordedByID.simpleName(), getSimpleValue(occurrence.getRecordedByIds()));

Optional.ofNullable(occurrence.getVerbatimField(DcTerm.identifier))
.ifPresent(x -> interpretedOccurrence.put(DcTerm.identifier.simpleName(), x));
Expand Down Expand Up @@ -202,6 +220,16 @@ private static String getSimpleValue(Object value) {
return null;
}

/**
* Transform a list of UserIdentifier's into simple string, like:
* ORCID:1312312,METADATA:123123,OTHER:12312
*/
private static String getSimpleValue(List<UserIdentifier> recordedByIds) {
return recordedByIds.stream()
.map(x -> x.getType().name() + ":" + x.getValue())
.collect(Collectors.joining(","));
}


/**
* Validates if the occurrence record it's a repatriated record.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ private static AggregationBuilder buildAggs(OccurrenceHeatmapRequest request) {
GeoBoundingBoxQueryBuilder geoBoundingBoxQuery =
QueryBuilders.geoBoundingBoxQuery(OccurrenceEsField.COORDINATE_POINT.getFieldName())
.setCorners(
Double.valueOf(coords[3]),
Double.valueOf(coords[0]),
Double.valueOf(coords[1]),
Double.valueOf(coords[2]));
Double.parseDouble(coords[3]),
Double.parseDouble(coords[0]),
Double.parseDouble(coords[1]),
Double.parseDouble(coords[2]));

FilterAggregationBuilder filterAggs = AggregationBuilders.filter(BOX_AGGS, geoBoundingBoxQuery);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public class ExtResultReader {
private static final String ROW_CAN_T_BE_NULL_MSG = "row can't be null";
private static final String COLUMN_CAN_T_BE_NULL_MSG = "column can't be null";

private static String CF = Columns.OCCURRENCE_COLUMN_FAMILY;
private static final String CF = Columns.OCCURRENCE_COLUMN_FAMILY;
/**
* Should never be instantiated.
*/
Expand Down Expand Up @@ -129,7 +129,7 @@ public static <T extends Enum<?>> T getEnum(Result row, Term column, Class<T> en
String value = getString(row, Columns.column(column), null);
if (!Strings.isNullOrEmpty(value)) {
try {
return (T) VocabularyUtils.lookupEnum(value, enumClass);
return VocabularyUtils.lookupEnum(value, enumClass);
} catch (IllegalArgumentException e) {
// value not matching enum!!! LOG???
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ private EsQueryUtils() {}
.put(OccurrenceSearchParameter.RELATIVE_ORGANISM_QUANTITY, OccurrenceEsField.RELATIVE_ORGANISM_QUANTITY)
.put(OccurrenceSearchParameter.COLLECTION_KEY, OccurrenceEsField.COLLECTION_KEY)
.put(OccurrenceSearchParameter.INSTITUTION_KEY, OccurrenceEsField.INSTITUTION_KEY)
.put(OccurrenceSearchParameter.RECORDED_BY_ID, OccurrenceEsField.RECORDED_BY_ID_VALUE)
.build();

static final Map<OccurrenceEsField, Integer> CARDINALITIES =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.gbif.api.model.common.search.SearchResponse;
import org.gbif.api.model.occurrence.Occurrence;
import org.gbif.api.model.occurrence.OccurrenceRelation;
import org.gbif.api.model.occurrence.UserIdentifier;
import org.gbif.api.model.occurrence.VerbatimOccurrence;
import org.gbif.api.model.occurrence.search.OccurrenceSearchParameter;
import org.gbif.api.model.occurrence.search.OccurrenceSearchRequest;
Expand Down Expand Up @@ -124,37 +125,30 @@ private static List<? extends Terms.Bucket> getBuckets(Aggregation aggregation)

private static Optional<List<Facet<OccurrenceSearchParameter>>> parseFacets(
org.elasticsearch.action.search.SearchResponse esResponse, OccurrenceSearchRequest request) {

Function<Aggregation, Facet<OccurrenceSearchParameter>> mapFn = aggs -> {
// get buckets
List<? extends Terms.Bucket> buckets = getBuckets(aggs);

// get facet of the agg
OccurrenceSearchParameter facet = ES_TO_SEARCH_MAPPING.get(aggs.getName());

// check for paging in facets
long facetOffset = extractFacetOffset(request, facet);
long facetLimit = extractFacetLimit(request, facet);

List<Facet.Count> counts =
buckets.stream()
.skip(facetOffset)
.limit(facetOffset + facetLimit)
.map(b -> new Facet.Count(b.getKeyAsString(), b.getDocCount()))
.collect(Collectors.toList());

return new Facet<>(facet, counts);
};

return Optional.ofNullable(esResponse.getAggregations())
.map(
aggregations ->
aggregations.asList().stream()
.map(
aggs -> {
// get buckets
List<? extends Terms.Bucket> buckets = getBuckets(aggs);

// get facet of the agg
OccurrenceSearchParameter facet =
ES_TO_SEARCH_MAPPING.get(aggs.getName());

// check for paging in facets
long facetOffset = extractFacetOffset(request, facet);
long facetLimit = extractFacetLimit(request, facet);

List<Facet.Count> counts =
buckets.stream()
.skip(facetOffset)
.limit(facetOffset + facetLimit)
.map(
b ->
new Facet.Count(
b.getKeyAsString(),
b.getDocCount()))
.collect(Collectors.toList());

return new Facet<>(facet, counts);
})
.collect(Collectors.toList()));
.map(aggregations -> aggregations.asList().stream().map(mapFn).collect(Collectors.toList()));
}

private static Optional<List<Occurrence>> parseHits(org.elasticsearch.action.search.SearchResponse esResponse, boolean excludeInterpreted) {
Expand Down Expand Up @@ -269,6 +263,8 @@ public static Occurrence toOccurrence(SearchHit hit, boolean excludeInterpreted)
// multimedia extension
parseMultimediaItems(hit, occ);

parseRecordedByIds(hit, occ);

// add verbatim fields
occ.getVerbatimFields().putAll(extractVerbatimFields(hit, excludeInterpreted));
// TODO: add verbatim extensions
Expand Down Expand Up @@ -300,8 +296,6 @@ private static void setIdentifier(SearchHit hit, VerbatimOccurrence occ) {
.ifPresent(result -> occ.getVerbatimFields().put(DcTerm.identifier, result));
}



private static void setOccurrenceFields(SearchHit hit, Occurrence occ) {
getValue(hit, GBIF_ID, Long::valueOf)
.ifPresent(
Expand All @@ -310,8 +304,7 @@ private static void setOccurrenceFields(SearchHit hit, Occurrence occ) {
occ.getVerbatimFields().put(GbifTerm.gbifID, String.valueOf(id));
});
getValue(hit, BASIS_OF_RECORD, BasisOfRecord::valueOf).ifPresent(occ::setBasisOfRecord);
getValue(hit, ESTABLISHMENT_MEANS, EstablishmentMeans::valueOf)
.ifPresent(occ::setEstablishmentMeans);
getValue(hit, ESTABLISHMENT_MEANS, EstablishmentMeans::valueOf).ifPresent(occ::setEstablishmentMeans);
getValue(hit, LIFE_STAGE, LifeStage::valueOf).ifPresent(occ::setLifeStage);
getDateValue(hit, MODIFIED).ifPresent(occ::setModified);
getValue(hit, REFERENCES, URI::create).ifPresent(occ::setReferences);
Expand Down Expand Up @@ -346,6 +339,19 @@ private static void setOccurrenceFields(SearchHit hit, Occurrence occ) {
getDoubleValue(hit, RELATIVE_ORGANISM_QUANTITY).ifPresent(occ::setRelativeOrganismQuantity);
}

private static void parseRecordedByIds(SearchHit hit, Occurrence occ) {
Function<Map<String, Object>, UserIdentifier> mapFn = m -> {
UserIdentifier ui = new UserIdentifier();
extractValue(m, "type", UserIdentifierType::valueOf).ifPresent(ui::setType);
extractStringValue(m, "value").ifPresent(ui::setValue);
return ui;
};

getObjectsListValue(hit, RECORDED_BY_IDS)
.map(i -> i.stream().map(mapFn).collect(Collectors.toList()))
.ifPresent(occ::setRecordedByIds);
}

private static void setTemporalFields(SearchHit hit, Occurrence occ) {
getDateValue(hit, DATE_IDENTIFIED).ifPresent(occ::setDateIdentified);
getValue(hit, DAY, Integer::valueOf).ifPresent(occ::setDay);
Expand Down Expand Up @@ -421,52 +427,36 @@ private static void setCrawlingFields(SearchHit hit, Occurrence occ) {
}

private static void parseMultimediaItems(SearchHit hit, Occurrence occ) {

Function<Map<String, Object>, MediaObject> mapFn = m -> {
MediaObject mediaObject = new MediaObject();

extractValue(m, "type", MediaType::valueOf).ifPresent(mediaObject::setType);
extractValue(m, "identifier", URI::create).ifPresent(mediaObject::setIdentifier);
extractValue(m, "references", URI::create).ifPresent(mediaObject::setReferences);
extractValue(m, "created", STRING_TO_DATE).ifPresent(mediaObject::setCreated);
extractStringValue(m, "format").ifPresent(mediaObject::setFormat);
extractStringValue(m, "audience").ifPresent(mediaObject::setAudience);
extractStringValue(m, "contributor").ifPresent(mediaObject::setContributor);
extractStringValue(m, "creator").ifPresent(mediaObject::setCreator);
extractStringValue(m, "description").ifPresent(mediaObject::setDescription);
extractStringValue(m, "publisher").ifPresent(mediaObject::setPublisher);
extractStringValue(m, "rightsHolder").ifPresent(mediaObject::setRightsHolder);
extractStringValue(m, "source").ifPresent(mediaObject::setSource);
extractStringValue(m, "title").ifPresent(mediaObject::setTitle);
extractStringValue(m, "license")
.map(license ->
License.fromString(license)
.map(l -> Optional.ofNullable(l.getLicenseUrl()).orElse(license))
.orElse(license))
.ifPresent(mediaObject::setLicense);

return mediaObject;
};

getObjectsListValue(hit, MEDIA_ITEMS)
.ifPresent(
items ->
occ.setMedia(
items.stream()
.map(
item -> {
MediaObject mediaObject = new MediaObject();

extractValue(item, "type", MediaType::valueOf)
.ifPresent(mediaObject::setType);
extractStringValue(item, "format").ifPresent(mediaObject::setFormat);
extractValue(item, "identifier", URI::create)
.ifPresent(mediaObject::setIdentifier);
extractStringValue(item, "audience")
.ifPresent(mediaObject::setAudience);
extractStringValue(item, "contributor")
.ifPresent(mediaObject::setContributor);
extractValue(item, "created", STRING_TO_DATE)
.ifPresent(mediaObject::setCreated);
extractStringValue(item, "creator")
.ifPresent(mediaObject::setCreator);
extractStringValue(item, "description")
.ifPresent(mediaObject::setDescription);
extractStringValue(item, "license")
.map(
license ->
License.fromString(license)
.map(
l ->
Optional.ofNullable(l.getLicenseUrl())
.orElse(license))
.orElse(license))
.ifPresent(mediaObject::setLicense);
extractStringValue(item, "publisher")
.ifPresent(mediaObject::setPublisher);
extractValue(item, "references", URI::create)
.ifPresent(mediaObject::setReferences);
extractStringValue(item, "rightsHolder")
.ifPresent(mediaObject::setRightsHolder);
extractStringValue(item, "source").ifPresent(mediaObject::setSource);
extractStringValue(item, "title").ifPresent(mediaObject::setTitle);

return mediaObject;
})
.collect(Collectors.toList())));
.map(i -> i.stream().map(mapFn).collect(Collectors.toList()))
.ifPresent(occ::setMedia);
}

private static Optional<String> getStringValue(SearchHit hit, OccurrenceEsField esField) {
Expand Down Expand Up @@ -515,15 +505,17 @@ private static <T> Optional<T> getValue(SearchHit hit, OccurrenceEsField esField
}

private static <T> Optional<T> extractValue(Map<String, Object> fields, String fieldName, Function<String, T> mapper) {
return Optional.ofNullable(fields.get(fieldName)).map(String::valueOf).filter(v -> !v.isEmpty())
.map(v -> {
try {
return mapper.apply(v);
} catch (Exception ex) {
LOG.error("Error extracting field {} with value {}", fieldName, v);
return null;
}
});
return Optional.ofNullable(fields.get(fieldName))
.map(String::valueOf)
.filter(v -> !v.isEmpty())
.map(v -> {
try {
return mapper.apply(v);
} catch (Exception ex) {
LOG.error("Error extracting field {} with value {}", fieldName, v);
return null;
}
});
}

private static Optional<String> extractStringValue(Map<String, Object> fields, String fieldName) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
package org.gbif.occurrence.search.es;

/**
* Enumeration that holds a map
*/

import org.gbif.dwc.terms.DcTerm;
import org.gbif.dwc.terms.DwcTerm;
import org.gbif.dwc.terms.GbifInternalTerm;
Expand Down Expand Up @@ -71,7 +67,7 @@ public enum OccurrenceEsField {

//Taxonomic classification
TAXON_KEY("gbifClassification.taxonKey", GbifTerm.taxonKey),
USAGE_TAXON_KEY("gbifClassification.usage.key",GbifTerm.taxonKey),
USAGE_TAXON_KEY("gbifClassification.usage.key", GbifTerm.taxonKey),
TAXON_RANK("gbifClassification.usage.rank", DwcTerm.taxonRank),
ACCEPTED_TAXON_KEY("gbifClassification.acceptedUsage.key", GbifTerm.acceptedTaxonKey),
ACCEPTED_SCIENTIFIC_NAME("gbifClassification.acceptedUsage.name", GbifTerm.acceptedScientificName),
Expand Down Expand Up @@ -117,6 +113,8 @@ public enum OccurrenceEsField {
SAMPLE_SIZE_UNIT("sampleSizeUnit", DwcTerm.sampleSizeUnit),
SAMPLE_SIZE_VALUE("sampleSizeValue", DwcTerm.sampleSizeValue),
RELATIVE_ORGANISM_QUANTITY("relativeOrganismQuantity", GbifTerm.relativeOrganismQuantity),
RECORDED_BY_IDS("recordedByIds", GbifTerm.recordedByID),
RECORDED_BY_ID_VALUE("recordedByIds.value", GbifTerm.recordedByID),

//Crawling
CRAWL_ID("crawlId", GbifInternalTerm.crawlId),
Expand All @@ -134,7 +132,6 @@ public enum OccurrenceEsField {
FULL_TEXT("all", null);



private final String fieldName;

private final Term term;
Expand Down
Loading

0 comments on commit 4b8d8b2

Please sign in to comment.