Skip to content

Commit

Permalink
#733 Improve NCBI clustering (#746)
Browse files Browse the repository at this point in the history
  • Loading branch information
timrobertson100 authored Jun 21, 2022
1 parent 809d2e3 commit 509df46
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@
* of a SQL select a.*,b.* from a join b).
*/
public class RowOccurrenceFeatures implements OccurrenceFeatures {

// Keys of the datasets that are treated as sequence repositories.
// Dataset keys are considered reliable over time, which is why they are hard-coded here.
// Entries are written in lower case; keep new entries lower case too, since the lookup
// lower-cases the record's dataset key before matching.
private static final List<String> SEQUENCE_REPOSITORY_KEYS =
Arrays.asList(
"d8cd16ba-bb74-4420-821e-083f2bac17c2", // INSDC sequences
"393b8c26-e4e0-4dd0-a218-93fc074ebf4e", // INSDC host organisms
"583d91fe-bbc0-4b4a-afe1-801f88263016", // INSDC environmental samples
"040c5662-da76-4782-a48e-cdea1892d14c" // iBOL
);

private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

static {
Expand Down Expand Up @@ -228,6 +238,11 @@ public String getCollectionCode() {
return get("collectionCode");
}

/**
 * Reports whether this row belongs to one of the known sequence repository datasets.
 *
 * <p>The dataset key is lower-cased and looked up in {@code SEQUENCE_REPOSITORY_KEYS}. A row
 * without a dataset key is reported as not originating from a sequence repository rather than
 * failing.
 *
 * @return {@code true} if the dataset key matches a known sequence repository, otherwise {@code
 *     false}
 */
@Override
public boolean isFromSequenceRepository() {
  String datasetKey = getDatasetKey();
  if (datasetKey == null) {
    // no key to match on; treat as "not a sequence repository" instead of throwing an NPE
    return false;
  }
  // Locale.ROOT keeps the lower-casing independent of the JVM default locale
  return SEQUENCE_REPOSITORY_KEYS.contains(datasetKey.toLowerCase(java.util.Locale.ROOT));
}

List<String> listOrNull(String field) {
Object o = get(field);
// what follows exists only to simplify testing (List) and Spark using Hive (Seq) integrations
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,4 +91,16 @@ default String getScopedIdentifier() {
return null;
}
}

/**
* Allows implementations to declare that the record originates from a sequence repository.
*
* <p>This default implementation always returns {@code false}, meaning that consumers may
* receive false negatives from implementations that do not override it. This hook was
* introduced to allow a relaxation of the rules to accommodate the sparse metadata seen in
* repositories like NCBI.
*
* @return {@code true} if the record is declared to originate from a sequence repository; this
*     default always returns {@code false}
* @see <a href="https://github.com/gbif/pipelines/issues/733">Pipelines issue 733</a>
*/
default boolean isFromSequenceRepository() {
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import com.google.common.annotations.VisibleForTesting;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
Expand Down Expand Up @@ -46,17 +47,42 @@ public static <T extends OccurrenceFeatures> RelationshipAssertion<T> generate(T
}

// fact combinations that are of interest as assertions
FeatureAssertion[][] passConditions = {
{SAME_ACCEPTED_SPECIES, SAME_COORDINATES, SAME_DATE},
{SAME_ACCEPTED_SPECIES, WITHIN_200m, SAME_DATE}, // accommodate 3 decimal place roundings
{SAME_ACCEPTED_SPECIES, SAME_COORDINATES, NON_CONFLICTING_DATE, IDENTIFIERS_OVERLAP},
{SAME_ACCEPTED_SPECIES, WITHIN_200m, NON_CONFLICTING_DATE, IDENTIFIERS_OVERLAP},
{SAME_ACCEPTED_SPECIES, WITHIN_2Km, SAME_DATE, IDENTIFIERS_OVERLAP},
{SAME_ACCEPTED_SPECIES, WITHIN_2Km, NON_CONFLICTING_DATE, IDENTIFIERS_OVERLAP},
{SAME_ACCEPTED_SPECIES, NON_CONFLICTING_COORDINATES, SAME_DATE, IDENTIFIERS_OVERLAP},
{SAME_ACCEPTED_SPECIES, SAME_COORDINATES, APPROXIMATE_DATE, SAME_RECORDER_NAME},
{SAME_ACCEPTED_SPECIES, WITHIN_2Km, APPROXIMATE_DATE, SAME_RECORDER_NAME},
};
List<FeatureAssertion[]> passConditions =
new ArrayList<>(
Arrays.asList(
new FeatureAssertion[][] {
{SAME_ACCEPTED_SPECIES, SAME_COORDINATES, SAME_DATE},
{SAME_ACCEPTED_SPECIES, WITHIN_200m, SAME_DATE},
{
SAME_ACCEPTED_SPECIES,
SAME_COORDINATES,
NON_CONFLICTING_DATE,
IDENTIFIERS_OVERLAP
},
{SAME_ACCEPTED_SPECIES, WITHIN_200m, NON_CONFLICTING_DATE, IDENTIFIERS_OVERLAP},
{SAME_ACCEPTED_SPECIES, WITHIN_2Km, SAME_DATE, IDENTIFIERS_OVERLAP},
{SAME_ACCEPTED_SPECIES, WITHIN_2Km, NON_CONFLICTING_DATE, IDENTIFIERS_OVERLAP},
{
SAME_ACCEPTED_SPECIES,
NON_CONFLICTING_COORDINATES,
SAME_DATE,
IDENTIFIERS_OVERLAP
},
{SAME_ACCEPTED_SPECIES, SAME_COORDINATES, APPROXIMATE_DATE, SAME_RECORDER_NAME},
{SAME_ACCEPTED_SPECIES, WITHIN_2Km, APPROXIMATE_DATE, SAME_RECORDER_NAME},
}));

// Accommodate sparse data from sequence repositories
// see https://github.com/gbif/pipelines/issues/733
if (o1.isFromSequenceRepository() || o2.isFromSequenceRepository()) {
passConditions.add(
new FeatureAssertion[] {
SAME_ACCEPTED_SPECIES,
NON_CONFLICTING_COORDINATES,
NON_CONFLICTING_DATE,
IDENTIFIERS_OVERLAP
});
}

// always exclude things on different location or date
if (assertion.justificationDoesNotContain(DIFFERENT_DATE, DIFFERENT_COUNTRY)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ public class OccurrenceFeaturesPojo implements OccurrenceFeatures {
private final String institutionCode;
private final String collectionCode;

private final boolean isFromSequenceRepository;

@Override
public String getId() {
return id;
Expand Down Expand Up @@ -138,4 +140,9 @@ public String getInstitutionCode() {
public String getCollectionCode() {
return collectionCode;
}

/** Returns the value of the {@code isFromSequenceRepository} field. */
@Override
public boolean isFromSequenceRepository() {
  return this.isFromSequenceRepository;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,31 @@ public void testCompareOmittedIdentifiers() {
assertFalse(runCompareIdentifier("s.n.", "S/N").justificationContains(IDENTIFIERS_OVERLAP));
}

/**
 * Verifies that the rules are relaxed when one of the compared records originates from a
 * sequence repository.
 */
@Test
public void testSequenceRepositories() {
  // two sparse records sharing species and catalogue number, neither flagged
  OccurrenceFeatures plain =
      OccurrenceFeaturesPojo.builder().id("1").speciesKey("212").catalogNumber("ABC").build();
  OccurrenceFeatures plainTwin =
      OccurrenceFeaturesPojo.builder().id("2").speciesKey("212").catalogNumber("ABC").build();

  // equally sparse record, but flagged as coming from a sequence repository
  OccurrenceFeatures sequenced =
      OccurrenceFeaturesPojo.builder()
          .id("2")
          .datasetKey("2")
          .speciesKey("212")
          .catalogNumber("ABC")
          .isFromSequenceRepository(true) // should relax rules
          .build();

  // without the flag the pair must not produce an assertion
  assertNull(OccurrenceRelationships.generate(plain, plainTwin));

  // with the flag on one side an assertion is expected
  RelationshipAssertion<OccurrenceFeatures> relaxed =
      OccurrenceRelationships.generate(plain, sequenced);
  assertNotNull(relaxed);
  assertTrue(relaxed.justificationContainsAll(SAME_ACCEPTED_SPECIES, IDENTIFIERS_OVERLAP));
}

/** Generates assertions for the comparison of two identifiers only. */
private RelationshipAssertion<OccurrenceFeatures> runCompareIdentifier(String id1, String id2) {
OccurrenceFeatures o1 = OccurrenceFeaturesPojo.builder().catalogNumber(id1).build();
Expand Down

0 comments on commit 509df46

Please sign in to comment.