From 509df46af75bec5dbd8a6761e87a5e063e7206dc Mon Sep 17 00:00:00 2001
From: Tim Robertson
Date: Tue, 21 Jun 2022 17:29:30 +0200
Subject: [PATCH] #733 Improve NCBI clustering (#746)

---
Review notes (text in this section is not part of the commit message):
 * RowOccurrenceFeatures#isFromSequenceRepository no longer throws on a null dataset key.
 * Raw List declarations in added lines are parameterised.
 * The "3 decimal place roundings" comment on the 200m rule is retained.
 * The index hashes below predate these adjustments.

 .../clustering/RowOccurrenceFeatures.java     | 17 +++++++
 .../clustering/OccurrenceFeatures.java        | 12 +++++
 .../clustering/OccurrenceRelationships.java   | 49 ++++++++++++++-----
 .../clustering/OccurrenceFeaturesPojo.java    |  7 +++
 .../OccurrenceRelationshipsTest.java          | 25 ++++++++++
 5 files changed, 99 insertions(+), 11 deletions(-)

diff --git a/gbif/pipelines/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/RowOccurrenceFeatures.java b/gbif/pipelines/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/RowOccurrenceFeatures.java
index caa9d00116..861408645f 100644
--- a/gbif/pipelines/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/RowOccurrenceFeatures.java
+++ b/gbif/pipelines/clustering-gbif/src/main/java/org/gbif/pipelines/clustering/RowOccurrenceFeatures.java
@@ -23,6 +23,16 @@
  * of a SQL select a.*,b.* from a join b).
  */
 public class RowOccurrenceFeatures implements OccurrenceFeatures {
+
+  // Dataset keys are considered reliable over time
+  private static final List<String> SEQUENCE_REPOSITORY_KEYS =
+      Arrays.asList(
+          "d8cd16ba-bb74-4420-821e-083f2bac17c2", // INSDC sequences
+          "393b8c26-e4e0-4dd0-a218-93fc074ebf4e", // INSDC host organisms
+          "583d91fe-bbc0-4b4a-afe1-801f88263016", // INSDC environmental samples
+          "040c5662-da76-4782-a48e-cdea1892d14c" // iBOL
+          );
+
   private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 
   static {
@@ -228,6 +238,13 @@ public String getCollectionCode() {
     return get("collectionCode");
   }
 
+  /** Null-safe, case-insensitive check of the dataset key against the known repositories. */
+  @Override
+  public boolean isFromSequenceRepository() {
+    String datasetKey = getDatasetKey();
+    return SEQUENCE_REPOSITORY_KEYS.stream().anyMatch(key -> key.equalsIgnoreCase(datasetKey));
+  }
+
   List listOrNull(String field) {
     Object o = get(field);
     // what follows exists only to simply testing (List) and Spark using Hive (Seq) integrations
diff --git a/sdks/core/src/main/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceFeatures.java b/sdks/core/src/main/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceFeatures.java
index fa01949899..aa4391f3ce 100644
--- a/sdks/core/src/main/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceFeatures.java
+++ b/sdks/core/src/main/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceFeatures.java
@@ -91,4 +91,16 @@ default String getScopedIdentifier() {
       return null;
     }
   }
+
+  /**
+   * Allows implementations to declare that the record originates from a sequence repository.
+   * Default behaviour is false, meaning that consumers may receive false negatives. This hook was
+   * introduced to allow a relaxation of the rules to accommodate the sparse metadata seen in
+   * repositories like NCBI.
+   *
+   * @see <a href="https://github.com/gbif/pipelines/issues/733">Pipelines issue 733</a>
+   */
+  default boolean isFromSequenceRepository() {
+    return false;
+  }
 }
diff --git a/sdks/core/src/main/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceRelationships.java b/sdks/core/src/main/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceRelationships.java
index 279ceacaaf..1fdf36d106 100644
--- a/sdks/core/src/main/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceRelationships.java
+++ b/sdks/core/src/main/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceRelationships.java
@@ -5,6 +5,7 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import java.time.LocalDate;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Objects;
@@ -46,17 +47,43 @@ public static RelationshipAssertion generate(T
     }
 
     // fact combinations that are of interest as assertions
-    FeatureAssertion[][] passConditions = {
-      {SAME_ACCEPTED_SPECIES, SAME_COORDINATES, SAME_DATE},
-      {SAME_ACCEPTED_SPECIES, WITHIN_200m, SAME_DATE}, // accommodate 3 decimal place roundings
-      {SAME_ACCEPTED_SPECIES, SAME_COORDINATES, NON_CONFLICTING_DATE, IDENTIFIERS_OVERLAP},
-      {SAME_ACCEPTED_SPECIES, WITHIN_200m, NON_CONFLICTING_DATE, IDENTIFIERS_OVERLAP},
-      {SAME_ACCEPTED_SPECIES, WITHIN_2Km, SAME_DATE, IDENTIFIERS_OVERLAP},
-      {SAME_ACCEPTED_SPECIES, WITHIN_2Km, NON_CONFLICTING_DATE, IDENTIFIERS_OVERLAP},
-      {SAME_ACCEPTED_SPECIES, NON_CONFLICTING_COORDINATES, SAME_DATE, IDENTIFIERS_OVERLAP},
-      {SAME_ACCEPTED_SPECIES, SAME_COORDINATES, APPROXIMATE_DATE, SAME_RECORDER_NAME},
-      {SAME_ACCEPTED_SPECIES, WITHIN_2Km, APPROXIMATE_DATE, SAME_RECORDER_NAME},
-    };
+    List<FeatureAssertion[]> passConditions =
+        new ArrayList<>(
+            Arrays.asList(
+                new FeatureAssertion[][] {
+                  {SAME_ACCEPTED_SPECIES, SAME_COORDINATES, SAME_DATE},
+                  // accommodate 3 decimal place roundings
+                  {SAME_ACCEPTED_SPECIES, WITHIN_200m, SAME_DATE},
+                  {
+                    SAME_ACCEPTED_SPECIES,
+                    SAME_COORDINATES,
+                    NON_CONFLICTING_DATE,
+                    IDENTIFIERS_OVERLAP
+                  },
+                  {SAME_ACCEPTED_SPECIES, WITHIN_200m, NON_CONFLICTING_DATE, IDENTIFIERS_OVERLAP},
+                  {SAME_ACCEPTED_SPECIES, WITHIN_2Km, SAME_DATE, IDENTIFIERS_OVERLAP},
+                  {SAME_ACCEPTED_SPECIES, WITHIN_2Km, NON_CONFLICTING_DATE, IDENTIFIERS_OVERLAP},
+                  {
+                    SAME_ACCEPTED_SPECIES,
+                    NON_CONFLICTING_COORDINATES,
+                    SAME_DATE,
+                    IDENTIFIERS_OVERLAP
+                  },
+                  {SAME_ACCEPTED_SPECIES, SAME_COORDINATES, APPROXIMATE_DATE, SAME_RECORDER_NAME},
+                  {SAME_ACCEPTED_SPECIES, WITHIN_2Km, APPROXIMATE_DATE, SAME_RECORDER_NAME},
+                }));
+
+    // Accommodate sparse data from sequence repositories
+    // see https://github.com/gbif/pipelines/issues/733
+    if (o1.isFromSequenceRepository() || o2.isFromSequenceRepository()) {
+      passConditions.add(
+          new FeatureAssertion[] {
+            SAME_ACCEPTED_SPECIES,
+            NON_CONFLICTING_COORDINATES,
+            NON_CONFLICTING_DATE,
+            IDENTIFIERS_OVERLAP
+          });
+    }
 
     // always exclude things on different location or date
     if (assertion.justificationDoesNotContain(DIFFERENT_DATE, DIFFERENT_COUNTRY)) {
diff --git a/sdks/core/src/test/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceFeaturesPojo.java b/sdks/core/src/test/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceFeaturesPojo.java
index 6dc370235e..cbdab2ae3d 100644
--- a/sdks/core/src/test/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceFeaturesPojo.java
+++ b/sdks/core/src/test/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceFeaturesPojo.java
@@ -29,6 +29,8 @@ public class OccurrenceFeaturesPojo implements OccurrenceFeatures {
   private final String institutionCode;
   private final String collectionCode;
 
+  private final boolean isFromSequenceRepository;
+
   @Override
   public String getId() {
     return id;
@@ -138,4 +140,9 @@ public String getInstitutionCode() {
   public String getCollectionCode() {
     return collectionCode;
   }
+
+  @Override
+  public boolean isFromSequenceRepository() {
+    return isFromSequenceRepository;
+  }
 }
diff --git a/sdks/core/src/test/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceRelationshipsTest.java b/sdks/core/src/test/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceRelationshipsTest.java
index 5412f0dd3b..46590363dd 100644
--- a/sdks/core/src/test/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceRelationshipsTest.java
+++ b/sdks/core/src/test/java/org/gbif/pipelines/core/parsers/clustering/OccurrenceRelationshipsTest.java
@@ -307,6 +307,31 @@ public void testCompareOmittedIdentifiers() {
     assertFalse(runCompareIdentifier("s.n.", "S/N").justificationContains(IDENTIFIERS_OVERLAP));
   }
 
+  /** Test relaxed rules when record originates from a sequence repository */
+  @Test
+  public void testSequenceRepositories() {
+    OccurrenceFeatures o1 =
+        OccurrenceFeaturesPojo.builder().id("1").speciesKey("212").catalogNumber("ABC").build();
+
+    OccurrenceFeatures o2 =
+        OccurrenceFeaturesPojo.builder().id("2").speciesKey("212").catalogNumber("ABC").build();
+
+    OccurrenceFeatures o3 =
+        OccurrenceFeaturesPojo.builder()
+            .id("3")
+            .datasetKey("2")
+            .speciesKey("212")
+            .catalogNumber("ABC")
+            .isFromSequenceRepository(true) // should relax rules
+            .build();
+
+    RelationshipAssertion assertion = OccurrenceRelationships.generate(o1, o2);
+    assertNull(assertion);
+    assertion = OccurrenceRelationships.generate(o1, o3);
+    assertNotNull(assertion);
+    assertTrue(assertion.justificationContainsAll(SAME_ACCEPTED_SPECIES, IDENTIFIERS_OVERLAP));
+  }
+
   /** Generates assertions for the comparison of two identifiers only. */
   private RelationshipAssertion runCompareIdentifier(String id1, String id2) {
     OccurrenceFeatures o1 = OccurrenceFeaturesPojo.builder().catalogNumber(id1).build();