Skip to content

Commit

Permalink
Merge branch 'dev' into 622_distribution
Browse files Browse the repository at this point in the history
  • Loading branch information
qifeng-bai committed Dec 16, 2021
2 parents 27436e8 + 0223c9a commit f6f7712
Show file tree
Hide file tree
Showing 97 changed files with 6,598 additions and 592 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import java.io.File;
import java.util.Collections;
import java.util.Date;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
Expand Down Expand Up @@ -66,7 +67,7 @@ public Validation cancel(UUID key) {

/**
 * Marks the validation held by this stub as deleted.
 *
 * @param key validation key; not used by the visible body of this method
 */
@Override
public void delete(UUID key) {
// Record the deletion by stamping the held validation with the current date
validation.setDeleted(new Date());
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
package org.gbif.pipelines.tasks.cleaner;

import static org.gbif.pipelines.estools.common.SettingsType.INDEXING;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;

import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.UUID;
import org.gbif.common.messaging.api.messages.PipelinesCleanerMessage;
import org.gbif.pipelines.estools.EsIndex;
import org.gbif.pipelines.estools.model.IndexParams;
import org.gbif.pipelines.estools.service.EsService;
import org.gbif.pipelines.tasks.ValidationWsClientStub;
import org.gbif.pipelines.tasks.utils.EsServer;
import org.gbif.validator.ws.client.ValidationWsClient;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Test;

/**
 * Integration tests for {@link CleanerCallback}, run against the Elasticsearch instance provided
 * by {@link EsServer}.
 *
 * <p>Each test indexes a single document for a fixed dataset key, lets the callback handle a
 * cleaner message for that dataset and then checks the configured file system paths, the
 * Elasticsearch state and the "deleted" value exposed by the validation client stub.
 */
public class CleanerCallbackIT {

  /** Dataset key shared by all tests in this class. */
  private static final String DATASET_UUID = "8a4934ac-7d7f-41d4-892c-f6b71bb777a3";

  /** Mapping file passed to every index created by these tests. */
  private static final String MAPPINGS_PATH = "mappings/verbatim-mapping.json";

  @ClassRule public static final EsServer ES_SERVER = new EsServer();

  /** Drops every index before each test so the tests do not see each other's data. */
  @Before
  public void cleanIndexes() {
    EsService.deleteAllIndexes(ES_SERVER.getEsClient());
  }

  /**
   * Indexes one document into an index named after the first configured alias and expects the
   * index to contain no documents once the callback has handled the message.
   */
  @Test
  public void cleanerDeleteEsRecordsTest() {

    // State
    CleanerConfiguration config = createConfig();
    PipelinesCleanerMessage message = createMessage(DATASET_UUID);
    ValidationWsClient validationClient = ValidationWsClientStub.create();
    String indexName = config.esAliases[0];

    // Index document
    createIndex(indexName);
    EsService.indexDocument(ES_SERVER.getEsClient(), indexName, 1L, createDocument(DATASET_UUID));
    EsService.refreshIndex(ES_SERVER.getEsClient(), indexName);

    // When
    new CleanerCallback(config, validationClient).handleMessage(message);

    // Update deleted data available
    EsService.refreshIndex(ES_SERVER.getEsClient(), indexName);

    // Should
    assertDatasetFoldersMissing(config, DATASET_UUID);
    assertEquals(0L, EsService.countIndexDocuments(ES_SERVER.getEsClient(), indexName));
    assertNotNull(validationClient.get(UUID.fromString(DATASET_UUID)).getDeleted());
  }

  /**
   * Creates two dataset-specific indexes, indexes one document into the first, calls {@code
   * EsService.swapIndexes} with the configured aliases and both indexes, and expects the first
   * index to be gone once the callback has handled the message.
   */
  @Test
  public void cleanerDeleteEsIndexTest() {

    // State
    CleanerConfiguration config = createConfig();
    PipelinesCleanerMessage message = createMessage(DATASET_UUID);
    ValidationWsClient validationClient = ValidationWsClientStub.create();

    String indexName = DATASET_UUID + "_vld_123123";
    String indexToSwap = DATASET_UUID + "_vld_777777";

    // Index document
    createIndex(indexName);
    createIndex(indexToSwap);

    EsService.indexDocument(ES_SERVER.getEsClient(), indexName, 1L, createDocument(DATASET_UUID));
    EsService.refreshIndex(ES_SERVER.getEsClient(), indexName);
    EsService.swapIndexes(
        ES_SERVER.getEsClient(),
        new HashSet<>(Arrays.asList(config.esAliases)),
        Collections.singleton(indexName),
        Collections.singleton(indexToSwap));

    // When
    new CleanerCallback(config, validationClient).handleMessage(message);

    // Should
    assertDatasetFoldersMissing(config, DATASET_UUID);
    assertFalse(EsService.existsIndex(ES_SERVER.getEsClient(), indexName));
    assertNotNull(validationClient.get(UUID.fromString(DATASET_UUID)).getDeleted());
  }

  /** Creates an index with the given name using the INDEXING settings and the shared mapping. */
  private void createIndex(String indexName) {
    EsIndex.createIndex(
        ES_SERVER.getEsConfig(),
        IndexParams.builder()
            .indexName(indexName)
            .settingsType(INDEXING)
            .pathMappings(Paths.get(MAPPINGS_PATH))
            .build());
  }

  /** Builds the JSON document indexed by the tests, with {@code datasetKey} set to the given id. */
  private String createDocument(String datasetUuid) {
    return "{\"datasetKey\":\""
        + datasetUuid
        + "\",\"maximumElevationInMeters\":2.2,\"issues\":"
        + "[\"GEODETIC_DATUM_ASSUMED_WGS84\",\"LICENSE_MISSING_OR_UNKNOWN\"],\"verbatim\":{\"core\":"
        + "{\"http://rs.tdwg.org/dwc/terms/maximumElevationInMeters\":\"1150\","
        + "\"http://rs.tdwg.org/dwc/terms/organismID\":\"251\",\"http://rs.tdwg.org/dwc/terms/bed\":\"251\"},\"extensions\":"
        + "{\"http://rs.tdwg.org/dwc/terms/MeasurementOrFact\":[{\"http://rs.tdwg.org/dwc/terms/measurementValue\":"
        + "\"1.7\"},{\"http://rs.tdwg.org/dwc/terms/measurementValue\":\"5.0\"},"
        + "{\"http://rs.tdwg.org/dwc/terms/measurementValue\":\"5.83\"}]}}}";
  }

  /** Asserts that no dataset folder exists under the configured fs and hdfs root paths. */
  private void assertDatasetFoldersMissing(CleanerConfiguration config, String datasetUuid) {
    assertFalse(Files.exists(Paths.get(String.join("/", config.fsRootPath, datasetUuid))));
    assertFalse(Files.exists(Paths.get(String.join("/", config.hdfsRootPath, datasetUuid))));
  }

  /** Builds a validator cleaner message for attempt 1 of the given dataset. */
  private PipelinesCleanerMessage createMessage(String datasetUuid) {
    PipelinesCleanerMessage message = new PipelinesCleanerMessage();
    message.setDatasetUuid(UUID.fromString(datasetUuid));
    message.setAttempt(1);
    message.setValidator(true);
    return message;
  }

  /** Builds a configuration pointing at the test ES server and the cleaner test resources. */
  private CleanerConfiguration createConfig() {
    CleanerConfiguration config = new CleanerConfiguration();
    // ES
    config.esHosts = ES_SERVER.getEsConfig().getRawHosts();
    config.esAliases = new String[] {"validator"};
    // Root folders are resolved from the test resources on the classpath
    config.fsRootPath = getClass().getResource("/cleaner/fs").getPath();
    config.hdfsRootPath = getClass().getResource("/cleaner/hdfs").getPath();

    // Step config
    config.stepConfig.coreSiteConfig = "";
    config.stepConfig.hdfsSiteConfig = "";
    return config;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,58 @@ public void testFailedValidatorCase() {
assertTrue(publisher.getMessages().isEmpty());
}

/**
 * Handles an archive validator message for dataset {@code b578802e-f1ca-4e5b-acf8-4d45306e6b48}
 * and expects the occurrence file in the resulting metrics to carry issues, no ZooKeeper nodes to
 * exist for the crawl and no message to be published.
 */
@Test
public void testFailedMissedFilesCase() {
  // State
  ArchiveValidatorConfiguration validatorConfig = new ArchiveValidatorConfiguration();
  validatorConfig.archiveRepository = getClass().getResource(INPUT_DATASET_FOLDER).getFile();
  validatorConfig.stepConfig.repositoryPath = getClass().getResource("/dataset/").getFile();

  ValidationWsClientStub wsClientStub = ValidationWsClientStub.create();

  ArchiveValidatorCallback validatorCallback =
      new ArchiveValidatorCallback(
          validatorConfig,
          publisher,
          curator,
          historyClient,
          wsClientStub,
          new SchemaValidatorFactory());

  UUID datasetKey = UUID.fromString("b578802e-f1ca-4e5b-acf8-4d45306e6b48");
  String crawlId = datasetKey.toString();

  // Attempt number is 1 for this message
  PipelinesArchiveValidatorMessage validatorMessage =
      new PipelinesArchiveValidatorMessage(
          datasetKey,
          1,
          Collections.singleton(VALIDATOR_VALIDATE_ARCHIVE.name()),
          EXECUTION_ID,
          true,
          FileFormat.DWCA.name());

  // When
  validatorCallback.handleMessage(validatorMessage);

  // Should: the first file info whose row type is the occurrence term must report issues
  Optional<FileInfo> occurrenceInfo =
      wsClientStub.getValidation().getMetrics().getFileInfos().stream()
          .filter(
              info ->
                  info.getRowType() != null
                      && info.getRowType().equals(DwcTerm.Occurrence.qualifiedName()))
          .findFirst();

  assertTrue(occurrenceInfo.isPresent());
  assertFalse(occurrenceInfo.get().getIssues().isEmpty());

  // Should: no ZooKeeper nodes for this crawl and nothing published
  assertFalse(checkExists(curator, crawlId, LABEL));
  assertFalse(checkExists(curator, crawlId, Fn.ERROR_MESSAGE.apply(LABEL)));
  assertFalse(checkExists(curator, crawlId, Fn.MQ_CLASS_NAME.apply(LABEL)));
  assertFalse(checkExists(curator, crawlId, Fn.MQ_MESSAGE.apply(LABEL)));
  assertTrue(publisher.getMessages().isEmpty());
}

/**
 * Checks through {@code ZookeeperUtils} whether the pipelines info path built from the given id
 * and path exists.
 */
private boolean checkExists(CuratorFramework curator, String id, String path) {
  final String infoPath = getPipelinesInfoPath(id, path);
  return ZookeeperUtils.checkExists(curator, infoPath);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
<?xml version="1.0" encoding="utf-8"?>
<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.1/eml.xsd"
packageId="8575f23e-f762-11e1-a439-00145eb45e9a" system="http://gbif.org" scope="system"
xml:lang="en">

<dataset>
<alternateIdentifier>doi:10.15468/igasai</alternateIdentifier>
<alternateIdentifier>1099</alternateIdentifier>
<title>PonTaurus collection</title>
<associatedParty>
<address>
</address>
<electronicMailAddress>[email protected]</electronicMailAddress> <role>TECHNICAL_POINT_OF_CONTACT</role>
</associatedParty>
<language>en</language>
<abstract>
<para>Plant specimens gathered in the Toroslar mountain range of southern Turkey and the Pontic mountain range in north eastern Turkey in 1999. The collection mainly covers grass vegetation plots of the subalpine level. It was collected together with many more observation records for vegetational studies applying phytosociological analysis. The resulting thesis was released in the public domain and is available at &lt;a href=&quot;http://www.archive.org/details/VegetationskundlicheUntersuchungenInDerHochgebirgsregionDerBolkar&quot;&gt;http://www.archive.org/details/VegetationskundlicheUntersuchungenInDerHochgebirgsregionDerBolkar&lt;/a&gt;. Specimens have been deposited at the Berlin Botanical Garden Herbarium (B) with duplicates sent to the Istanbul herbarium and the private collection of Gerald Parolly who supervised this work.</para>
</abstract>
<intellectualRights>
<para>This work is licensed under a <ulink url="http://creativecommons.org/licenses/by/4.0/legalcode"><citetitle>Creative Commons Attribution (CC-BY) 4.0 License</citetitle></ulink>.</para>
</intellectualRights>
<contact>
<address>
</address>
<electronicMailAddress>[email protected]</electronicMailAddress> </contact>

</dataset>

<additionalMetadata>
<metadata>
<gbif>
<dateStamp>2017-10-09T13:31:37Z</dateStamp>
<citation>Botanic Garden and Botanical Museum Berlin-Dahlem. PonTaurus collection. Occurrence Dataset https://doi.org/10.15468/igasai accessed via GBIF.org on 2017-10-09.</citation>
</gbif>
</metadata>
</additionalMetadata>

</eml:eml>
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?xml version="1.0" encoding="utf-8"?>
<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.1/eml.xsd"
packageId="10.15468/dl.a2ggjq" system="http://gbif.org" scope="system"
xml:lang="en">

<dataset>
<alternateIdentifier>0002149-171002173027117</alternateIdentifier>
<title>GBIF Occurrence Download 10.15468/dl.a2ggjq</title>
<creator>
<individualName>
<surName>GBIF Download Service</surName>
</individualName>
<electronicMailAddress>[email protected]</electronicMailAddress> </creator>
<metadataProvider>
<individualName>
<surName>GBIF Download Service</surName>
</individualName>
<electronicMailAddress>[email protected]</electronicMailAddress> </metadataProvider>
<pubDate>
2017-10-09
</pubDate>
<language>ENGLISH</language>
<abstract>
<para>A dataset containing all occurrences available in GBIF matching the query:
DatasetKey: PonTaurus collection
The dataset includes records from the following constituent datasets. The full metadata for each constituent is also included in this archive:
1534 records from PonTaurus collection
</para>
</abstract>
<contact>
<individualName>
<surName>GBIF Download Service</surName>
</individualName>
<electronicMailAddress>[email protected]</electronicMailAddress> </contact>

</dataset>

<additionalMetadata>
<metadata>
<gbif>
<dateStamp>2017-10-09T13:31:37Z</dateStamp>
<citation identifier="10.15468/dl.a2ggjq">GBIF Occurrence Download 10.15468/dl.a2ggjq</citation>
<physical>
<objectName></objectName>
<characterEncoding>UTF-8</characterEncoding>
<dataFormat>
<externallyDefinedFormat>
<formatName>Darwin Core Archive</formatName>
</externallyDefinedFormat>
</dataFormat>
<distribution>
<online>
<url function="download">http://api.gbif.org/v1/occurrence/download/request/0002149-171002173027117.zip</url>
</online>
</distribution>
</physical>
</gbif>
</metadata>
</additionalMetadata>

</eml:eml>
Loading

0 comments on commit f6f7712

Please sign in to comment.