Skip to content

Commit

Permalink
Harvester / Simple URL
Browse files Browse the repository at this point in the history
This commit is a squash of geonetwork/core-geonetwork#5942

A simple harvester which takes a URL (expecting, for now, a JSON document),
loops over the documents identified by a JSONPointer and applies an XSL
to convert each of them to ISO format.

This should allow GeoNetwork to harvest some of the open data portals,
which usually provide various search APIs returning JSON responses.

Harvester / Simple URL / Paging and basic opendatasoft support.

Json harvester: fix merge conflicts

jsonHarvester: handle JSONLD format with @ in tag names

jsonHarvester: add ESRI JSONLD DCAT transformation

hack: to remove, extract uuid from URIs

jsonHarvester: extract uuid from identifier

For example, from the identifier
https://data-atmo-hdf.opendata.arcgis.com/datasets/bac17d7d05a34242a8b22c535ecdb13d
the harvester will extract the uuid bac17d7d05a34242a8b22c535ecdb13d
  • Loading branch information
fxprunayre authored and pmauduit committed Feb 3, 2022
1 parent a5e2829 commit 59db5c9
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,15 @@

import java.io.IOException;
import java.io.InputStreamReader;
<<<<<<< HEAD
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
=======
import java.net.URI;
import java.net.URISyntaxException;
>>>>>>> 4918e66602 (Harvester / Simple URL)
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
Expand Down Expand Up @@ -139,6 +144,7 @@ public HarvestResult harvest(Logger log) throws Exception {

nodes.forEach(record -> {
String uuid = this.extractUuidFromIdentifier(record.get(params.recordIdPath).asText());
<<<<<<< HEAD
String apiUrl = params.url.split("\\?")[0];
URL url = null;
try {
Expand All @@ -149,6 +155,10 @@ public HarvestResult harvest(Logger log) throws Exception {
} catch (MalformedURLException e) {
log.warning("Failed to parse Node URL");
}
=======
Element xml = convertRecordToXml(record, uuid);
uuids.put(uuid, xml);
>>>>>>> 4918e66602 (Harvester / Simple URL)
});
aligner.align(uuids, errors);
allUuids.putAll(uuids);
Expand Down Expand Up @@ -201,8 +211,13 @@ protected List<String> buildListOfUrl(SimpleUrlParams params, int numberOfRecord
numberOfRecordsPerPage = Integer.parseInt(pageSizeParamValue);
} else {
log.warning(String.format(
<<<<<<< HEAD
"Page size param '%s' not found or is not a numeric in URL '%s'. Can't build a list of pages.",
params.pageSizeParam, params.url));
=======
"Page size param '%s' not found or is not a numeric in URL '%s'. Can't build a list of pages.",
params.pageSizeParam, params.url));
>>>>>>> 4918e66602 (Harvester / Simple URL)
urlList.add(params.url);
return urlList;
}
Expand All @@ -213,8 +228,13 @@ protected List<String> buildListOfUrl(SimpleUrlParams params, int numberOfRecord
startAtZero = Integer.parseInt(pageFromParamValue) == 0;
} else {
log.warning(String.format(
<<<<<<< HEAD
"Page from param '%s' not found or is not a numeric in URL '%s'. Can't build a list of pages.",
params.pageFromParam, params.url));
=======
"Page from param '%s' not found or is not a numeric in URL '%s'. Can't build a list of pages.",
params.pageFromParam, params.url));
>>>>>>> 4918e66602 (Harvester / Simple URL)
urlList.add(params.url);
return urlList;
}
Expand All @@ -225,17 +245,26 @@ protected List<String> buildListOfUrl(SimpleUrlParams params, int numberOfRecord
for (int i = 0; i < numberOfPages; i++) {
int from = i * numberOfRecordsPerPage + (startAtZero ? 0 : 1);
int size = i == numberOfPages - 1 ? // Last page
<<<<<<< HEAD
numberOfRecordsToHarvest - from + (startAtZero ? 0 : 1) :
numberOfRecordsPerPage;
String url = params.url
.replaceAll(params.pageFromParam + "=[0-9]+", params.pageFromParam + "=" + from)
.replaceAll(params.pageSizeParam + "=[0-9]+", params.pageSizeParam + "=" + size);
=======
numberOfRecordsToHarvest - from + (startAtZero ? 0 : 1) :
numberOfRecordsPerPage;
String url = params.url
.replaceAll(params.pageFromParam + "=[0-9]+", params.pageFromParam + "=" + from)
.replaceAll(params.pageSizeParam + "=[0-9]+", params.pageSizeParam + "=" + size);
>>>>>>> 4918e66602 (Harvester / Simple URL)
urlList.add(url);
}

return urlList;
}

<<<<<<< HEAD
private Element convertRecordToXml(JsonNode record, String uuid, String apiUrl, String nodeUrl) {
ObjectMapper objectMapper = new ObjectMapper();
try {
Expand All @@ -247,6 +276,17 @@ private Element convertRecordToXml(JsonNode record, String uuid, String apiUrl,
recordAsElement.addContent(new Element("uuid").setText(uuid));
recordAsElement.addContent(new Element("apiUrl").setText(apiUrl));
recordAsElement.addContent(new Element("nodeUrl").setText(nodeUrl));
=======
private Element convertRecordToXml(JsonNode record, String uuid) {
ObjectMapper objectMapper = new ObjectMapper();
try {
String recordAsXml = XML.toString(
new JSONObject(
objectMapper.writeValueAsString(record)), "record");
recordAsXml = Xml.stripNonValidXMLCharacters(recordAsXml).replace("<@", "<").replace("</@", "</");
Element recordAsElement = Xml.loadString(recordAsXml, false);
recordAsElement.addContent(new Element("uuid").setText(uuid));
>>>>>>> 4918e66602 (Harvester / Simple URL)
Path importXsl = context.getAppPath().resolve(Geonet.Path.IMPORT_STYLESHEETS);
final Path xslPath = importXsl.resolve(params.toISOConversion + ".xsl");
return Xml.transform(recordAsElement, xslPath);
Expand Down
4 changes: 4 additions & 0 deletions harvesters/src/main/resources/config-spring-geonetwork.xml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@
<bean id="wfsfeatures"
class="org.fao.geonet.kernel.harvest.harvester.wfsfeatures.WfsFeaturesHarvester"
scope="prototype"/>
<bean id="simpleurl"
class="org.fao.geonet.kernel.harvest.harvester.simpleUrl.SimpleUrlHarvester"
scope="prototype"/>

<!-- ArcSDE also requires a ArcSDEConnectionFactory beans -->
<bean id="arcSDEConnectionFactory"
class="org.fao.geonet.kernel.harvest.harvester.arcsde.ArcSDEConnectionFactory" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,11 @@
xmlns:xd="http://www.oxygenxml.com/ns/doc/xsl"
exclude-result-prefixes="#all">

<<<<<<< HEAD
<xsl:import href="protocol-mapping.xsl"></xsl:import>

=======
>>>>>>> 4918e66602 (Harvester / Simple URL)
<xsl:output method="xml" indent="yes"/>
<xsl:strip-space elements="*"/>

Expand Down Expand Up @@ -310,6 +313,7 @@
"Aide Sociale",
"Aide famille"
],....-->
<<<<<<< HEAD
<xsl:if test="keyword">
<mri:descriptiveKeywords>
<mri:MD_Keywords>
Expand Down Expand Up @@ -341,6 +345,23 @@
</mri:MD_Keywords>
</mri:descriptiveKeywords>
</xsl:if>
=======
<mri:descriptiveKeywords>
<mri:MD_Keywords>
<xsl:for-each select="keyword|theme">
<mri:keyword>
<gco:CharacterString>
<xsl:value-of select="name"/>
</gco:CharacterString>
</mri:keyword>
</xsl:for-each>
<mri:type>
<mri:MD_KeywordTypeCode codeListValue="theme"
codeList="./resources/codeList.xml#MD_KeywordTypeCode"/>
</mri:type>
</mri:MD_Keywords>
</mri:descriptiveKeywords>
>>>>>>> 4918e66602 (Harvester / Simple URL)

<!--
license_url: "http://opendatacommons.org/licenses/odbl/",
Expand Down Expand Up @@ -416,7 +437,10 @@
<mrd:transferOptions>
<mrd:MD_DigitalTransferOptions>
<xsl:for-each select="distribution">
<<<<<<< HEAD
<xsl:variable name="format" select="format"/>
=======
>>>>>>> 4918e66602 (Harvester / Simple URL)
<mrd:onLine>
<cit:CI_OnlineResource>
<cit:linkage>
Expand All @@ -426,7 +450,11 @@
</cit:linkage>
<cit:protocol>
<gco:CharacterString>
<<<<<<< HEAD
<xsl:value-of select="$format-protocol-mapping/entry[format=lower-case($format)]/protocol"/>
=======
<xsl:value-of select="mediaType"/>
>>>>>>> 4918e66602 (Harvester / Simple URL)
</gco:CharacterString>
</cit:protocol>
<cit:name>
Expand All @@ -436,14 +464,19 @@
</cit:name>
<cit:description>
<gco:CharacterString>
<<<<<<< HEAD
<xsl:value-of select="$format"/>
=======
<xsl:value-of select="format"/>
>>>>>>> 4918e66602 (Harvester / Simple URL)
</gco:CharacterString>
</cit:description>
</cit:CI_OnlineResource>
</mrd:onLine>
</xsl:for-each>
</mrd:MD_DigitalTransferOptions>
</mrd:transferOptions>
<<<<<<< HEAD
<mrd:transferOptions>
<mrd:MD_DigitalTransferOptions>
<mrd:onLine>
Expand Down Expand Up @@ -471,6 +504,8 @@
</mrd:onLine>
</mrd:MD_DigitalTransferOptions>
</mrd:transferOptions>
=======
>>>>>>> 4918e66602 (Harvester / Simple URL)
</mrd:MD_Distribution>
</mdb:distributionInfo>

Expand Down
1 change: 1 addition & 0 deletions web-ui/src/main/resources/catalog/locales/en-admin.json
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@
"cron-0 15 10 ? * MON-FRI": "Fire at 10:15am every Monday, Tuesday, Wednesday, Thursday and Friday",
"harvesterTimeZoneHelp": "Time in cron expression will be interpreted as in <i>{{timeZoneTransl}} {{zoneOffset}}</i> timezone.",
"csvExport": "Export as CSV",
"simpleurl": "Simple URL",
"duplicatedValueFoundHarvesterName": "An harvester with that name already exists. Choose another one.",
"duplicatedValueFoundUserName": "A user with that name already exists. Choose another one.",
"duplicatedValueFoundUserEmail": "A user with that email already exists. Choose another one.",
Expand Down

0 comments on commit 59db5c9

Please sign in to comment.