From 139f1c9bb303f0d3c00d5bfa49f169625b038ff9 Mon Sep 17 00:00:00 2001 From: planemad Date: Sun, 5 Jul 2020 02:13:10 -0400 Subject: [PATCH 1/4] Add script to join wikidata translations --- data/join-wikidata-labels.js | 220 +++++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 data/join-wikidata-labels.js diff --git a/data/join-wikidata-labels.js b/data/join-wikidata-labels.js new file mode 100644 index 0000000..86502ef --- /dev/null +++ b/data/join-wikidata-labels.js @@ -0,0 +1,220 @@ +'use strict'; + +// OpenStreetMap India +// https://wiki.openstreetmap.org/wiki/Map_internationalization_(India) +// Tileset +// 'https://b.tiles.mapbox.com/v4/planemad.6mk61jbn/{z}/{x}/{y}.vector.pbf?access_token=pk.eyJ1IjoicGxhbmVtYWQiLCJhIjoiemdYSVVLRSJ9.g3lbg_eN0kztmsfIPxa9MQ' + +// +// Add localised labels from Wikidata to the GeoJSON data +// +// 1. Export OSM data as a GeoJSON. +// + +const fs = require('fs'); +const fetch = require('node-fetch'); +const _ = require('lodash'); + +// const WDK = require('wikibase-sdk'); + +// let wdk = WDK({ +// instance: 'https://www.wikidata.org', +// sparqlEndpoint: 'https://query.wikidata.org/sparql' +// }) + +// let geojson = fs.readFileSync('places.geojson'); +let geojson = fs.readFileSync('places2.geojson'); +let features = JSON.parse(geojson).features; + + +let languages = ["as", "bn", "brx", "doi", "en", "gu", "hi", "kn", "ks", "gom", "mai", "ml", "mni", "mr", "ne", "or", "pa", "sa", "sat", "sd", "ta", "te", "ur"]; +let props = ["place", "wikidata", "name"] +languages.forEach(item=> props.push("name:"+item)) + +let qidField = 'wikidata' +let preferWikidataLabel = true +let requestChunkSize = 40 + + +let joinWikidataLabels = function (features, languages) { + + // Filter features to only include those with a qid + + features = features.filter(item=> + item.properties.hasOwnProperty(qidField) + ) + + // Drop all fields except names and wikidata id + + features.map(item=>{ + 
Object.keys(item.properties).forEach(prop=>{ + + if (!prop.startsWith("name") && ['place','wikidata','@id'].indexOf(prop)==-1){ + delete item.properties[prop] + } + }) + return item + }) + + // Build a list of qids that need to be queried on Wikidata + + let qids = [] + + features.forEach(item => { + if (item.properties.hasOwnProperty(qidField)) { + qids.push(item.properties[qidField]); + } + }); + + // Split list of qids into smaller chunks + // Get localised labels from Wikidata using the qid + + let requests = [] + + while (qids.length) { + let qidsPage = qids.splice(0, requestChunkSize); + requests.push(translateQids(qidsPage, languages)) + } + + // Collect all the Wikidata results + + Promise.all(requests).then(data => { + + data = Object.assign({}, ...data) + + // console.log(data) + + // Join properties from Wikidata + // Add translation if it exists + + features.map(item => { + + languages.forEach(langCode => { + + // Check if feature has a qid and a translation is available + // Additionally check if the feature already has an existing translation for that language + + if ( + item.properties.hasOwnProperty(qidField) && + data[item.properties[qidField]].hasOwnProperty(langCode) + ) { + if (preferWikidataLabel && !item.properties.hasOwnProperty('name:' + langCode)) + item.properties['name:' + langCode] = data[item.properties[qidField]][langCode]; + } else { + // item.properties['name:' + langCode] = null; + } + + }) + + item.properties = _.pick(item.properties, props); + + }); + + let geoJSON = { + "type": "FeatureCollection", + "features": features + } + // console.log(JSON.stringify(geoJSON)) + console.log(features) + return features; + + }); + +} + + +let translateQids = function (qids, languages) { + let sparql = ` + # Test query at https://w.wiki/QQX + SELECT ?item (REPLACE(STR(?item),STR(wd:),"") as ?qid) + ${languages.map(lang => '?itemLabel_' + lang).join(" ")} + { + VALUES ?item { ${qids.map(el => 'wd:' + el).join(" ")} } # Qid list + ${languages + 
.map( + lang => + `OPTIONAL { ?item rdfs:label ?itemLabel_${lang}. FILTER(LANG(?itemLabel_${lang})="${lang}")}` + ) + .join(" ")} + } + `; + + + + let result = queryWikidata(sparql) + .then(result => + + result.map(item => ({ + [item.qid]: (function () { + var obj = {}; + languages.forEach(lang => { + // Add the label for each language. + // Run the label through the cleaning filter before using + if (item["itemLabel_" + lang]) { + + let label = item["itemLabel_" + lang] + obj[lang] = cleanLabel(label) + + } + }); + return obj; + })() + })) + ) + .then(mapped => Object.assign({}, ...mapped)) + .catch((error) => { + console.log(error) + }); + + return result; +} + +// +// Fetch JSON results from Wikidata using a SPARQL query +// +let queryWikidata = function (sparql) { + + let sparqlEndpoint = 'https://query.wikidata.org/sparql' + let prettyResult = true + + var resultPromise = fetch( + `${sparqlEndpoint}?query=${encodeURIComponent( + sparql + )}&format=json`, { + headers: { + accept: "application/sparql-results+json" + } + } + ) + .then(response => response.json()) + .then(json => { + + // Remap result values into a pretty object + let result = json.results.bindings; + + if (prettyResult) { + result.forEach((row, i) => + Object.keys(row).forEach(k => { + result[i][k] = row[k].value; + }) + ); + } + + return result; + }); + + return resultPromise; +} + +let result = joinWikidataLabels(features, + languages +) + +// Wikidata label cleaning filter +function cleanLabel(label){ + + // Remove text after comma. This removes common qualifier text eg. 
`Bhopal, Madhya Pradesh' -> 'Bhopal' + label = label.split(',')[0]; + + return label; +} From b845ac13d399907384dcf76553088711666a078e Mon Sep 17 00:00:00 2001 From: planemad Date: Mon, 6 Jul 2020 01:22:41 -0400 Subject: [PATCH 2/4] join-wikidata-labels.js Improve querying by paging --- data/package.json | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 data/package.json diff --git a/data/package.json b/data/package.json new file mode 100644 index 0000000..6d83497 --- /dev/null +++ b/data/package.json @@ -0,0 +1,18 @@ +{ + "name": "join-wikidata-labels", + "version": "1.0.0", + "description": "Join localised labels from Wikidata to a GeoJSON", + "main": "join-wikidata-labels.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "author": "Arun Ganesh", + "license": "ISC", + "dependencies": { + "bottleneck": "^2.19.5", + "commander": "^5.1.0", + "fetch-retry": "^4.0.1", + "lodash": "^4.17.15", + "node-fetch": "^2.6.0" + } +} From e18a017d15eeee2146e5011f6beebe5de7626fbe Mon Sep 17 00:00:00 2001 From: planemad Date: Mon, 6 Jul 2020 01:23:27 -0400 Subject: [PATCH 3/4] Improve querying by paging --- data/join-wikidata-labels.js | 121 +++++++++++++++++++++++------------ 1 file changed, 81 insertions(+), 40 deletions(-) diff --git a/data/join-wikidata-labels.js b/data/join-wikidata-labels.js index 86502ef..647a1d2 100644 --- a/data/join-wikidata-labels.js +++ b/data/join-wikidata-labels.js @@ -8,12 +8,29 @@ // // Add localised labels from Wikidata to the GeoJSON data // -// 1. Export OSM data as a GeoJSON. +// 1. Export OSM data as export.geojson +// - world-places http://overpass-turbo.eu/s/VPm +// - india-urban-places http://overpass-turbo.eu/s/VPn +// - india-rural-places http://overpass-turbo.eu/s/VPn +// 2. Run `./join-wikidata-labels.js` +// 3. 
`tippecanoe -zg -o indic-places.mbtiles --drop-densest-as-needed --extend-zooms-if-still-dropping world-places-translated.geojson india-urban-places-translated.geojson india-rural-places-translated.geojson` // const fs = require('fs'); -const fetch = require('node-fetch'); + +const nodeFetch = require('node-fetch'); +var fetch = require('fetch-retry')(nodeFetch, { + retries: 5, + retryDelay: 500 +}); + const _ = require('lodash'); +const Bottleneck = require("bottleneck/es5"); + +const limiter = new Bottleneck({ + minTime: 50, + maxConcurrent: 5 +}); // const WDK = require('wikibase-sdk'); @@ -23,33 +40,37 @@ const _ = require('lodash'); // }) // let geojson = fs.readFileSync('places.geojson'); -let geojson = fs.readFileSync('places2.geojson'); -let features = JSON.parse(geojson).features; + +const inputFileName = 'india-rural-places.geojson' +const outputFileName = inputFileName.split('.')[0] + '-translated.geojson' + +const features = JSON.parse(fs.readFileSync(inputFileName)).features; -let languages = ["as", "bn", "brx", "doi", "en", "gu", "hi", "kn", "ks", "gom", "mai", "ml", "mni", "mr", "ne", "or", "pa", "sa", "sat", "sd", "ta", "te", "ur"]; -let props = ["place", "wikidata", "name"] -languages.forEach(item=> props.push("name:"+item)) +const languages = ["as", "bn", "brx", "doi", "en", "gu", "hi", "kn", "ks", "gom", "mai", "ml", "mni", "mr", "ne", "or", "pa", "sa", "sat", "sd", "ta", "te", "ur"]; -let qidField = 'wikidata' -let preferWikidataLabel = true -let requestChunkSize = 40 +let filterProps = ["place", "wikidata", "name"] +languages.forEach(item => filterProps.push("name:" + item)) + +const qidField = 'wikidata' +const preferWikidataLabel = true +const requestChunkSize = 250 let joinWikidataLabels = function (features, languages) { // Filter features to only include those with a qid - features = features.filter(item=> - item.properties.hasOwnProperty(qidField) + features = features.filter(item => + item.properties.hasOwnProperty(qidField) ) - + // 
Drop all fields except names and wikidata id - features.map(item=>{ - Object.keys(item.properties).forEach(prop=>{ - - if (!prop.startsWith("name") && ['place','wikidata','@id'].indexOf(prop)==-1){ + features.map(item => { + Object.keys(item.properties).forEach(prop => { + + if (!prop.startsWith("name") && ['place', 'wikidata', '@id'].indexOf(prop) == -1) { delete item.properties[prop] } }) @@ -76,13 +97,15 @@ let joinWikidataLabels = function (features, languages) { requests.push(translateQids(qidsPage, languages)) } + console.log(`Fetching Wikidata translations from ${requests.length} API requests`) + // Collect all the Wikidata results Promise.all(requests).then(data => { - data = Object.assign({}, ...data) + console.log(`Processing results`) - // console.log(data) + data = Object.assign({}, ...data) // Join properties from Wikidata // Add translation if it exists @@ -106,25 +129,32 @@ let joinWikidataLabels = function (features, languages) { }) - item.properties = _.pick(item.properties, props); + item.properties = _.pick(item.properties, filterProps); }); let geoJSON = { "type": "FeatureCollection", "features": features - } - // console.log(JSON.stringify(geoJSON)) - console.log(features) - return features; + } + + fs.writeFileSync(outputFileName, JSON.stringify(geoJSON)); }); } +// +// Get the translated labels for the given Wikidata QIDs in the requested languages +// + + let translateQids = function (qids, languages) { - let sparql = ` + + // Build a query for the Wikidata SPARQL endpoint + + let sparqlQuery = ` # Test query at https://w.wiki/QQX SELECT ?item (REPLACE(STR(?item),STR(wd:),"") as ?qid) ${languages.map(lang => '?itemLabel_' + lang).join(" ")} @@ -139,15 +169,15 @@ let translateQids = function (qids, languages) { } `; + // Formats the result into a dictionary for easy use - - let result = queryWikidata(sparql) + let result = queryWikidata(sparqlQuery) .then(result => - result.map(item => ({ [item.qid]: (function () { var obj = {}; 
languages.forEach(lang => { + // Add the label for each language. // Run the label through the cleaning filter before using if (item["itemLabel_" + lang]) { @@ -160,11 +190,11 @@ let translateQids = function (qids, languages) { return obj; })() })) - ) + ) .then(mapped => Object.assign({}, ...mapped)) .catch((error) => { console.log(error) - }); + }); return result; } @@ -172,24 +202,28 @@ let translateQids = function (qids, languages) { // // Fetch JSON results from Wikidata using a SPARQL query // + let queryWikidata = function (sparql) { let sparqlEndpoint = 'https://query.wikidata.org/sparql' let prettyResult = true - var resultPromise = fetch( - `${sparqlEndpoint}?query=${encodeURIComponent( + var resultPromise = limiter.schedule(() => + fetch( + `${sparqlEndpoint}?query=${encodeURIComponent( sparql )}&format=json`, { - headers: { - accept: "application/sparql-results+json" + headers: { + accept: "application/sparql-results+json", + "user-agent": "join-osm-wikidata-label/0.1 (https://github.com/osm-in/indic-map/issues/1; arun.planemad@gmail.com) node-fetch" // See https://meta.wikimedia.org/wiki/User-Agent_policy + } } - } - ) + )) .then(response => response.json()) .then(json => { // Remap result values into a pretty object + let result = json.results.bindings; if (prettyResult) { @@ -201,20 +235,27 @@ let queryWikidata = function (sparql) { } return result; + }) + .catch(function (error) { + console.log("Network Error", error); }); return resultPromise; } -let result = joinWikidataLabels(features, - languages -) + // Wikidata label cleaning filter -function cleanLabel(label){ +function cleanLabel(label) { // Remove text after comma. This removes common qualifier text eg. 
`Bhopal, Madhya Pradesh' -> 'Bhopal' label = label.split(',')[0]; return label; } + + +let result = joinWikidataLabels(features, + languages +) + From 8922ef1a13207a7bb2cdea039772ab44936e4fca Mon Sep 17 00:00:00 2001 From: Arun Ganesh Date: Mon, 6 Jul 2020 01:48:03 -0400 Subject: [PATCH 4/4] Use OSM India token --- data/join-wikidata-labels.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/data/join-wikidata-labels.js b/data/join-wikidata-labels.js index 647a1d2..9543cb7 100644 --- a/data/join-wikidata-labels.js +++ b/data/join-wikidata-labels.js @@ -3,7 +3,8 @@ // OpenStreetMap India // https://wiki.openstreetmap.org/wiki/Map_internationalization_(India) // Tileset -// 'https://b.tiles.mapbox.com/v4/planemad.6mk61jbn/{z}/{x}/{y}.vector.pbf?access_token=pk.eyJ1IjoicGxhbmVtYWQiLCJhIjoiemdYSVVLRSJ9.g3lbg_eN0kztmsfIPxa9MQ' +// 'https://b.tiles.mapbox.com/v4/planemad.6mk61jbn/{z}/{x}/{y}.vector.pbf?access_token=pk.eyJ1Ijoib3NtLWluIiwiYSI6ImNqcnVxMTNrNTJwbHc0M250anUyOW81MjgifQ.cZnvZEyWT5AzNeO3ajg5tg' + // // Add localised labels from Wikidata to the GeoJSON data