Make truncation of the 'text' query parameter unicode aware: adds unicode-aware
length() and substring() helpers to helper/unicode.js, switches the 'text'
sanitizer over to them, and adds a unit test for the truncation.
ref: https://github.com/pelias/api/issues/1574

NOTE(review): this patch was restored from a copy whose newlines and
indentation had been collapsed. Blank lines and leading whitespace are
reconstructed (2-space indent assumed) - confirm against the target files, or
apply with whitespace tolerance (e.g. `git apply --ignore-whitespace`).

diff --git a/helper/unicode.js b/helper/unicode.js
index e0fdaab80..7e1d77751 100644
--- a/helper/unicode.js
+++ b/helper/unicode.js
@@ -1,5 +1,6 @@
 const _ = require('lodash');
 const regenerate = require('regenerate');
+const unicodeToArray = require('lodash/_unicodeToArray');
 
 // non-printable control characters
 // ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters
@@ -94,3 +95,31 @@ function normalize(str) {
 }
 
 module.exports.normalize = normalize;
+
+// unicode aware string length function
+// note: ported from 'npm stringz' using 'lodash' internals in place of 'char-regex'
+module.exports.length = (str) => {
+
+  // sanity checking
+  if (!_.isString(str)) { throw new Error('invalid string'); }
+
+  // return count of unicode characters
+  return unicodeToArray(str).length;
+};
+
+// unicode aware substring function
+// note: ported from 'npm stringz' using 'lodash' internals in place of 'char-regex'
+module.exports.substring = (str, begin, end) => {
+
+  // sanity checking
+  if (!_.isString(str)) { throw new Error('invalid string'); }
+
+  // even though negative numbers work here, they're not in the spec
+  if (!_.isFinite(begin) || begin < 0) { begin = 0; }
+  if (_.isFinite(end) && end < 0) { end = 0; }
+
+  const chars = unicodeToArray(str);
+  if (chars.length === 0){ return ''; }
+
+  return chars.slice(begin, end).join('');
+};
diff --git a/sanitizer/_text.js b/sanitizer/_text.js
index f408e32d6..19610dd61 100644
--- a/sanitizer/_text.js
+++ b/sanitizer/_text.js
@@ -21,9 +21,9 @@ function _sanitize( raw, clean ){
   if( !_.isString(text) || _.isEmpty(text) ){
     messages.errors.push(`invalid param 'text': text length, must be >0`);
   } else {
-    if( text.length > MAX_TEXT_LENGTH ){
+    if( unicode.length(text) > MAX_TEXT_LENGTH ){
       messages.warnings.push(`param 'text' truncated to ${MAX_TEXT_LENGTH} characters`);
-      text = text.substring(0, MAX_TEXT_LENGTH);
+      text = unicode.substring(text, 0, MAX_TEXT_LENGTH);
     }
     clean.text = text;
   }
diff --git a/test/unit/sanitizer/_text.js b/test/unit/sanitizer/_text.js
index 83ee981a7..cb55e1208 100644
--- a/test/unit/sanitizer/_text.js
+++ b/test/unit/sanitizer/_text.js
@@ -1,4 +1,5 @@
 const sanitizer = require('../../../sanitizer/_text')();
+const unicode = require('../../../helper/unicode');
 
 module.exports.tests = {};
 
@@ -154,6 +155,19 @@ it again and again until we reach our destination.` };
     t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
     t.end();
   });
+
+  // https://github.com/pelias/api/issues/1574
+  test('truncate should be unicode aware', (t) => {
+    const raw = { text: 'a' + '👩‍❤️‍👩'.repeat(200) };
+    const clean = {};
+    const messages = sanitizer.sanitize(raw, clean);
+
+    t.equals(unicode.length(clean.text), 140);
+    t.equals(clean.text, 'a' + '👩‍❤️‍👩'.repeat(139));
+    t.deepEquals(messages.errors, [], 'no errors');
+    t.deepEquals(messages.warnings, [`param 'text' truncated to 140 characters`]);
+    t.end();
+  });
 };
 
 module.exports.all = (tape, common) => {