From 7a166e6656bdefff81902dce1583d7fd53d7a82c Mon Sep 17 00:00:00 2001 From: "cq.longhaibin" Date: Mon, 11 Jun 2018 16:14:07 +0800 Subject: [PATCH] suport chinese. 1. test Signed-off-by: cq.longhaibin --- lunr.zh-cn.js => lunr.zhcn.js | 27 ++++++++++++------------ test/VersionsAndLanguagesTest.js | 3 ++- test/testdata/zhcn.js | 36 ++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 15 deletions(-) rename lunr.zh-cn.js => lunr.zhcn.js (84%) create mode 100644 test/testdata/zhcn.js diff --git a/lunr.zh-cn.js b/lunr.zhcn.js similarity index 84% rename from lunr.zh-cn.js rename to lunr.zhcn.js index dd0514c..f4876c9 100644 --- a/lunr.zh-cn.js +++ b/lunr.zhcn.js @@ -23,7 +23,7 @@ * This example returns an object, but the module * can return a function as the exported value. */ - return function(lunr) { + return function(lunr) { /* throw error if lunr is not yet included */ if ('undefined' === typeof lunr) { throw new Error('Lunr is not present. Please include / require Lunr before this script.'); @@ -41,44 +41,43 @@ var isLunr2 = lunr.version[0] == "2"; /* register specific locale function */ - lunr.zhch = function() { + lunr.zhcn = function() { this.pipeline.reset(); this.pipeline.add( - /*lunr.th.stopWordFilter,*/ - lunr.zhch.trimmer + lunr.zhcn.trimmer ); if (isLunr2) { // for lunr version 2.0.0 - this.tokenizer = lunr.zhch.tokenizer; + this.tokenizer = lunr.zhcn.tokenizer; } else { if (lunr.tokenizer) { // for lunr version 0.6.0 - lunr.tokenizer = lunr.zhch.tokenizer; + lunr.tokenizer = lunr.zhcn.tokenizer; } if (this.tokenizerFn) { // for lunr version 0.7.0 -> 1.0.0 - this.tokenizerFn = lunr.zhch.tokenizer; + this.tokenizerFn = lunr.zhcn.tokenizer; } } }; /* lunr trimmer function */ - lunr.zhch.isChineseChar = function(str){ + lunr.zhcn.isChineseChar = function(str){ var reg = /[\u4E00-\u9FA5\uF900-\uFA2D]/; return reg.test(str); } - lunr.zhch.trimmer = function(token){ - if(this.isChineseChar(token)){ + lunr.zhcn.trimmer = function(token){ + if(lunr.zhcn.isChineseChar(token)){ return token; } return token.replace(/^\W+/, '').replace(/^\W+$/, ''); } - lunr.Pipeline.registerFunction(lunr.zhch.trimmer, 'trimmer-zhcn'); + lunr.Pipeline.registerFunction(lunr.zhcn.trimmer, 'trimmer-zhcn'); - lunr.zhch.tokenizer = function (obj) { + lunr.zhcn.tokenizer = function (obj) { if (!arguments.length || obj == null || obj == undefined) return [] if (Array.isArray(obj)) return obj.map(function (t) { return lunr.utils.asString(t).toLowerCase() }) var str = obj.toString().replace(/^\s+/, '') - for (var i = str.length - 1; i >= 0; i--) { + for (var i = str.length - 1; i >= 0; i--) { //这里需要用标点符号进行分隔 if (/\S/.test(str.charAt(i))) { str = str.substring(0, i + 1) break @@ -101,7 +100,7 @@ // This should be removed when version 1.0.0 is released. var separator = lunr.tokenizer.seperator || lunr.tokenizer.separator - return obj.toString().trim().toLowerCase().split(separator) + return obj.toString().trim().toLowerCase().split(separator); } }; })) \ No newline at end of file diff --git a/test/VersionsAndLanguagesTest.js b/test/VersionsAndLanguagesTest.js index becc6e8..f8a0b30 100644 --- a/test/VersionsAndLanguagesTest.js +++ b/test/VersionsAndLanguagesTest.js @@ -33,7 +33,8 @@ var testDocuments = { ru: require('./testdata/ru'), sv: require('./testdata/sv'), tr: require('./testdata/tr'), - th: require('./testdata/th') + th: require('./testdata/th'), + zhcn: require('./testdata/zhcn') }; lunrVersions.forEach(function(lunrVersion) { diff --git a/test/testdata/zhcn.js b/test/testdata/zhcn.js new file mode 100644 index 0000000..8f5228b --- /dev/null +++ b/test/testdata/zhcn.js @@ -0,0 +1,36 @@ +module.exports = { + fields: [ + { + name: 'title', + config: { boost: 10 } + }, { + name: 'body' + } + ], + documents: [ + { + "title": "一首歌", + "body": "汉字的 起源,有 传说中的 仓颉 造字 。我们现在 能够确 认距今约3000多年的甲骨文已经是非常成熟的文字体系,于1899年被发现。可以考证的汉字发展经历了甲骨文、金文、小篆、汉隶、楷书、行书、草书等过程,可以划分为两个大阶段。从甲骨文字到小篆是一个阶段;从秦汉时代的隶书以下是另一个阶段。前者属于古文字的范畴,后者属于近代文字的范畴。", + "id": 1 + }, { + "title": "现代", + "body": "大体说来,从隶书到今 天使用的 现代 汉字 形体 上没有太大的变化。从汉字跟汉语的关系看,汉字是一种语素文字。汉字代表的是汉语里的语素。汉字有独体字与合体字的区别。从构造上讲,合体字比独体字高一个层次 。", + "id": 2 + } + ], + tests: [ + { + what: "find the word %w", + search: "关系", + found: 1 + }, { + what: "find the word %w", + search: "一首歌", + found: 1 + }, { + what: "never find a word that does not exist, like %w", + search: "科学家", + found: 0 + } + ] +} \ No newline at end of file