Skip to content

Commit

Permalink
support Chinese.
Browse files Browse the repository at this point in the history
1. test

Signed-off-by: cq.longhaibin <[email protected]>
  • Loading branch information
cq.longhaibin committed Jun 11, 2018
1 parent a85fd86 commit 7a166e6
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 15 deletions.
27 changes: 13 additions & 14 deletions lunr.zh-cn.js → lunr.zhcn.js
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
* This example returns an object, but the module
* can return a function as the exported value.
*/
return function(lunr) {
return function(lunr) {
/* throw error if lunr is not yet included */
if ('undefined' === typeof lunr) {
throw new Error('Lunr is not present. Please include / require Lunr before this script.');
Expand All @@ -41,44 +41,43 @@
var isLunr2 = lunr.version[0] == "2";

/* register specific locale function */
/*
 * Locale plugin entry point. Invoked with an index/builder as `this`:
 * it clears the pipeline, adds the locale trimmer as the only pipeline
 * step, and installs the locale tokenizer on whichever property the
 * detected lunr version uses.
 *
 * Diff view: renamed lines appear in pairs, the `zhch` spelling followed
 * by the `zhcn` spelling that appears to replace it.
 */
lunr.zhch = function() {
lunr.zhcn = function() {
this.pipeline.reset();
this.pipeline.add(
/*lunr.th.stopWordFilter,*/
lunr.zhch.trimmer
lunr.zhcn.trimmer
);

/* isLunr2 is computed above from the first character of lunr.version */
if (isLunr2) { // for lunr version 2.0.0
this.tokenizer = lunr.zhch.tokenizer;
this.tokenizer = lunr.zhcn.tokenizer;
} else {
/* this branch overwrites the tokenizer on the shared lunr object, not on `this` */
if (lunr.tokenizer) { // for lunr version 0.6.0
lunr.tokenizer = lunr.zhch.tokenizer;
lunr.tokenizer = lunr.zhcn.tokenizer;
}
if (this.tokenizerFn) { // for lunr version 0.7.0 -> 1.0.0
this.tokenizerFn = lunr.zhch.tokenizer;
this.tokenizerFn = lunr.zhcn.tokenizer;
}
}
};

/* lunr trimmer function */
/*
 * Reports whether `str` contains at least one character in the ranges
 * U+4E00-U+9FA5 or U+F900-U+FA2D. The pattern is unanchored, so a single
 * matching character anywhere in the string yields true.
 */
lunr.zhch.isChineseChar = function(str){
lunr.zhcn.isChineseChar = function(str){
var reg = /[\u4E00-\u9FA5\uF900-\uFA2D]/;
return reg.test(str);
}
/*
 * Pipeline trimmer. A token that contains a Chinese character (per
 * isChineseChar) is returned untouched; any other token has its leading
 * non-word characters removed.
 *
 * The second spelling of the guard calls isChineseChar through
 * lunr.zhcn instead of `this`, so it does not depend on how the
 * pipeline binds `this` when invoking the function.
 *
 * NOTE(review): the second replace uses /^\W+$/, which is anchored at
 * both ends. After the leading strip the string is either empty or
 * starts with a word character, so this pattern cannot match and
 * trailing non-word characters are left in place. /\W+$/ was probably
 * intended - confirm before changing.
 */
lunr.zhch.trimmer = function(token){
if(this.isChineseChar(token)){
lunr.zhcn.trimmer = function(token){
if(lunr.zhcn.isChineseChar(token)){
return token;
}
return token.replace(/^\W+/, '').replace(/^\W+$/, '');
}
/* register the trimmer under the label 'trimmer-zhcn' */
lunr.Pipeline.registerFunction(lunr.zhch.trimmer, 'trimmer-zhcn');
lunr.Pipeline.registerFunction(lunr.zhcn.trimmer, 'trimmer-zhcn');

lunr.zhch.tokenizer = function (obj) {
lunr.zhcn.tokenizer = function (obj) {
if (!arguments.length || obj == null || obj == undefined) return []
if (Array.isArray(obj)) return obj.map(function (t) { return lunr.utils.asString(t).toLowerCase() })
var str = obj.toString().replace(/^\s+/, '')

for (var i = str.length - 1; i >= 0; i--) {
for (var i = str.length - 1; i >= 0; i--) { //这里需要用标点符号进行分隔
if (/\S/.test(str.charAt(i))) {
str = str.substring(0, i + 1)
break
Expand All @@ -101,7 +100,7 @@
// This should be removed when version 1.0.0 is released.
var separator = lunr.tokenizer.seperator || lunr.tokenizer.separator

return obj.toString().trim().toLowerCase().split(separator)
return obj.toString().trim().toLowerCase().split(separator);
}
};
}))
3 changes: 2 additions & 1 deletion test/VersionsAndLanguagesTest.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ var testDocuments = {
ru: require('./testdata/ru'),
sv: require('./testdata/sv'),
tr: require('./testdata/tr'),
th: require('./testdata/th')
th: require('./testdata/th'),
zhcn: require('./testdata/zhcn')
};

lunrVersions.forEach(function(lunrVersion) {
Expand Down
36 changes: 36 additions & 0 deletions test/testdata/zhcn.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
 * Test fixture for the zhcn (Simplified Chinese) language plugin.
 * Exports the field definitions, two sample documents, and a list of
 * search cases. Loaded by test/VersionsAndLanguagesTest.js under the
 * `zhcn` key.
 */
module.exports = {
// Indexed fields; `title` is configured with a boost of 10.
fields: [
{
name: 'title',
config: { boost: 10 }
}, {
name: 'body'
}
],
// Sample documents; the body text mixes space-separated and unsegmented Chinese.
documents: [
{
"title": "一首歌",
"body": "汉字的 起源,有 传说中的 仓颉 造字 。我们现在 能够确 认距今约3000多年的甲骨文已经是非常成熟的文字体系,于1899年被发现。可以考证的汉字发展经历了甲骨文、金文、小篆、汉隶、楷书、行书、草书等过程,可以划分为两个大阶段。从甲骨文字到小篆是一个阶段;从秦汉时代的隶书以下是另一个阶段。前者属于古文字的范畴,后者属于近代文字的范畴。",
"id": 1
}, {
"title": "现代",
"body": "大体说来,从隶书到今 天使用的 现代 汉字 形体 上没有太大的变化。从汉字跟汉语的关系看,汉字是一种语素文字。汉字代表的是汉语里的语素。汉字有独体字与合体字的区别。从构造上讲,合体字比独体字高一个层次 。",
"id": 2
}
],
// Search cases. `found` is presumably the expected number of matching
// documents and `%w` a placeholder for the search term - verify against
// the test runner.
tests: [
{
what: "find the word %w",
search: "关系",
found: 1
}, {
what: "find the word %w",
search: "一首歌",
found: 1
}, {
what: "never find a word that does not exist, like %w",
search: "科学家",
found: 0
}
]
}

0 comments on commit 7a166e6

Please sign in to comment.