Skip to content

Commit

Permalink
1. support simple chinese
Browse files Browse the repository at this point in the history
Signed-off-by: cq.longhaibin <[email protected]>
  • Loading branch information
cq.longhaibin committed Jun 11, 2018
1 parent b16c6e7 commit a85fd86
Showing 1 changed file with 107 additions and 0 deletions.
107 changes: 107 additions & 0 deletions lunr.zh-cn.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/**
* lunr对中文分词的支持
*/
;
(function(root, factory) {
if (typeof define === 'function' && define.amd) {
// AMD. Register as an anonymous module.
define(factory)
} else if (typeof exports === 'object') {
/**
* Node. Does not work with strict CommonJS, but
* only CommonJS-like environments that support module.exports,
* like Node.
*/
module.exports = factory()
} else {
// Browser globals (root is window)
factory()(root.lunr);
}
}(this, function() {
/**
* Just return a value to define the module export.
* This example returns an object, but the module
* can return a function as the exported value.
*/
return function(lunr) {
/* throw error if lunr is not yet included */
if ('undefined' === typeof lunr) {
throw new Error('Lunr is not present. Please include / require Lunr before this script.');
}

/* throw error if lunr stemmer support is not yet included */
if ('undefined' === typeof lunr.stemmerSupport) {
throw new Error('Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.');
}

/*
As with the Thai and Japanese plugins, tokenization here cannot rely on spaces alone.
So, the same logic is used to assign the tokenizer function, which differs between Lunr versions.
*/
var isLunr2 = lunr.version[0] == "2";

/* register specific locale function */
/**
 * Plugin entry point. It is meant to be invoked with `this` bound to the
 * lunr index/builder being configured.
 *
 * Resets the pipeline so that it contains only the zh-cn trimmer (no
 * stop-word filter or stemmer is added), then installs the zh-cn tokenizer
 * in whichever slot the running Lunr version reads it from.
 */
lunr.zhch = function() {
  var zh = lunr.zhch;

  this.pipeline.reset();
  this.pipeline.add(zh.trimmer);

  // Lunr 2.x: the tokenizer is a property of the receiver.
  if (isLunr2) {
    this.tokenizer = zh.tokenizer;
    return;
  }

  // Lunr 0.6.0: the tokenizer is the global lunr.tokenizer function.
  if (lunr.tokenizer) {
    lunr.tokenizer = zh.tokenizer;
  }

  // Lunr 0.7.0 -> 1.0.0: the tokenizer is stored on the receiver as tokenizerFn.
  if (this.tokenizerFn) {
    this.tokenizerFn = zh.tokenizer;
  }
};

/* lunr trimmer function */
/**
 * Report whether a value contains at least one character in the ranges
 * U+4E00-U+9FA5 or U+F900-U+FA2D.
 *
 * @param {*} str - Value to inspect; it is coerced to a string by RegExp#test.
 * @returns {boolean} true when any such character is present anywhere in the value.
 */
lunr.zhch.isChineseChar = function(str) {
  return /[\u4E00-\u9FA5\uF900-\uFA2D]/.test(str);
};
/**
 * Pipeline function that strips leading and trailing non-word characters
 * from a token. Tokens containing at least one Chinese character (as decided
 * by lunr.zhch.isChineseChar) are passed through untouched, because `\W`
 * would otherwise treat Chinese characters themselves as trimmable.
 *
 * Accepts both token representations:
 *  - plain strings (Lunr 0.x / 1.x), returning the trimmed string;
 *  - objects exposing `update(fn)` (Lunr 2.x tokens), returning whatever
 *    `update` returns after applying the trim to the token text.
 *
 * @param {string|Object} token - Token handed over by the pipeline.
 * @returns {string|Object} The trimmed token, in the same representation.
 */
lunr.zhch.trimmer = function(token) {
  var trimText = function(text) {
    // Look the helper up on lunr.zhch rather than `this`: the pipeline does
    // not invoke its functions with lunr.zhch as the receiver.
    if (lunr.zhch.isChineseChar(text)) {
      return text;
    }
    // The trailing pattern must not be anchored at the start; otherwise it
    // can never match once the leading run has been removed.
    return text.replace(/^\W+/, '').replace(/\W+$/, '');
  };

  if (token && typeof token.update === 'function') {
    return token.update(trimText);
  }
  return trimText(token);
};
lunr.Pipeline.registerFunction(lunr.zhch.trimmer, 'trimmer-zhcn');

/**
 * Tokenizer installed by the zh-cn plugin.
 *
 * - null / undefined / no argument: returns an empty array.
 * - arrays: each element is converted with lunr.utils.asString and lower-cased.
 * - anything else: the value is stringified, trimmed, lower-cased and split
 *   on the configured lunr separator.
 *
 * NOTE(review): the previous revision also built a punctuation-split token
 * list but never returned it. That unused work has been removed, so the
 * returned tokens are the same as before. If punctuation-based splitting was
 * the intended result, it needs to be reintroduced deliberately.
 *
 * @param {*} obj - Field value to tokenize.
 * @returns {string[]} Lower-cased tokens.
 */
lunr.zhch.tokenizer = function (obj) {
  // `== null` matches both null and undefined, including a missing argument.
  if (obj == null) {
    return [];
  }

  if (Array.isArray(obj)) {
    return obj.map(function (item) {
      return lunr.utils.asString(item).toLowerCase();
    });
  }

  // The misspelt `seperator` is honoured first so the deprecated property
  // keeps working. The regex fallback covers the case where lunr.tokenizer
  // has itself been replaced by this function (the Lunr 0.6.0 path in the
  // plugin installer), leaving neither property defined; without it,
  // split(undefined) would return the whole string as a single token.
  var separator = lunr.tokenizer.seperator || lunr.tokenizer.separator || /[\s\-]+/;

  return obj.toString().trim().toLowerCase().split(separator);
};
};
}))

0 comments on commit a85fd86

Please sign in to comment.