Skip to content

Commit

Permalink
更新替换算法,解决部分词语找不到的bug
Browse files Browse the repository at this point in the history
  • Loading branch information
overtrue committed Oct 6, 2014
1 parent 4a778d5 commit 2d47ad2
Show file tree
Hide file tree
Showing 4 changed files with 106,767 additions and 106,794 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ echo Pinyin::pinyin('带着希望去旅行,比到达终点更美好');
# TODO
- [x] 添加获取首字母;
- [x] 支持繁体;
- [ ] 添加补充词典;
- [x] 添加补充词典;
- [x] 添加词频字典,根据词频提高未匹配词典时多音字准确度;

# 参考
Expand Down
124 changes: 47 additions & 77 deletions src/Overtrue/Pinyin.php
Original file line number Diff line number Diff line change
Expand Up @@ -192,50 +192,8 @@ public static function letter($string, $delimiter = null)
*/
protected function string2pinyin($string)
{
    $stringLength = $this->getStringLength($string);
    $segments     = [];

    // Walk the string one character at a time, greedily growing each segment.
    for ($i = 0; $i < $stringLength; ) {
        $str  = $this->getChar($string, $i);
        $next = $str . $this->getChar($string, ++$i);

        // Keep extending the current segment while there is input left and either
        //  - the segment and its extension are both free of Chinese characters, or
        //  - the extended candidate is itself a dictionary entry.
        // The length bound must guard BOTH cases: once $i runs past the end,
        // getChar() contributes nothing, $next stops changing, and a trailing
        // run of non-Chinese text would otherwise never leave this loop.
        while ($i < $stringLength && (
            (!$this->containsChinese($str) && !$this->containsChinese($next))
            || $this->hasPinyin($next)
        )) {
            $str  = $next;
            $next = $str . $this->getChar($string, ++$i);
        }

        $segments[] = $this->getPinyin($str);
    }

    // Converted segments are separated by a single space.
    return join(' ', $segments);
}

/**
 * Tell whether the dictionary holds an entry for the given string.
 *
 * The lookup uses the whole string as the key; an entry counts only
 * when it is set and not null.
 *
 * @param string $string candidate dictionary key.
 *
 * @return boolean
 */
protected function hasPinyin($string)
{
    $isKnown = isset(self::$dictionary[$string]);

    return $isKnown;
}

/**
* get string pinyin
*
* @param string $string source string.
*
* @return string
*/
protected function getPinyin($string)
{
$pinyin = $this->hasPinyin($string) ? self::$dictionary[$string] : $string;
$string = $this->prepare($string);
$pinyin = strtr($string, self::$dictionary);

// add accents
if (self::$settings['accent']) {
Expand All @@ -247,51 +205,44 @@ protected function getPinyin($string)
return $pinyin;
}

/**
 * Extract a single UTF-8 character from a string.
 *
 * @param string  $string source string.
 * @param integer $offset zero-based character (not byte) position.
 *
 * @return string the character at $offset.
 */
protected function getChar($string, $offset)
{
    $length   = 1;
    $encoding = 'UTF-8';

    return mb_substr($string, $offset, $length, $encoding);
}

/**
 * Count the characters of a string, treating it as UTF-8.
 *
 * @param string $string source string.
 *
 * @return integer number of characters (not bytes).
 */
protected function getStringLength($string)
{
    $encoding = 'UTF-8';
    $length   = mb_strlen($string, $encoding);

    return $length;
}
/**
* load dictionary content
*
* @return array
*/
protected function loadDictionary()
{
$dictFilename = __DIR__ .'/data/dict.php';
$ceditDictFilename = __DIR__ .'/data/cedict/cedict_ts.u8';
$dictFile = __DIR__ .'/data/dict.php';
$ceditDictFile = __DIR__ .'/data/cedict/cedict_ts.u8';
$additionalWords = $this->getAdditionalWords();

// load from cache
if (file_exists($dictFilename)) {
return $this->loadFromCache($dictFilename);
if (file_exists($dictFile)) {
return $this->loadFromCache($dictFile);
}

// parse and cache
$parsedDictionary = $this->parseDictionary($ceditDictFilename);
$this->cache($dictFilename, $parsedDictionary);
$parsedDictionary = $this->parseDictionary($ceditDictFile);

$dictionary = array_merge($parsedDictionary, $additionalWords);

$this->cache($dictFile, $dictionary);

return $dictionary;
}

/**
* return additional words
*
* @return array
*/
protected function getAdditionalWords()
{
$additionalWords = include __DIR__ . '/data/additional.php';

return $parsedDictionary;
return array_map(function($pinyin){
return "$pinyin ";
}, $additionalWords);
}

/**
Expand Down Expand Up @@ -322,7 +273,7 @@ protected function parseDictionary($dictionaryFile)

// frequency check
if (!isset($content[$key]) || $this->moreCommonly($matches['pinyin'], $content[$key])) {
$content[$key] = $matches['pinyin'];
$content[$key] = "{$matches['pinyin']} ";
}
}

Expand All @@ -338,6 +289,9 @@ protected function parseDictionary($dictionaryFile)
*/
protected function moreCommonly($pinyin, $target)
{
$pinyin = trim($pinyin);
$target = trim($target);

return isset(self::$frequency[$pinyin])
&& isset(self::$frequency[$target])
&& self::$frequency[$pinyin] > self::$frequency[$target];
Expand Down Expand Up @@ -437,6 +391,22 @@ protected function containsChinese($string)
return preg_match('/\p{Han}+/u', $string);
}

/**
 * Prepare the string before dictionary replacement.
 *
 * Inserts a backslash between an ASCII lowercase letter and a digit that
 * directly follows it, e.g. "test4" becomes "test\4". All other text is
 * returned untouched.
 *
 * @param string $string source string.
 *
 * @return string
 */
protected function prepare($string)
{
    // Capture exactly one letter: a repeated group such as ([a-z])+ only
    // remembers its last repetition, which would drop the rest of the word.
    $pattern = '/([a-z])(\d)/';

    // In a preg replacement a literal backslash must be written as "\\",
    // i.e. four backslashes inside a single-quoted PHP string. The digit is
    // re-emitted through the $2 back-reference rather than a literal.
    $replacement = '$1\\\\$2';

    return preg_replace($pattern, $replacement, $string);
}

/**
* Credits for this function go to velcrow, who shared this
* at http://stackoverflow.com/questions/1162491/alternative-to-mysql-real-escape-string-without-connecting-to-db
Expand Down
Loading

0 comments on commit 2d47ad2

Please sign in to comment.