From 48c70a963e803b93fe68a191e62d0770b5446f0b Mon Sep 17 00:00:00 2001 From: Ilija Tovilo Date: Thu, 15 Mar 2018 22:23:44 +0100 Subject: [PATCH] Fix issues with unicode characters - fixes #71 (#72) * Fix issues with unicode characters * Prevent empty elements when splitting string into words --- lib/Caxy/HtmlDiff/AbstractDiff.php | 32 ++++++++++++------- lib/Caxy/HtmlDiff/HtmlDiff.php | 28 ++++++++-------- lib/Caxy/HtmlDiff/HtmlDiffConfig.php | 2 +- lib/Caxy/HtmlDiff/ListDiff.php | 12 +++---- lib/Caxy/HtmlDiff/Preprocessor.php | 16 +++++----- .../Strategy/ListItemMatchStrategy.php | 8 ++--- lib/Caxy/HtmlDiff/Table/TableDiff.php | 2 +- 7 files changed, 55 insertions(+), 45 deletions(-) diff --git a/lib/Caxy/HtmlDiff/AbstractDiff.php b/lib/Caxy/HtmlDiff/AbstractDiff.php index 3506c74..46f7402 100644 --- a/lib/Caxy/HtmlDiff/AbstractDiff.php +++ b/lib/Caxy/HtmlDiff/AbstractDiff.php @@ -398,9 +398,9 @@ protected function getClosingTag($tag) */ protected function getStringBetween($str, $start, $end) { - $expStr = explode($start, $str, 2); + $expStr = mb_split($start, $str, 2); if (count($expStr) > 1) { - $expStr = explode($end, $expStr[ 1 ]); + $expStr = mb_split($end, $expStr[ 1 ]); if (count($expStr) > 1) { array_pop($expStr); @@ -461,7 +461,7 @@ protected function setNewWords(array $newWords) */ protected function isPartOfWord($text) { - return ctype_alnum(str_replace($this->config->getSpecialCaseChars(), '', $text)); + return $this->ctypeAlphanumUnicode(str_replace($this->config->getSpecialCaseChars(), '', $text)); } /** @@ -485,15 +485,15 @@ protected function convertHtmlToListOfWords($characterString) $current_word = '<'; $mode = 'tag'; - } elseif (preg_match("/\s/", $character)) { + } elseif (preg_match("/\s/u", $character)) { if ($current_word !== '') { $words[] = $current_word; } - $current_word = $keepNewLines ? $character : preg_replace('/\s+/S', ' ', $character); + $current_word = $keepNewLines ? $character : preg_replace('/\s+/Su', ' ', $character); $mode = 'whitespace'; } else { if ( - (ctype_alnum($character) && (strlen($current_word) == 0 || $this->isPartOfWord($current_word))) || + (($this->ctypeAlphanumUnicode($character)) && (mb_strlen($current_word) == 0 || $this->isPartOfWord($current_word))) || (in_array($character, $this->config->getSpecialCaseChars()) && isset($characterString[$i + 1]) && $this->isPartOfWord($characterString[$i + 1])) ) { $current_word .= $character; @@ -509,7 +509,7 @@ protected function convertHtmlToListOfWords($characterString) $words[] = $current_word; $current_word = ''; - if (!preg_match('[^\s]', $character)) { + if (!preg_match('[^\s]u', $character)) { $mode = 'whitespace'; } else { $mode = 'character'; @@ -525,9 +525,9 @@ protected function convertHtmlToListOfWords($characterString) } $current_word = '<'; $mode = 'tag'; - } elseif (preg_match("/\s/", $character)) { + } elseif (preg_match("/\s/u", $character)) { $current_word .= $character; - if (!$keepNewLines) $current_word = preg_replace('/\s+/S', ' ', $current_word); + if (!$keepNewLines) $current_word = preg_replace('/\s+/Su', ' ', $current_word); } else { if ($current_word != '') { $words[] = $current_word; @@ -574,7 +574,7 @@ protected function isEndOfTag($val) */ protected function isWhiteSpace($value) { - return !preg_match('[^\s]', $value); + return !preg_match('[^\s]u', $value); } /** @@ -585,6 +585,16 @@ protected function isWhiteSpace($value) protected function explode($value) { // as suggested by @onassar - return preg_split('//u', $value); + return preg_split('//u', $value, -1, PREG_SPLIT_NO_EMPTY); + } + + /** + * @param string $str + * + * @return bool + */ + protected function ctypeAlphanumUnicode($str) + { + return preg_match("/^[a-zA-Z0-9\pL]+$/u", $str); } } diff --git a/lib/Caxy/HtmlDiff/HtmlDiff.php b/lib/Caxy/HtmlDiff/HtmlDiff.php index 28d1e63..3f4a0d4 100644 --- a/lib/Caxy/HtmlDiff/HtmlDiff.php +++ b/lib/Caxy/HtmlDiff/HtmlDiff.php @@ -158,7 +158,7 @@ protected function createIsolatedDiffTagPlaceholders(&$words) foreach ($words as $index => $word) { $openIsolatedDiffTag = $this->isOpeningIsolatedDiffTag($word, $currentIsolatedDiffTag); if ($openIsolatedDiffTag) { - if ($this->isSelfClosingTag($word) || stripos($word, 'isSelfClosingTag($word) || mb_stripos($word, ' $index, @@ -205,7 +205,7 @@ protected function isOpeningIsolatedDiffTag($item, $currentIsolatedDiffTag = nul $tagsToMatch = $currentIsolatedDiffTag !== null ? array($currentIsolatedDiffTag => $this->config->getIsolatedDiffTagPlaceholder($currentIsolatedDiffTag)) : $this->config->getIsolatedDiffTags(); - $pattern = '#<%s(\s+[^>]*)?>#iU'; + $pattern = '#<%s(\s+[^>]*)?>#iUu'; foreach ($tagsToMatch as $key => $value) { if (preg_match(sprintf($pattern, $key), $item)) { return $key; @@ -217,7 +217,7 @@ protected function isOpeningIsolatedDiffTag($item, $currentIsolatedDiffTag = nul protected function isSelfClosingTag($text) { - return (bool) preg_match('/<[^>]+\/\s*>/', $text); + return (bool) preg_match('/<[^>]+\/\s*>/u', $text); } /** @@ -231,7 +231,7 @@ protected function isClosingIsolatedDiffTag($item, $currentIsolatedDiffTag = nul $tagsToMatch = $currentIsolatedDiffTag !== null ? array($currentIsolatedDiffTag => $this->config->getIsolatedDiffTagPlaceholder($currentIsolatedDiffTag)) : $this->config->getIsolatedDiffTags(); - $pattern = '#]*)?>#iU'; + $pattern = '#]*)?>#iUu'; foreach ($tagsToMatch as $key => $value) { if (preg_match(sprintf($pattern, $key), $item)) { return $key; @@ -354,7 +354,7 @@ protected function diffElements($oldText, $newText, $stripWrappingTags = true) $wrapEnd = ''; if ($stripWrappingTags) { - $pattern = '/(^<[^>]+>)|(<\/[^>]+>$)/i'; + $pattern = '/(^<[^>]+>)|(<\/[^>]+>$)/iu'; $matches = array(); if (preg_match_all($pattern, $newText, $matches)) { @@ -441,7 +441,7 @@ protected function processEqualOperation($operation) protected function getAttributeFromTag($text, $attribute) { $matches = array(); - if (preg_match(sprintf('/<[^>]*\b%s\s*=\s*([\'"])(.*)\1[^>]*>/i', $attribute), $text, $matches)) { + if (preg_match(sprintf('/<[^>]*\b%s\s*=\s*([\'"])(.*)\1[^>]*>/iu', $attribute), $text, $matches)) { return htmlspecialchars_decode($matches[2]); } @@ -567,7 +567,7 @@ protected function insertTag($tag, $cssClass, &$words) } } } - if (count($words) == 0 && strlen($specialCaseTagInjection) == 0) { + if (count($words) == 0 && mb_strlen($specialCaseTagInjection) == 0) { break; } if ($specialCaseTagInjectionIsBefore) { @@ -575,7 +575,7 @@ protected function insertTag($tag, $cssClass, &$words) } else { $workTag = $this->extractConsecutiveWords($words, 'tag'); if (isset($workTag[ 0 ]) && $this->isOpeningTag($workTag[ 0 ]) && !$this->isClosingTag($workTag[ 0 ])) { - if (strpos($workTag[ 0 ], 'class=')) { + if (mb_strpos($workTag[ 0 ], 'class=')) { $workTag[ 0 ] = str_replace('class="', 'class="diffmod ', $workTag[ 0 ]); $workTag[ 0 ] = str_replace("class='", 'class="diffmod ', $workTag[ 0 ]); } else { @@ -584,7 +584,7 @@ protected function insertTag($tag, $cssClass, &$words) } $appendContent = implode('', $workTag).$specialCaseTagInjection; - if (isset($workTag[0]) && false !== stripos($workTag[0], 'wrapText($appendContent, $tag, $cssClass); } $this->content .= $appendContent; @@ -673,7 +673,7 @@ protected function isTag($item) */ protected function isOpeningTag($item) { - return preg_match('#<[^>]+>\\s*#iU', $item); + return preg_match('#<[^>]+>\\s*#iUu', $item); } /** @@ -683,7 +683,7 @@ protected function isOpeningTag($item) */ protected function isClosingTag($item) { - return preg_match('#]+>\\s*#iU', $item); + return preg_match('#]+>\\s*#iUu', $item); } /** @@ -769,10 +769,10 @@ protected function findMatchingBlocks($startInOld, $endInOld, $startInNew, $endI */ protected function stripTagAttributes($word) { - $space = strpos($word, ' ', 1); + $space = mb_strpos($word, ' ', 1); if ($space) { - return '<' . substr($word, 1, $space) . '>'; + return '<' . mb_substr($word, 1, $space) . '>'; } return trim($word, '<>'); @@ -850,7 +850,7 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew) protected function isOnlyWhitespace($str) { // Slightly faster then using preg_match - return $str !== '' && (strlen(trim($str)) === 0); + return $str !== '' && (mb_strlen(trim($str)) === 0); } /** diff --git a/lib/Caxy/HtmlDiff/HtmlDiffConfig.php b/lib/Caxy/HtmlDiff/HtmlDiffConfig.php index fc98425..2bfdcaa 100644 --- a/lib/Caxy/HtmlDiff/HtmlDiffConfig.php +++ b/lib/Caxy/HtmlDiff/HtmlDiffConfig.php @@ -345,7 +345,7 @@ public function setIsolatedDiffTags($isolatedDiffTags) public function addIsolatedDiffTag($tag, $placeholder = null) { if (null === $placeholder) { - $placeholder = sprintf('[[REPLACE_%s]]', strtoupper($tag)); + $placeholder = sprintf('[[REPLACE_%s]]', mb_strtoupper($tag)); } if ($this->isIsolatedDiffTag($tag) && $this->isolatedDiffTags[$tag] !== $placeholder) { diff --git a/lib/Caxy/HtmlDiff/ListDiff.php b/lib/Caxy/HtmlDiff/ListDiff.php index a0b6363..da054c1 100644 --- a/lib/Caxy/HtmlDiff/ListDiff.php +++ b/lib/Caxy/HtmlDiff/ListDiff.php @@ -233,7 +233,7 @@ protected function buildDiffList($words) $list[] = $word; } } else { - $listType = substr($word, 1, 2); + $listType = mb_substr($word, 1, 2); $listStartTag = $word; } @@ -254,7 +254,7 @@ protected function buildDiffList($words) if ($openListItems === 0) { // New top-level list item $currentListItem = array(); - $listItemType = substr($word, 1, 2); + $listItemType = mb_substr($word, 1, 2); $listItemStart = $word; } else { $currentListItem[] = $word; @@ -290,27 +290,27 @@ protected function isOpeningListTag($word, $type = null) { $filter = $type !== null ? array('<'.$type) : array(' $this->lengthRatioThreshold) { return true; diff --git a/lib/Caxy/HtmlDiff/Table/TableDiff.php b/lib/Caxy/HtmlDiff/Table/TableDiff.php index 2779d77..6d8b171 100644 --- a/lib/Caxy/HtmlDiff/Table/TableDiff.php +++ b/lib/Caxy/HtmlDiff/Table/TableDiff.php @@ -733,7 +733,7 @@ protected function htmlFromNode($node) protected function setInnerHtml($node, $html) { // DOMDocument::loadHTML does not allow empty strings. - if (strlen(trim($html)) === 0) { + if (mb_strlen(trim($html)) === 0) { $html = ''; }