From 123e54c9e88a2d8c3ffa9c604d6d5de0170aca04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juli=C3=A1n=20Guti=C3=A9rrez?= Date: Mon, 3 Jun 2019 12:19:47 +0200 Subject: [PATCH 1/3] fix unicode char encoding over U+1F210 --- SMSCounter.php | 16 +++++++++++++--- Tests/SMSCounterTest.php | 21 ++++++++++++++++----- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/SMSCounter.php b/SMSCounter.php index 3063618..a9b0dab 100644 --- a/SMSCounter.php +++ b/SMSCounter.php @@ -355,9 +355,19 @@ public function utf8ToUnicode($str) $values[] = $thisValue; if (count($values) === $lookingFor) { - $number = ($lookingFor === 3) ? - (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64) : - (($values[0] % 32) * 64) + ($values[1] % 64); + switch ($lookingFor) { + case 4: + $number = (($values[0] % 16) * 262144) + (($values[1] % 64) * 4096) + (($values[2] % 64) * 64) + ($values[3] % 64); + break; + + case 3: + $number = (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64); + break; + + case 2: + $number = (($values[0] % 32) * 64) + ($values[1] % 64); + break; + } $unicode[] = $number; $values = []; diff --git a/Tests/SMSCounterTest.php b/Tests/SMSCounterTest.php index 242c2d8..f3447e8 100755 --- a/Tests/SMSCounterTest.php +++ b/Tests/SMSCounterTest.php @@ -164,6 +164,17 @@ public function testCarriageReturn() $this->assertEquals($expected, $count); } + public function testUnicodeChars() + { + $smsCounter = new SMSCounter(); + + $this->assertEquals([96], $smsCounter->utf8ToUnicode('`')); // U+60 + $this->assertEquals([882], $smsCounter->utf8ToUnicode('Ν²')); // U+0372 + $this->assertEquals([2210], $smsCounter->utf8ToUnicode('ΰ’’')); // U+08A2 + $this->assertEquals([11821], $smsCounter->utf8ToUnicode('βΈ­')); // U+2E2D + $this->assertEquals([128526], $smsCounter->utf8ToUnicode('😎')); // U+1F60E + } + public function testUnicode() { $text = '`'; @@ -182,17 +193,17 @@ public function testUnicode() public function testUnicodeEmoji() { - $text 
= '😎😎'; + $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎'; $smsCounter = new SMSCounter(); $count = $smsCounter->count($text); $expected = new \stdClass(); $expected->encoding = SMSCounter::UTF16; - $expected->length = 2; - $expected->per_message = 70; - $expected->remaining = 68; - $expected->messages = 1; + $expected->length = 77; + $expected->per_message = 67; + $expected->remaining = 57; + $expected->messages = 2; $this->assertEquals($expected, $count); } From c475537f12ca8a0c0c37004eaf8dde3ad57508fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juli=C3=A1n=20Guti=C3=A9rrez?= Date: Tue, 4 Jun 2019 16:32:02 +0200 Subject: [PATCH 2/3] double bytes for unicode chars over U+10000 --- SMSCounter.php | 15 ++++++++++++++ Tests/SMSCounterTest.php | 44 ++++++++++++++++++++++++++++++++-------- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/SMSCounter.php b/SMSCounter.php index a9b0dab..1d0c8b2 100644 --- a/SMSCounter.php +++ b/SMSCounter.php @@ -216,6 +216,21 @@ private function doCount($text, $supportShiftTables) // Each exchar in the GSM 7 Bit encoding takes one more space // Hence the length increases by one char for each of those Ex chars. 
$length += $lengthExchars; + } elseif ($encoding === self::UTF16) { + // Unicode chars over U+10000 occupy an extra byte + $lengthExtra = array_reduce( + $unicodeArray, + function ($carry, $char) { + if ($char >= 65536) { + $carry++; + } + + return $carry; + }, + 0 + ); + + $length += $lengthExtra; } // Select the per message length according to encoding and the message length diff --git a/Tests/SMSCounterTest.php b/Tests/SMSCounterTest.php index f3447e8..7eb07ae 100755 --- a/Tests/SMSCounterTest.php +++ b/Tests/SMSCounterTest.php @@ -164,15 +164,41 @@ public function testCarriageReturn() $this->assertEquals($expected, $count); } - public function testUnicodeChars() + public function testUnicodeEncodingAndLength() { $smsCounter = new SMSCounter(); - $this->assertEquals([96], $smsCounter->utf8ToUnicode('`')); // U+60 - $this->assertEquals([882], $smsCounter->utf8ToUnicode('Ν²')); // U+0372 - $this->assertEquals([2210], $smsCounter->utf8ToUnicode('ΰ’’')); // U+08A2 - $this->assertEquals([11821], $smsCounter->utf8ToUnicode('βΈ­')); // U+2E2D - $this->assertEquals([128526], $smsCounter->utf8ToUnicode('😎')); // U+1F60E + // 1 byte UTF8 + $this->assertEquals([33], $smsCounter->utf8ToUnicode('!')); // U+0021 => 0x21 + $this->assertEquals(1, $smsCounter->count('!')->length); + + if (version_compare(PHP_VERSION, '7.0.0') >= 0) { + $this->assertEquals([127], $smsCounter->utf8ToUnicode("\u{007F}")); // U+007F => 0x7F + $this->assertEquals(1, $smsCounter->count("\u{007F}")->length); + } + + // 2 bytes UTF8 + if (version_compare(PHP_VERSION, '7.0.0') >= 0) { + $this->assertEquals([128], $smsCounter->utf8ToUnicode("\u{0080}")); // U+0080 => 0xC2 0x80 + $this->assertEquals(1, $smsCounter->count("\u{0080}")->length); + } + + $this->assertEquals([2047], $smsCounter->utf8ToUnicode('ίΏ')); // U+07FF => 0xDF 0xBF + $this->assertEquals(1, $smsCounter->count('ίΏ')->length); + + // 3 bytes UTF8 + $this->assertEquals([2048], $smsCounter->utf8ToUnicode('ΰ €')); // U+0800 => 0xE0 0xA0 0x80 
+ $this->assertEquals(1, $smsCounter->count('ΰ €')->length); + + $this->assertEquals([65535], $smsCounter->utf8ToUnicode('οΏΏ')); // U+FFFF => 0xEF 0xBF 0xBF + $this->assertEquals(1, $smsCounter->count('οΏΏ')->length); + + // 4 bytes UTF8 + $this->assertEquals([65536], $smsCounter->utf8ToUnicode('𐀀')); // U+10000 => 0xF0 0x90 0x80 0x80 + $this->assertEquals(2, $smsCounter->count('𐀀')->length); + + $this->assertEquals([983295], $smsCounter->utf8ToUnicode('σ°ƒΏ')); // U+F00FF => 0xF3 0xB0 0x83 0xBF + $this->assertEquals(2, $smsCounter->count('σ°ƒΏ')->length); } public function testUnicode() @@ -193,16 +219,16 @@ public function testUnicode() public function testUnicodeEmoji() { - $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎'; + $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎'; $smsCounter = new SMSCounter(); $count = $smsCounter->count($text); $expected = new \stdClass(); $expected->encoding = SMSCounter::UTF16; - $expected->length = 77; + $expected->length = 132; $expected->per_message = 67; - $expected->remaining = 57; + $expected->remaining = 2; $expected->messages = 2; $this->assertEquals($expected, $count); From c5d7489fe5a43a969abd1c0dcc7cf87f00ac6c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juli=C3=A1n=20Guti=C3=A9rrez?= Date: Wed, 5 Jun 2019 13:46:58 +0200 Subject: [PATCH 3/3] return remaining chars in last part of UTF16 messages --- README.md | 6 +++++ SMSCounter.php | 18 +++++++++++++- Tests/SMSCounterTest.php | 52 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 71 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 1629e0e..0232c97 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,12 @@ stdClass Object ) ``` +##### UTF16 notice + +When using unicode chars over U+10000 (mainly emoticons 😎) on messages larger than 70 chars the _remaining_ value will actually be the **remaining chars in last message part only**, this is due to how those chars are encoded using 
two 16-bit code units while the maximum part length is an odd number (67). + +#### Sanitization + You can sanitize your text to be a valid strict GSM 03.38 charset ```php diff --git a/SMSCounter.php b/SMSCounter.php index 1d0c8b2..983bec7 100644 --- a/SMSCounter.php +++ b/SMSCounter.php @@ -259,7 +259,23 @@ function ($carry, $char) { } $messages = (int) ceil($length / $perMessage); - $remaining = ($perMessage * $messages) - $length; + + if ($encoding === self::UTF16 && $length > $perMessage) { + $count = 0; + foreach ($unicodeArray as $char) { + if ($count === $perMessage) { + $count = 0; + } elseif ($count > $perMessage) { + $count = 2; + } + + $count += $char >= 65536 ? 2 : 1; + } + + $remaining = $perMessage - ($count > $perMessage ? 2 : $count); + } else { + $remaining = ($perMessage * $messages) - $length; + } $returnset = new \stdClass(); diff --git a/Tests/SMSCounterTest.php b/Tests/SMSCounterTest.php index 7eb07ae..89b73c5 100755 --- a/Tests/SMSCounterTest.php +++ b/Tests/SMSCounterTest.php @@ -217,18 +217,62 @@ public function testUnicode() $this->assertEquals($expected, $count); } - public function testUnicodeEmoji() + public function testUnicodeEmojiSingleMessage() { - $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎'; + $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎'; + $smsCounter = new SMSCounter(); + $count = $smsCounter->count($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::UTF16; + $expected->length = 70; + $expected->per_message = 70; + $expected->remaining = 0; + $expected->messages = 1; + + $this->assertEquals($expected, $count); + } + + public function testUnicodeEmojiMultiPartMessage() + { + // A char is lost at the end of first part + $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎'; + $smsCounter = new SMSCounter(); + $count = $smsCounter->count($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::UTF16; + $expected->length = 72; + $expected->per_message = 67; + 
$expected->remaining = 61; + $expected->messages = 2; + + $this->assertEquals($expected, $count); + + // First part is completed with a dash char (-) + $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎-😎😎😎'; + $smsCounter = new SMSCounter(); + $count = $smsCounter->count($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::UTF16; + $expected->length = 73; + $expected->per_message = 67; + $expected->remaining = 61; + $expected->messages = 2; + + $this->assertEquals($expected, $count); + // Both parts are completed with dash chars (-) + $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎-😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎-'; $smsCounter = new SMSCounter(); $count = $smsCounter->count($text); $expected = new \stdClass(); $expected->encoding = SMSCounter::UTF16; - $expected->length = 132; + $expected->length = 134; $expected->per_message = 67; - $expected->remaining = 2; + $expected->remaining = 0; $expected->messages = 2; $this->assertEquals($expected, $count);