diff --git a/README.md b/README.md index 1629e0e..0232c97 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,12 @@ stdClass Object ) ``` +##### UTF16 notice + +When a message contains Unicode characters at or above U+10000 (mainly emoji such as 😎) and is longer than 70 characters, the _remaining_ value is the **number of characters remaining in the last message part only**. This is because each of those characters is encoded as two 16-bit code units (a surrogate pair) while the maximum part length is an odd number (67), so a pair that does not fit at the end of a part is moved whole to the next part. + +#### Sanitization + You can sanitize your text to be a valid strict GSM 03.38 charset ```php diff --git a/SMSCounter.php b/SMSCounter.php index 3063618..983bec7 100644 --- a/SMSCounter.php +++ b/SMSCounter.php @@ -216,6 +216,21 @@ private function doCount($text, $supportShiftTables) // Each exchar in the GSM 7 Bit encoding takes one more space // Hence the length increases by one char for each of those Ex chars. $length += $lengthExchars; + } elseif ($encoding === self::UTF16) { + // Code points U+10000 and above need a surrogate pair, i.e. one extra 16-bit code unit each + $lengthExtra = array_reduce( + $unicodeArray, + function ($carry, $char) { + if ($char >= 65536) { + $carry++; + } + + return $carry; + }, + 0 + ); + + $length += $lengthExtra; } // Select the per message length according to encoding and the message length @@ -244,7 +259,23 @@ private function doCount($text, $supportShiftTables) } $messages = (int) ceil($length / $perMessage); - $remaining = ($perMessage * $messages) - $length; + + if ($encoding === self::UTF16 && $length > $perMessage) { + $count = 0; + foreach ($unicodeArray as $char) { + if ($count === $perMessage) { + $count = 0; + } elseif ($count > $perMessage) { + $count = 2; + } + + $count += $char >= 65536 ? 2 : 1; + } + + $remaining = $perMessage - ($count > $perMessage ? 
2 : $count); + } else { + $remaining = ($perMessage * $messages) - $length; + } $returnset = new \stdClass(); @@ -355,9 +386,19 @@ public function utf8ToUnicode($str) $values[] = $thisValue; if (count($values) === $lookingFor) { - $number = ($lookingFor === 3) ? - (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64) : - (($values[0] % 32) * 64) + ($values[1] % 64); + switch ($lookingFor) { + case 4: + $number = (($values[0] % 16) * 262144) + (($values[1] % 64) * 4096) + (($values[2] % 64) * 64) + ($values[3] % 64); + break; + + case 3: + $number = (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64); + break; + + case 2: + $number = (($values[0] % 32) * 64) + ($values[1] % 64); + break; + } $unicode[] = $number; $values = []; diff --git a/Tests/SMSCounterTest.php b/Tests/SMSCounterTest.php index 242c2d8..89b73c5 100755 --- a/Tests/SMSCounterTest.php +++ b/Tests/SMSCounterTest.php @@ -164,6 +164,43 @@ public function testCarriageReturn() $this->assertEquals($expected, $count); } + public function testUnicodeEncodingAndLength() + { + $smsCounter = new SMSCounter(); + + // 1 byte UTF8 + $this->assertEquals([33], $smsCounter->utf8ToUnicode('!')); // U+0021 => 0x21 + $this->assertEquals(1, $smsCounter->count('!')->length); + + if (version_compare(PHP_VERSION, '7.0.0') >= 0) { + $this->assertEquals([127], $smsCounter->utf8ToUnicode("\u{007F}")); // U+007F => 0x7F + $this->assertEquals(1, $smsCounter->count("\u{007F}")->length); + } + + // 2 bytes UTF8 + if (version_compare(PHP_VERSION, '7.0.0') >= 0) { + $this->assertEquals([128], $smsCounter->utf8ToUnicode("\u{0080}")); // U+0080 => 0xC2 0x80 + $this->assertEquals(1, $smsCounter->count("\u{0080}")->length); + } + + $this->assertEquals([2047], $smsCounter->utf8ToUnicode('฿ฟ')); // U+07FF => 0xDF 0xBF + $this->assertEquals(1, $smsCounter->count('฿ฟ')->length); + + // 3 bytes UTF8 + $this->assertEquals([2048], $smsCounter->utf8ToUnicode('เ €')); // U+0800 => 0xE0 0xA0 0x80 + 
$this->assertEquals(1, $smsCounter->count('เ €')->length); + + $this->assertEquals([65535], $smsCounter->utf8ToUnicode('๏ฟฟ')); // U+FFFF => 0xEF 0xBF 0xBF + $this->assertEquals(1, $smsCounter->count('๏ฟฟ')->length); + + // 4 bytes UTF8 + $this->assertEquals([65536], $smsCounter->utf8ToUnicode('๐€€')); // U+10000 => 0xF0 0x90 0x80 0x80 + $this->assertEquals(2, $smsCounter->count('๐€€')->length); + + $this->assertEquals([983295], $smsCounter->utf8ToUnicode('๓ฐƒฟ')); // U+F00FF => 0xF3 0xB0 0x83 0xBF + $this->assertEquals(2, $smsCounter->count('๓ฐƒฟ')->length); + } + public function testUnicode() { $text = '`'; @@ -180,23 +217,67 @@ public function testUnicode() $this->assertEquals($expected, $count); } - public function testUnicodeEmoji() + public function testUnicodeEmojiSingleMessage() { - $text = '๐Ÿ˜Ž๐Ÿ˜Ž'; - + $text = '๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž'; $smsCounter = new SMSCounter(); $count = $smsCounter->count($text); $expected = new \stdClass(); $expected->encoding = SMSCounter::UTF16; - $expected->length = 2; + $expected->length = 70; $expected->per_message = 70; - $expected->remaining = 68; + $expected->remaining = 0; $expected->messages = 1; $this->assertEquals($expected, $count); } + public function testUnicodeEmojiMultiPartMessage() + { + // A char is lost at the end of first part + $text = '๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž'; + $smsCounter = new SMSCounter(); + $count = $smsCounter->count($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::UTF16; + $expected->length = 72; + $expected->per_message = 67; + $expected->remaining = 61; + $expected->messages = 2; + + $this->assertEquals($expected, $count); + + // First part is completed with a dash char (-) + $text = 
'๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž-๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž'; + $smsCounter = new SMSCounter(); + $count = $smsCounter->count($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::UTF16; + $expected->length = 73; + $expected->per_message = 67; + $expected->remaining = 61; + $expected->messages = 2; + + $this->assertEquals($expected, $count); + + // Both parts are completed with dash chars (-) + $text = '๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž-๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž๐Ÿ˜Ž-'; + $smsCounter = new SMSCounter(); + $count = $smsCounter->count($text); + + $expected = new \stdClass(); + $expected->encoding = SMSCounter::UTF16; + $expected->length = 134; + $expected->per_message = 67; + $expected->remaining = 0; + $expected->messages = 2; + + $this->assertEquals($expected, $count); + } + public function testRemoveNonGSMChars() { $text = 'รกno-unicode-remaining` รฑ';