From 123e54c9e88a2d8c3ffa9c604d6d5de0170aca04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juli=C3=A1n=20Guti=C3=A9rrez?= <juliangut@gmail.com>
Date: Mon, 3 Jun 2019 12:19:47 +0200
Subject: [PATCH 1/3] fix unicode char encoding over U+1F210

---
 SMSCounter.php           | 16 +++++++++++++---
 Tests/SMSCounterTest.php | 21 ++++++++++++++++-----
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/SMSCounter.php b/SMSCounter.php
index 3063618..a9b0dab 100644
--- a/SMSCounter.php
+++ b/SMSCounter.php
@@ -355,9 +355,19 @@ public function utf8ToUnicode($str)
                 $values[] = $thisValue;
 
                 if (count($values) === $lookingFor) {
-                    $number = ($lookingFor === 3) ?
-                    (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64) :
-                    (($values[0] % 32) * 64) + ($values[1] % 64);
+                    switch ($lookingFor) {
+                        case 4:
+                            $number = (($values[0] % 16) * 262144) + (($values[1] % 64) * 4096) + (($values[2] % 64) * 64) + ($values[3] % 64);
+                            break;
+
+                        case 3:
+                            $number = (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64);
+                            break;
+
+                        case 2:
+                            $number = (($values[0] % 32) * 64) + ($values[1] % 64);
+                            break;
+                    }
 
                     $unicode[] = $number;
                     $values = [];
diff --git a/Tests/SMSCounterTest.php b/Tests/SMSCounterTest.php
index 242c2d8..f3447e8 100755
--- a/Tests/SMSCounterTest.php
+++ b/Tests/SMSCounterTest.php
@@ -164,6 +164,17 @@ public function testCarriageReturn()
         $this->assertEquals($expected, $count);
     }
 
+    public function testUnicodeChars()
+    {
+        $smsCounter = new SMSCounter();
+
+        $this->assertEquals([96], $smsCounter->utf8ToUnicode('`')); // U+60
+        $this->assertEquals([882], $smsCounter->utf8ToUnicode('Ͳ')); // U+0372
+        $this->assertEquals([2210], $smsCounter->utf8ToUnicode('ࢢ')); // U+08A2
+        $this->assertEquals([11821], $smsCounter->utf8ToUnicode('⸭')); // U+2E2D
+        $this->assertEquals([128526], $smsCounter->utf8ToUnicode('😎')); // U+1F60E
+    }
+
     public function testUnicode()
     {
         $text = '`';
@@ -182,17 +193,17 @@ public function testUnicode()
 
     public function testUnicodeEmoji()
     {
-        $text = '😎😎';
+        $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎';
 
         $smsCounter = new SMSCounter();
         $count = $smsCounter->count($text);
 
         $expected = new \stdClass();
         $expected->encoding = SMSCounter::UTF16;
-        $expected->length = 2;
-        $expected->per_message = 70;
-        $expected->remaining = 68;
-        $expected->messages = 1;
+        $expected->length = 77;
+        $expected->per_message = 67;
+        $expected->remaining = 57;
+        $expected->messages = 2;
 
         $this->assertEquals($expected, $count);
     }

From c475537f12ca8a0c0c37004eaf8dde3ad57508fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juli=C3=A1n=20Guti=C3=A9rrez?= <juliangut@gmail.com>
Date: Tue, 4 Jun 2019 16:32:02 +0200
Subject: [PATCH 2/3] double bytes for unicode chars over U+10000

---
 SMSCounter.php           | 15 ++++++++++++++
 Tests/SMSCounterTest.php | 44 ++++++++++++++++++++++++++++++++--------
 2 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/SMSCounter.php b/SMSCounter.php
index a9b0dab..1d0c8b2 100644
--- a/SMSCounter.php
+++ b/SMSCounter.php
@@ -216,6 +216,21 @@ private function doCount($text, $supportShiftTables)
             // Each exchar in the GSM 7 Bit encoding takes one more space
             // Hence the length increases by one char for each of those Ex chars.
             $length += $lengthExchars;
+        } elseif ($encoding === self::UTF16) {
+            // Unicode chars at or above U+10000 take an extra 16-bit UTF-16 unit (surrogate pair)
+            $lengthExtra = array_reduce(
+                $unicodeArray,
+                function ($carry, $char) {
+                    if ($char >= 65536) {
+                        $carry++;
+                    }
+
+                    return $carry;
+                },
+                0
+            );
+
+            $length += $lengthExtra;
         }
 
         // Select the per message length according to encoding and the message length
diff --git a/Tests/SMSCounterTest.php b/Tests/SMSCounterTest.php
index f3447e8..7eb07ae 100755
--- a/Tests/SMSCounterTest.php
+++ b/Tests/SMSCounterTest.php
@@ -164,15 +164,41 @@ public function testCarriageReturn()
         $this->assertEquals($expected, $count);
     }
 
-    public function testUnicodeChars()
+    public function testUnicodeEncodingAndLength()
     {
         $smsCounter = new SMSCounter();
 
-        $this->assertEquals([96], $smsCounter->utf8ToUnicode('`')); // U+60
-        $this->assertEquals([882], $smsCounter->utf8ToUnicode('Ͳ')); // U+0372
-        $this->assertEquals([2210], $smsCounter->utf8ToUnicode('ࢢ')); // U+08A2
-        $this->assertEquals([11821], $smsCounter->utf8ToUnicode('⸭')); // U+2E2D
-        $this->assertEquals([128526], $smsCounter->utf8ToUnicode('😎')); // U+1F60E
+        // 1 byte UTF8
+        $this->assertEquals([33], $smsCounter->utf8ToUnicode('!')); // U+0021 => 0x21
+        $this->assertEquals(1, $smsCounter->count('!')->length);
+
+        if (version_compare(PHP_VERSION, '7.0.0') >= 0) {
+            $this->assertEquals([127], $smsCounter->utf8ToUnicode("\u{007F}")); // U+007F => 0x7F
+            $this->assertEquals(1, $smsCounter->count("\u{007F}")->length);
+        }
+
+        // 2 bytes UTF8
+        if (version_compare(PHP_VERSION, '7.0.0') >= 0) {
+            $this->assertEquals([128], $smsCounter->utf8ToUnicode("\u{0080}")); // U+0080 => 0xC2 0x80
+            $this->assertEquals(1, $smsCounter->count("\u{0080}")->length);
+        }
+
+        $this->assertEquals([2047], $smsCounter->utf8ToUnicode('߿')); // U+07FF => 0xDF 0xBF
+        $this->assertEquals(1, $smsCounter->count('߿')->length);
+
+        // 3 bytes UTF8
+        $this->assertEquals([2048], $smsCounter->utf8ToUnicode('ࠀ')); // U+0800 => 0xE0 0xA0 0x80
+        $this->assertEquals(1, $smsCounter->count('ࠀ')->length);
+
+        $this->assertEquals([65535], $smsCounter->utf8ToUnicode('￿')); // U+FFFF => 0xEF 0xBF 0xBF
+        $this->assertEquals(1, $smsCounter->count('￿')->length);
+
+        // 4 bytes UTF8
+        $this->assertEquals([65536], $smsCounter->utf8ToUnicode('𐀀')); // U+10000 => 0xF0 0x90 0x80 0x80
+        $this->assertEquals(2, $smsCounter->count('𐀀')->length);
+
+        $this->assertEquals([983295], $smsCounter->utf8ToUnicode('󰃿')); // U+F00FF => 0xF3 0xB0 0x83 0xBF
+        $this->assertEquals(2, $smsCounter->count('󰃿')->length);
     }
 
     public function testUnicode()
@@ -193,16 +219,16 @@ public function testUnicode()
 
     public function testUnicodeEmoji()
     {
-        $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎';
+        $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎';
 
         $smsCounter = new SMSCounter();
         $count = $smsCounter->count($text);
 
         $expected = new \stdClass();
         $expected->encoding = SMSCounter::UTF16;
-        $expected->length = 77;
+        $expected->length = 132;
         $expected->per_message = 67;
-        $expected->remaining = 57;
+        $expected->remaining = 2;
         $expected->messages = 2;
 
         $this->assertEquals($expected, $count);

From c5d7489fe5a43a969abd1c0dcc7cf87f00ac6c45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juli=C3=A1n=20Guti=C3=A9rrez?= <juliangut@gmail.com>
Date: Wed, 5 Jun 2019 13:46:58 +0200
Subject: [PATCH 3/3] return remaining chars in last part of UTF16 messages

---
 README.md                |  6 +++++
 SMSCounter.php           | 18 +++++++++++++-
 Tests/SMSCounterTest.php | 52 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 1629e0e..0232c97 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,12 @@ stdClass Object
 )
 ```
 
+##### UTF16 notice
+
+When a message contains Unicode characters at or above U+10000 (mainly emoji such as 😎) and is longer than 70 characters, the _remaining_ value is the **number of characters remaining in the last message part only**. This is because each of those characters is encoded as two 16-bit units and the maximum part length is an odd number (67), so a character that does not fit at the end of a part is moved entirely to the next part.
+
+#### Sanitization
+
 You can sanitize your text to be a valid strict GSM 03.38 charset
 
 ```php
diff --git a/SMSCounter.php b/SMSCounter.php
index 1d0c8b2..983bec7 100644
--- a/SMSCounter.php
+++ b/SMSCounter.php
@@ -259,7 +259,23 @@ function ($carry, $char) {
         }
 
         $messages = (int) ceil($length / $perMessage);
-        $remaining = ($perMessage * $messages) - $length;
+
+        if ($encoding === self::UTF16 && $length > $perMessage) {
+            $count = 0;
+            foreach ($unicodeArray as $char) {
+                if ($count === $perMessage) {
+                    $count = 0;
+                } elseif ($count > $perMessage) {
+                    $count = 2;
+                }
+
+                $count += $char >= 65536 ? 2 : 1;
+            }
+
+            $remaining = $perMessage - ($count > $perMessage ? 2 : $count);
+        } else {
+            $remaining = ($perMessage * $messages) - $length;
+        }
 
         $returnset = new \stdClass();
 
diff --git a/Tests/SMSCounterTest.php b/Tests/SMSCounterTest.php
index 7eb07ae..89b73c5 100755
--- a/Tests/SMSCounterTest.php
+++ b/Tests/SMSCounterTest.php
@@ -217,18 +217,62 @@ public function testUnicode()
         $this->assertEquals($expected, $count);
     }
 
-    public function testUnicodeEmoji()
+    public function testUnicodeEmojiSingleMessage()
     {
-        $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎';
+        $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎';
+        $smsCounter = new SMSCounter();
+        $count = $smsCounter->count($text);
+
+        $expected = new \stdClass();
+        $expected->encoding = SMSCounter::UTF16;
+        $expected->length = 70;
+        $expected->per_message = 70;
+        $expected->remaining = 0;
+        $expected->messages = 1;
+
+        $this->assertEquals($expected, $count);
+    }
+
+    public function testUnicodeEmojiMultiPartMessage()
+    {
+        // One 16-bit slot is left unused at the end of the first part (an emoji needs two)
+        $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎';
+        $smsCounter = new SMSCounter();
+        $count = $smsCounter->count($text);
+
+        $expected = new \stdClass();
+        $expected->encoding = SMSCounter::UTF16;
+        $expected->length = 72;
+        $expected->per_message = 67;
+        $expected->remaining = 61;
+        $expected->messages = 2;
+
+        $this->assertEquals($expected, $count);
+
+        // First part is completed with a dash char (-)
+        $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎-😎😎😎';
+        $smsCounter = new SMSCounter();
+        $count = $smsCounter->count($text);
+
+        $expected = new \stdClass();
+        $expected->encoding = SMSCounter::UTF16;
+        $expected->length = 73;
+        $expected->per_message = 67;
+        $expected->remaining = 61;
+        $expected->messages = 2;
+
+        $this->assertEquals($expected, $count);
 
+        // Both parts are completed with dash chars (-)
+        $text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎-😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎-';
         $smsCounter = new SMSCounter();
         $count = $smsCounter->count($text);
 
         $expected = new \stdClass();
         $expected->encoding = SMSCounter::UTF16;
-        $expected->length = 132;
+        $expected->length = 134;
         $expected->per_message = 67;
-        $expected->remaining = 2;
+        $expected->remaining = 0;
         $expected->messages = 2;
 
         $this->assertEquals($expected, $count);