Skip to content

Commit

Permalink
Merge pull request #16 from juliangut/fix/unicode
Browse files Browse the repository at this point in the history
fix unicode chars over U+10000 length detection
  • Loading branch information
juliangut authored Jun 6, 2019
2 parents d80bcbf + c5d7489 commit a0f017d
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 9 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ stdClass Object
)
```

##### UTF16 notice

When using Unicode characters at or above U+10000 (mainly emoji such as 😎) in messages longer than 70 chars, the _remaining_ value will actually be the **remaining chars in the last message part only**. This is because each of those characters is encoded as two 16-bit chars (a surrogate pair) while the maximum part length is an odd number (67), so such a character may not fit in the last slot of a part.

#### Sanitization

You can sanitize your text to be a valid strict GSM 03.38 charset

```php
Expand Down
49 changes: 45 additions & 4 deletions SMSCounter.php
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,21 @@ private function doCount($text, $supportShiftTables)
// Each exchar in the GSM 7 Bit encoding takes one more space
// Hence the length increases by one char for each of those Ex chars.
$length += $lengthExchars;
} elseif ($encoding === self::UTF16) {
// Unicode chars at or above U+10000 take two 16-bit code units in UTF-16, i.e. one extra unit each
$lengthExtra = array_reduce(
$unicodeArray,
function ($carry, $char) {
if ($char >= 65536) {
$carry++;
}

return $carry;
},
0
);

$length += $lengthExtra;
}

// Select the per message length according to encoding and the message length
Expand Down Expand Up @@ -244,7 +259,23 @@ private function doCount($text, $supportShiftTables)
}

$messages = (int) ceil($length / $perMessage);
$remaining = ($perMessage * $messages) - $length;

if ($encoding === self::UTF16 && $length > $perMessage) {
$count = 0;
foreach ($unicodeArray as $char) {
if ($count === $perMessage) {
$count = 0;
} elseif ($count > $perMessage) {
$count = 2;
}

$count += $char >= 65536 ? 2 : 1;
}

$remaining = $perMessage - ($count > $perMessage ? 2 : $count);
} else {
$remaining = ($perMessage * $messages) - $length;
}

$returnset = new \stdClass();

Expand Down Expand Up @@ -355,9 +386,19 @@ public function utf8ToUnicode($str)
$values[] = $thisValue;

if (count($values) === $lookingFor) {
$number = ($lookingFor === 3) ?
(($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64) :
(($values[0] % 32) * 64) + ($values[1] % 64);
switch ($lookingFor) {
case 4:
$number = (($values[0] % 16) * 262144) + (($values[1] % 64) * 4096) + (($values[2] % 64) * 64) + ($values[3] % 64);
break;

case 3:
$number = (($values[0] % 16) * 4096) + (($values[1] % 64) * 64) + ($values[2] % 64);
break;

case 2:
$number = (($values[0] % 32) * 64) + ($values[1] % 64);
break;
}

$unicode[] = $number;
$values = [];
Expand Down
91 changes: 86 additions & 5 deletions Tests/SMSCounterTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,43 @@ public function testCarriageReturn()
$this->assertEquals($expected, $count);
}

/**
 * Checks utf8ToUnicode() decoding and the counted length at the lowest and
 * highest code points of each UTF-8 sequence size (1 to 4 bytes).
 *
 * Every input is written as explicit "\x.." byte escapes rather than as a raw
 * character, so the bytes under test do not depend on how this source file is
 * saved or transcoded, and no PHP 7 "\u{...}" syntax (nor a version guard
 * around it) is needed.
 */
public function testUnicodeEncodingAndLength()
{
    $smsCounter = new SMSCounter();

    // 1 byte UTF8
    $this->assertEquals([33], $smsCounter->utf8ToUnicode('!')); // U+0021 => 0x21
    $this->assertEquals(1, $smsCounter->count('!')->length);

    $this->assertEquals([127], $smsCounter->utf8ToUnicode("\x7F")); // U+007F => 0x7F
    $this->assertEquals(1, $smsCounter->count("\x7F")->length);

    // 2 bytes UTF8
    $this->assertEquals([128], $smsCounter->utf8ToUnicode("\xC2\x80")); // U+0080 => 0xC2 0x80
    $this->assertEquals(1, $smsCounter->count("\xC2\x80")->length);

    $this->assertEquals([2047], $smsCounter->utf8ToUnicode("\xDF\xBF")); // U+07FF => 0xDF 0xBF
    $this->assertEquals(1, $smsCounter->count("\xDF\xBF")->length);

    // 3 bytes UTF8
    $this->assertEquals([2048], $smsCounter->utf8ToUnicode("\xE0\xA0\x80")); // U+0800 => 0xE0 0xA0 0x80
    $this->assertEquals(1, $smsCounter->count("\xE0\xA0\x80")->length);

    $this->assertEquals([65535], $smsCounter->utf8ToUnicode("\xEF\xBF\xBF")); // U+FFFF => 0xEF 0xBF 0xBF
    $this->assertEquals(1, $smsCounter->count("\xEF\xBF\xBF")->length);

    // 4 bytes UTF8: code points from U+10000 up are counted as two chars
    $this->assertEquals([65536], $smsCounter->utf8ToUnicode("\xF0\x90\x80\x80")); // U+10000 => 0xF0 0x90 0x80 0x80
    $this->assertEquals(2, $smsCounter->count("\xF0\x90\x80\x80")->length);

    $this->assertEquals([983295], $smsCounter->utf8ToUnicode("\xF3\xB0\x83\xBF")); // U+F00FF => 0xF3 0xB0 0x83 0xBF
    $this->assertEquals(2, $smsCounter->count("\xF3\xB0\x83\xBF")->length);
}

public function testUnicode()
{
$text = '`';
Expand All @@ -180,23 +217,67 @@ public function testUnicode()
$this->assertEquals($expected, $count);
}

public function testUnicodeEmoji()
public function testUnicodeEmojiSingleMessage()
{
$text = '😎😎';

$text = '😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎😎';
$smsCounter = new SMSCounter();
$count = $smsCounter->count($text);

$expected = new \stdClass();
$expected->encoding = SMSCounter::UTF16;
$expected->length = 2;
$expected->length = 70;
$expected->per_message = 70;
$expected->remaining = 68;
$expected->remaining = 0;
$expected->messages = 1;

$this->assertEquals($expected, $count);
}

/**
 * Verifies the counters for emoji texts that need two UTF-16 message parts.
 *
 * Each emoji is counted as two chars and a part holds 67 chars, so three
 * layouts are checked: a part boundary that would fall inside an emoji, a
 * first part topped up to exactly 67 with a dash, and two parts both topped up.
 */
public function testUnicodeEmojiMultiPartMessage()
{
    $emoji = '😎';

    // A char is lost at the end of first part (36 emoji, no filler)
    $count = (new SMSCounter())->count(str_repeat($emoji, 36));
    $expected = (object) [
        'encoding' => SMSCounter::UTF16,
        'length' => 72,
        'per_message' => 67,
        'remaining' => 61,
        'messages' => 2,
    ];
    $this->assertEquals($expected, $count);

    // First part is completed with a dash char (-): 33 emoji, dash, 3 emoji
    $count = (new SMSCounter())->count(str_repeat($emoji, 33) . '-' . str_repeat($emoji, 3));
    $expected = (object) [
        'encoding' => SMSCounter::UTF16,
        'length' => 73,
        'per_message' => 67,
        'remaining' => 61,
        'messages' => 2,
    ];
    $this->assertEquals($expected, $count);

    // Both parts are completed with dash chars (-): twice 33 emoji plus a dash
    $count = (new SMSCounter())->count(str_repeat(str_repeat($emoji, 33) . '-', 2));
    $expected = (object) [
        'encoding' => SMSCounter::UTF16,
        'length' => 134,
        'per_message' => 67,
        'remaining' => 0,
        'messages' => 2,
    ];
    $this->assertEquals($expected, $count);
}

public function testRemoveNonGSMChars()
{
$text = 'Γ‘no-unicode-remaining` Γ±';
Expand Down

0 comments on commit a0f017d

Please sign in to comment.