-
Notifications
You must be signed in to change notification settings - Fork 7
/
utf8.c
150 lines (122 loc) · 4.03 KB
/
utf8.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/* utf8.c -- implements utf8.h, conversion between unicode and UTF-8 */
#include "utf8.h"
#include <assert.h>
#include <wchar.h> /* either _XOPEN_SOURCE or _GNU_SOURCE */
/* Display width of UTF-8 character */
int _utf8_width( unicode_t c) {
#if __SIZEOF_WCHAR_T__ == 2 /* wcwidth only supports UTF-16 */
return (c < 0x10000) ? wcwidth( (wchar_t) c) : -1 ;
#else
return wcwidth( (wchar_t) c) ;
#endif
}
int utf8_width( unicode_t c) {
int w = _utf8_width( c) ;
return (w < 0) ? 2 : w ; /* display \u if can't figure out width */
}
/* utf8_to_unicode()
*
* Convert a UTF-8 sequence to its unicode value, and return the length of
* the sequence in bytes.
*
* NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can
* either use it as-is (ie as Latin1) or you can check for invalid UTF-8
* by checking for a length of 1 and a result > 127.
*
* NOTE 2! This does *not* verify things like minimality. So overlong forms
* are happily accepted and decoded, as are the various "invalid values".
*/
unsigned utf8_to_unicode( const char *line, unsigned index, unsigned len,
unicode_t *res) {
assert( index < len) ;
unsigned c = *res = (unsigned char) line[ index] ;
/* 0xxxxxxx is valid one byte utf8
* 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
* 1100000x is start of overlong encoding sequence
* Sequence longer than 4 bytes are invalid
* Last valid code is 0x10FFFF, encoding start with 0xF4
*/
if( c <= 0xC1 || c > 0xF4)
return 1 ;
/* Ok, it's 11xxxxxx, do a stupid decode */
unsigned mask = 0x20 ;
unsigned bytes = 2 ;
while( (c & mask) != 0) {
bytes++ ;
mask >>= 1 ;
}
/* bytes is in range [2..4] as c was in range [C2..F4] */
len -= index ;
if( bytes > len)
return 1 ;
unicode_t value = c & (mask - 1) ;
/* Ok, do the bytes */
line += index ;
for( unsigned i = 2 ; i <= bytes ; i++) {
c = (unsigned char) *++line ;
if( (c & 0xc0) != 0x80)
return 1 ;
value = (value << 6) | (c & 0x3f) ;
}
if( value > 0x10FFFF) /* Avoid 110000 - 13FFFF */
return 1 ;
*res = value ;
return bytes ;
}
/* unicode_to_utf8()
*
* Convert a unicode value to its canonical utf-8 sequence.
*
* NOTE! This does not check for - or care about - the "invalid" unicode
* values. Also, converting a utf-8 sequence to unicode and back does
* *not* guarantee the same sequence, since this generates the shortest
* possible sequence, while utf8_to_unicode() accepts both Latin1 and
* overlong utf-8 sequences.
*/
unsigned unicode_to_utf8( unicode_t c, char *utf8) {
unsigned bytes = 1 ;
assert( c <= 0x10FFFF) ;
#ifdef NDEBUG
if( c > 0x10FFFF) /* Let's assume this is due to sign extension */
c &= 0xFF ;
#endif
if( c <= 0x7f)
*utf8 = (char) c ;
else {
unsigned prefix = 0x40 ;
char *p = utf8 ;
do {
*p++ = (char) (0x80 + (c & 0x3f)) ;
bytes++ ;
prefix >>= 1 ;
c >>= 6 ;
} while( c >= prefix) ;
*p-- = *utf8 ;
*utf8++ = (char) (c - 2 * prefix) ;
if( utf8 < p) { /* swap middle two bytes if 4 bytes utf-8 code */
char c = *p ;
*p = *utf8 ;
*utf8 = c ;
}
}
return bytes ;
}
unsigned utf8_revdelta( unsigned char *p, unsigned pos) {
unsigned delta = 0 ;
if( (*p & 0xC0) == 0x80) {
unsigned char c ;
c = *--p ;
if( (c & 0xE0) == 0xC0) /* valid 2 bytes unicode seq */
delta = 1 ;
else if( ((c & 0xC0) == 0x80) && (pos > 1)) {
c = *--p ;
if( (c & 0xF0) == 0xE0) /* valid 3 bytes unicode seq */
delta = 2 ;
else if( ((c & 0xC0) == 0x80) && (pos > 2))
if( (p[ -1] & 0xF8) == 0xF0) /* valid 4 bytes unicode seq */
delta = 3 ;
}
}
return delta ;
}
/* end of utf8.c */