1#include "include/pw.h"
2#include "src/types/string/string_internal.h"
3
/*
 * Decode one UTF-8 sequence from a NUL-terminated byte string.
 *
 * On entry `*str` points to the first byte of the sequence.
 * On return `*str` always points to the first byte that was not consumed,
 * so repeated calls make progress on any input.
 *
 * Return value:
 *   - the decoded code point for a well-formed sequence; a single NUL byte
 *     decodes to 0 and is consumed like any other one-byte character;
 *   - 0 if the terminating NUL is met inside a multi-byte sequence;
 *     `*str` is left pointing at the terminator, never past it;
 *   - 0xFFFFFFFF for malformed input:
 *       - invalid lead byte: exactly one byte is consumed;
 *       - invalid continuation byte: the bytes before it are consumed,
 *         the offending byte is left for the next call;
 *       - U+0000 encoded with two or more bytes, or a surrogate
 *         (U+D800..U+DFFF): the whole sequence is consumed.
 */
char32_t _pw_decode_utf8_char(uint8_t** str)
{
    uint8_t* p = *str;
    uint8_t lead = *p++;

    if (lead < 0x80) {
        // one-byte sequence, including the terminating NUL
        *str = p;
        return lead;
    }

    char32_t codepoint;
    unsigned num_cont;  // continuation bytes announced by the lead byte

    if ((lead & 0xE0) == 0xC0) {
        codepoint = lead & 0x1F;
        num_cont = 1;
    } else if ((lead & 0xF0) == 0xE0) {
        codepoint = lead & 0x0F;
        num_cont = 2;
    } else if ((lead & 0xF8) == 0xF0) {
        codepoint = lead & 0x07;
        num_cont = 3;
    } else {
        // stray continuation byte or 0xF8..0xFF: skip this single byte
        *str = p;
        return 0xFFFFFFFF;
    }

    for (; num_cont != 0; num_cont--) {
        // peek first: the byte is consumed only if it is a valid continuation
        uint8_t next = *p;
        if (next == 0) {
            // string ends in the middle of the sequence, stay on the terminator
            *str = p;
            return 0;
        }
        if ((next & 0xC0) != 0x80) {
            // leave the offending byte in place, it may start a valid sequence
            *str = p;
            return 0xFFFFFFFF;
        }
        p++;
        codepoint = (codepoint << 6) | (next & 0x3F);
    }

    if (codepoint == 0) {
        // zero code point encoded with 2 or more bytes,
        // make it invalid to avoid mixing up with 1-byte null character
        codepoint = 0xFFFFFFFF;
    } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
        // surrogates are prohibited in UTF-8, return invalid code point for them
        codepoint = 0xFFFFFFFF;
    }
    *str = p;
    return codepoint;
}
58
/*
 * Decode the UTF-8 sequence that ends right before `*ptr`, i.e. step one
 * character backwards.
 *
 * On return `*ptr` points to the first byte of the decoded sequence.
 * If the bytes before `*ptr` do not form exactly one sequence ending at
 * `*ptr`, 0xFFFFFFFF is returned and `*ptr` is moved back by exactly one
 * byte, so repeated calls always make progress.
 *
 * U+0000 encoded with two or more bytes and surrogates (U+D800..U+DFFF)
 * yield 0xFFFFFFFF with the whole sequence consumed.
 *
 * NOTE(review): the function is not given the start of the buffer. It reads
 * at least one and at most four bytes before `*ptr`; the caller has to
 * guarantee these reads stay inside the buffer.
 */
char32_t _pw_decode_utf8_char_reverse(uint8_t** ptr)
{
    uint8_t* end_ptr = *ptr;
    uint8_t* p = end_ptr;
    uint8_t c;
    unsigned num_cont = 0;  // continuation bytes seen while seeking backwards

    // seek to the start of UTF-8 sequence:
    // a lead byte is followed by no more than 3 continuation bytes
    for (;;) {
        c = *--p;
        if (c < 0x80) {
            if (num_cont != 0) {
                // ASCII character followed by stray continuation bytes
                goto bad_utf8;
            }
            *ptr = p;
            return c;
        }
        if ((c & 0xC0) != 0x80) {
            break;  // lead byte found
        }
        if (++num_cont > 3) {
            goto bad_utf8;
        }
    }

    char32_t codepoint;
    unsigned expected;  // continuation bytes announced by the lead byte

    if ((c & 0xE0) == 0xC0) {
        codepoint = c & 0x1F;
        expected = 1;
    } else if ((c & 0xF0) == 0xE0) {
        codepoint = c & 0x0F;
        expected = 2;
    } else if ((c & 0xF8) == 0xF0) {
        codepoint = c & 0x07;
        expected = 3;
    } else {
        goto bad_utf8;
    }
    if (num_cont != expected) {
        // the sequence is either incomplete or does not end at end_ptr
        goto bad_utf8;
    }

    // all bytes in between were already checked to be continuation bytes
    for (uint8_t* np = p + 1; np < end_ptr; np++) {
        codepoint = (codepoint << 6) | (*np & 0x3F);
    }

    if (codepoint == 0) {
        // zero code point encoded with 2 or more bytes,
        // make it invalid to avoid mixing up with 1-byte null character
        codepoint = 0xFFFFFFFF;
    } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
        // surrogates are prohibited in UTF-8, return invalid code point for them
        codepoint = 0xFFFFFFFF;
    }
    *ptr = p;
    return codepoint;

bad_utf8:
    // wrong sequence: always step back by one byte
    *ptr = end_ptr - 1;
    return 0xFFFFFFFF;
}
131
/*
 * Decode one UTF-8 sequence from a buffer of known size.
 *
 * `*ptr` is the read position and `*bytes_remaining` the number of bytes
 * available from there.
 *
 * Returns false, leaving all three outputs untouched, when the buffer is
 * empty or holds fewer bytes than the lead byte announces (the caller may
 * retry after appending more data).
 *
 * Otherwise returns true, stores the result in `*result` and advances
 * `*ptr` / `*bytes_remaining` past the consumed bytes. `*result` is
 * 0xFFFFFFFF for malformed input:
 *   - invalid lead byte: one byte is consumed;
 *   - invalid continuation byte: the bytes before it are consumed, the
 *     offending byte itself is left for the next call so that a valid
 *     character following a broken sequence is not lost;
 *   - U+0000 encoded with two or more bytes, or a surrogate
 *     (U+D800..U+DFFF): the whole sequence is consumed.
 */
bool _pw_decode_utf8_buffer(uint8_t** ptr, unsigned* bytes_remaining, char32_t* result)
{
    unsigned remaining = *bytes_remaining;
    if (remaining == 0) {
        return false;
    }

    uint8_t* p = *ptr;
    uint8_t lead = *p++;
    remaining--;

    char32_t codepoint;
    unsigned num_cont;  // continuation bytes announced by the lead byte
    bool well_formed = true;

    if (lead < 0x80) {
        codepoint = lead;
        num_cont = 0;
    } else if ((lead & 0xE0) == 0xC0) {
        codepoint = lead & 0x1F;
        num_cont = 1;
    } else if ((lead & 0xF0) == 0xE0) {
        codepoint = lead & 0x0F;
        num_cont = 2;
    } else if ((lead & 0xF8) == 0xF0) {
        codepoint = lead & 0x07;
        num_cont = 3;
    } else {
        // stray continuation byte or 0xF8..0xFF: skip this single byte
        codepoint = 0xFFFFFFFF;
        num_cont = 0;
        well_formed = false;
    }

    if (remaining < num_cont) {
        // incomplete sequence at the end of buffer, nothing is consumed
        return false;
    }

    for (; num_cont != 0; num_cont--) {
        // peek first: the byte is consumed only if it is a valid continuation
        uint8_t next = *p;
        if ((next & 0xC0) != 0x80) {
            well_formed = false;
            break;
        }
        p++;
        remaining--;
        codepoint = (codepoint << 6) | (next & 0x3F);
    }

    if (!well_formed) {
        codepoint = 0xFFFFFFFF;
    } else if (lead >= 0x80) {
        if (codepoint == 0) {
            // zero code point encoded with 2 or more bytes,
            // make it invalid to avoid mixing up with 1-byte null character
            codepoint = 0xFFFFFFFF;
        } else if (0xD800 <= codepoint && codepoint <= 0xDFFF) {
            // surrogates are prohibited in UTF-8, return invalid code point for them
            codepoint = 0xFFFFFFFF;
        }
    }

    *ptr = p;
    *bytes_remaining = remaining;
    *result = codepoint;
    return true;
}
195
/*
 * Encode `codepoint` as UTF-8 into `buffer` and return the number of bytes
 * written (1 to 4). No terminator is appended and the code point is not
 * validated; for the 4-byte form only the low 21 bits are encoded.
 *
 *   U+0000   - U+007F     0xxxxxxx
 *   U+0080   - U+07FF     110xxxxx 10xxxxxx
 *   U+0800   - U+FFFF     1110xxxx 10xxxxxx 10xxxxxx
 *   U+010000 - U+10FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 */
unsigned pw_char32_to_utf8(char32_t codepoint, char* buffer)
{
    if (codepoint < 0x80) {
        buffer[0] = (char) codepoint;
        return 1;
    }

    // the trailing byte has the same layout in every multi-byte form
    char last = (char) (0x80 | (codepoint & 0x3F));

    if (codepoint < 0x800) {
        buffer[0] = (char) (0xC0 | (codepoint >> 6));
        buffer[1] = last;
        return 2;
    }

    char before_last = (char) (0x80 | ((codepoint >> 6) & 0x3F));

    if (codepoint < 0x10000) {
        buffer[0] = (char) (0xE0 | (codepoint >> 12));
        buffer[1] = before_last;
        buffer[2] = last;
        return 3;
    }

    buffer[0] = (char) (0xF0 | ((codepoint >> 18) & 0x07));
    buffer[1] = (char) (0x80 | ((codepoint >> 12) & 0x3F));
    buffer[2] = before_last;
    buffer[3] = last;
    return 4;
}
225
226unsigned utf8_strlen(char8_t* str)
227{
228 unsigned length = 0;
229 while(*str != 0) {
230 char32_t c = _pw_decode_utf8_char(&str);
231 if (c != 0xFFFFFFFF) {
232 length++;
233 }
234 }
235 return length;
236}
237
238unsigned utf8_strlen2(char8_t* str, uint8_t* char_size)
239{
240 unsigned length = 0;
241 char32_t width = 0;
242 while(*str != 0) {
243 char32_t c = _pw_decode_utf8_char(&str);
244 if (c != 0xFFFFFFFF) {
245 width |= c;
246 length++;
247 }
248 }
249 *char_size = calc_char_size(width);
250 return length;
251}
252
253unsigned utf8_strlen3(char8_t* str, uint8_t* char_size, char8_t** end_ptr)
254{
255 unsigned length = 0;
256 char32_t width = 0;
257 while(*str != 0) {
258 char32_t c = _pw_decode_utf8_char(&str);
259 if (c != 0xFFFFFFFF) {
260 width |= c;
261 length++;
262 }
263 }
264 *char_size = calc_char_size(width);
265 *end_ptr = str;
266 return length;
267}
268
269unsigned utf8_strlen2_buf(char8_t* buffer, unsigned* size, uint8_t* char_size)
270{
271 char8_t* ptr = buffer;
272 unsigned bytes_remaining = *size;
273 unsigned length = 0;
274 char32_t width = 0;
275
276 while (bytes_remaining) {
277 char32_t c;
278 if (!_pw_decode_utf8_buffer(&ptr, &bytes_remaining, &c)) {
279 break;
280 }
281 if (c != 0xFFFFFFFF) {
282 width |= c;
283 length++;
284 }
285 }
286 *size -= bytes_remaining;
287
288 if (char_size) {
289 *char_size = calc_char_size(width);
290 }
291
292 return length;
293}
294
295uint8_t utf8_char_size(char8_t* str, unsigned max_len)
296{
297 char32_t width = 0;
298 while(*str != 0) {
299 char32_t c = _pw_decode_utf8_char(&str);
300 if (c != 0xFFFFFFFF) {
301 width |= c;
302 }
303 }
304 return calc_char_size(width);
305}
306
307char8_t* utf8_skip(char8_t* str, unsigned n)
308{
309 while(n--) {
310 _pw_decode_utf8_char(&str);
311 if (*str == 0) {
312 break;
313 }
314 }
315 return str;
316}
317
/*
 * Write `codepoint` to `fp` encoded as UTF-8, one byte at a time.
 * Return values of fputc are not checked.
 */
void _pw_putchar32_utf8(FILE* fp, char32_t codepoint)
{
    char encoded[5];
    unsigned num_bytes = pw_char32_to_utf8(codepoint, encoded);

    for (unsigned i = 0; i < num_bytes; i++) {
        fputc(encoded[i], fp);
    }
}
327
/*
 * Return the number of characters in a zero-terminated UTF-32 string,
 * terminator excluded.
 */
unsigned utf32_strlen(char32_t* str)
{
    const char32_t* cursor = str;

    // advance to the terminator, the distance travelled is the length
    while (*cursor != 0) {
        cursor++;
    }
    return (unsigned) (cursor - str);
}
336
/*
 * Return the number of characters in a zero-terminated UTF-32 string and
 * store to `*char_size` the result of calc_char_size() applied to all
 * characters OR-ed together.
 */
unsigned utf32_strlen2(char32_t* str, uint8_t* char_size)
{
    char32_t combined = 0;
    unsigned count = 0;

    for (; str[count] != 0; count++) {
        combined |= str[count];
    }
    *char_size = calc_char_size(combined);
    return count;
}
349
350/*
351int utf32_strcmp(char32_t* a, char32_t* b)
352{
353 if (a == b) {
354 return 0;
355 }
356 for (;;) {
357 char32_t ca = *a++;
358 char32_t cb = *b++;
359 if (ca < cb) {
360 return -1;
361 } else if (ca > cb) {
362 return 1;
363 } else if (ca == 0) {
364 return 0;
365 }
366 }
367}
368
369int utf32_strcmp_utf8(char32_t* a, char8_t* b)
370{
371 for (;;) {
372 char32_t ca = *a++;
373 char32_t cb = _pw_decode_utf8_char(&b);
374 if (ca < cb) {
375 return -1;
376 } else if (ca > cb) {
377 return 1;
378 } else if (ca == 0) {
379 return 0;
380 }
381 }
382}
383*/
384
/*
 * Find the first occurrence of `chr` in a zero-terminated UTF-32 string.
 *
 * Returns a pointer to the matching character or a null pointer if there is
 * no match. The terminator is never matched, so searching for 0 always
 * yields a null pointer.
 */
char32_t* utf32_strchr(char32_t* str, char32_t chr)
{
    for (; *str != 0; str++) {
        if (*str == chr) {
            return str;
        }
    }
    return NULL;
}
396
/*
 * Report the character size for a UTF-32 string: at most `max_len`
 * characters are inspected, stopping early at a zero terminator.
 * The inspected characters are OR-ed together and the result is passed
 * to calc_char_size().
 */
uint8_t utf32_char_size(char32_t* str, unsigned max_len)
{
    char32_t combined = 0;

    for (unsigned i = 0; i < max_len; i++) {
        char32_t c = str[i];
        if (c == 0) {
            break;
        }
        combined |= c;
    }
    return calc_char_size(combined);
}
409
410unsigned pw_strlen_in_utf8(PwValuePtr str)
411{
412 unsigned length = 0;
413 PwStringIter iter;
414 _pw_string_iter(str, &iter);
415 char32_t c;
416 while (_pw_string_iter_next(&iter, &c)) {
417 if (c < 0x80) {
418 length++;
419 } else if (c < 0b1'00000'000000) {
420 length += 2;
421 } else if (c < 0b1'0000'000000'000000) {
422 length += 3;
423 } else {
424 length += 4;
425 }
426 }
427 return length;
428}
429
430// integral types:
431
/*
 * Generate _cp_to_utf8_<type>(): encode `length` characters stored as an
 * array of <type> starting at `self_ptr` into `dest` as UTF-8 and append
 * a terminating zero byte.
 */
#define STR_COPY_TO_UTF8_IMPL(type_name_self)  \
    static void _cp_to_utf8_##type_name_self(uint8_t* self_ptr, char* dest, unsigned length)  \
    {  \
        type_name_self* chars = (type_name_self*) self_ptr;  \
        for (unsigned i = 0; i < length; i++) {  \
            dest += pw_char32_to_utf8(chars[i], dest);  \
        }  \
        *dest = 0;  \
    }

STR_COPY_TO_UTF8_IMPL(uint8_t)
STR_COPY_TO_UTF8_IMPL(uint16_t)
STR_COPY_TO_UTF8_IMPL(uint32_t)
445
446// uint24_t
447
/*
 * Encode `length` characters stored as 3-byte little-endian values starting
 * at `self_ptr` into `dest` as UTF-8 and append a terminating zero byte.
 */
static void _cp_to_utf8_uint24_t(uint8_t* self_ptr, char* dest, unsigned length)
{
    for (; length != 0; length--, self_ptr += 3) {
        // assemble the character from its bytes, least significant first
        char32_t c = (char32_t) self_ptr[0]
                   | ((char32_t) self_ptr[1] << 8)
                   | ((char32_t) self_ptr[2] << 16);
        dest += pw_char32_to_utf8(c, dest);
    }
    *dest = 0;
}
458
459typedef void (*CopyToUtf8)(uint8_t* self_ptr, char* dest_ptr, unsigned length);
460
461static CopyToUtf8 _pw_copy_to_utf8_variants[5] = {
462 nullptr,
463 _cp_to_utf8_uint8_t,
464 _cp_to_utf8_uint16_t,
465 _cp_to_utf8_uint24_t,
466 _cp_to_utf8_uint32_t
467};
468
469void pw_string_to_utf8(PwValuePtr str, char* buffer)
470{
471 pw_hard_assert(pw_is_string(str));
472 unsigned length;
473 uint8_t* ptr = _pw_string_start_length(str, &length);
474 CopyToUtf8 fn_copy_to_utf8 = _pw_copy_to_utf8_variants[str->str_params.char_size];
475 fn_copy_to_utf8(ptr, buffer, length);
476}
477
478void pw_substr_to_utf8(PwValuePtr str, unsigned start_pos, unsigned end_pos, char* buffer)
479{
480 pw_hard_assert(pw_is_string(str));
481 unsigned length;
482 uint8_t* ptr = _pw_string_start_length(str, &length);
483 if (end_pos >= length) {
484 end_pos = length;
485 }
486 if (end_pos <= start_pos) {
487 *buffer = 0;
488 return;
489 }
490 CopyToUtf8 fn_copy_to_utf8 = _pw_copy_to_utf8_variants[str->str_params.char_size];
491 fn_copy_to_utf8(
492 ptr + start_pos * str->str_params.char_size,
493 buffer,
494 end_pos - start_pos
495 );
496}