Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
UTFConvert.cpp
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include "Stroika/Foundation/StroikaPreComp.h"
5
8#include "Stroika/Foundation/Execution/Exceptions.h"
9
10#include "UTFConvert.h"
11
12using namespace Stroika::Foundation;
14
namespace {
    /*
     *  Thin wrapper that derives from a locale facet and supplies a public destructor, so the facet
     *  can be created and destroyed like an ordinary object.
     *  (idea from https://en.cppreference.com/w/cpp/locale/codecvt)
     */
    template <typename FACET>
    struct deletable_facet_ : FACET {
        // Perfect-forward any constructor arguments to the wrapped facet.
        template <typename... CTOR_ARGS>
        deletable_facet_ (CTOR_ARGS&&... ctorArgs)
            : FACET{std::forward<CTOR_ARGS> (ctorArgs)...}
        {
        }
        // Intentionally user-provided and empty: its only job is to be accessible.
        ~deletable_facet_ ()
        {
        }
    };
}
29
30/**
31 * BASED on
32 * https://github.com/codebrainz/libutfxx/blob/master/utf/ConvertUTF.h
33 * https://github.com/codebrainz/libutfxx/blob/master/utf/ConvertUTF.c
34 * http://docs.ros.org/lunar/api/rtabmap/html/ConvertUTF_8h_source.html,
35 *
36 * but updated for C++.
37 *
38 * Copyright 2001-2004 Unicode, Inc.
39 *
40 * Disclaimer
41 *
42 * This source code is provided as is by Unicode, Inc. No claims are
43 * made as to fitness for any particular purpose. No warranties of any
44 * kind are expressed or implied. The recipient agrees to determine
45 * applicability of information provided. If this file has been
46 * purchased on magnetic or optical media from Unicode, Inc., the
47 * sole remedy for any claim will be exchange of defective media
48 * within 90 days of receipt.
49 *
50 * Limitations on Rights to Redistribute This Code
51 *
52 * Unicode, Inc. hereby grants the right to freely use the information
53 * supplied in this file in the creation of products supporting the
54 * Unicode Standard, and to make copies of this file in any form
55 * for internal or external distribution as long as this notice
56 * remains attached.
57 *
58 *
59 * Author: Mark E. Davis, 1994.
60 * Rev History: Rick McGowan, fixes & updates May 2001.
61 * Fixes & updates, Sept 2001.
62 */
63namespace {
64 namespace UTFConvert_libutfxx_ {
// Code point limits used by the converters.
// (currently unused) static constexpr char32_t UNI_REPLACEMENT_CHAR = char32_t{0x0000FFFD};
static constexpr char32_t UNI_MAX_BMP         = char32_t{0x0000FFFF}; // largest value that fits in a single UTF-16 code unit
static constexpr char32_t UNI_MAX_UTF16       = char32_t{0x0010FFFF}; // upper bound checked before encoding as a UTF-16 surrogate pair
static constexpr char32_t UNI_MAX_LEGAL_UTF32 = char32_t{0x0010FFFF}; // upper bound accepted for a UTF-32 value
69
// Outcome reported by each of the low level Convert*_ routines in this namespace.
enum ConversionResult {
    // conversion successful
    conversionOK,
    // partial character in source, but hit end
    sourceExhausted,
    // insufficient room in target for conversion
    targetExhausted,
    // source sequence is illegal/malformed
    sourceIllegal
};
76
77 UTFConvert::ConversionStatusFlag cvt_ (ConversionResult cr)
78 {
79 switch (cr) {
80 case conversionOK:
81 return UTFConvert::ConversionStatusFlag::ok;
82 case sourceExhausted:
83 return UTFConvert::ConversionStatusFlag::sourceExhausted;
84 case targetExhausted:
85 RequireNotReached (); // API doesn't allow this
86 return UTFConvert::ConversionStatusFlag::sourceIllegal;
87 case sourceIllegal:
88 return UTFConvert::ConversionStatusFlag::sourceIllegal;
89 default:
90 RequireNotReached (); // API doesn't allow this
91 return UTFConvert::ConversionStatusFlag::sourceIllegal;
92 }
93 }
94
// Constants used to split/join UTF-16 surrogate pairs.
constexpr int      halfShift = 10;      // each surrogate carries 10 bits of payload
constexpr char32_t halfBase  = 0x10000; // subtracted before splitting into a pair, added back after joining
constexpr char32_t halfMask  = 0x3FF;   // mask selecting the low 10 bits
99
/*
 *  Magic values subtracted from the accumulated value during UTF-8 decoding, indexed by the
 *  number of trailing bytes in the sequence. There is one entry for every trailing byte count
 *  a UTF-8 sequence might have (0 through 5).
 */
constexpr char32_t offsetsFromUTF8[6] = {
    0x00000000UL, // no trailing bytes
    0x00003080UL, // 1 trailing byte
    0x000E2080UL, // 2 trailing bytes
    0x03C82080UL, // 3 trailing bytes
    0xFA082080UL, // 4 trailing bytes
    0x82082080UL, // 5 trailing bytes
};
106
/*
 *  Once the bits are split out into bytes of UTF-8, this is the mask OR-ed into the first byte,
 *  indexed by the total number of bytes in the sequence. There is one entry per UTF-8 sequence
 *  type (one byte sequence, two byte sequence, and so on; index 0 is unused).
 *  Remember that sequences for *legal* UTF-8 will be 4 or fewer bytes total.
 */
constexpr char8_t firstByteMark[7] = {
    0x00, // (unused)
    0x00, // 1 byte sequence
    0xC0, // 2 byte sequence
    0xE0, // 3 byte sequence
    0xF0, // 4 byte sequence
    0xF8, // 5 byte sequence
    0xFC, // 6 byte sequence
};
115
/*
 *  Index into this table with the first byte of a UTF-8 sequence to get the number of trailing
 *  bytes that are supposed to follow it.
 *  Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is left as-is for
 *  anyone who may want to do such conversion, which was allowed in earlier algorithms.
 *
 *  Laid out 16 entries per row, so each row corresponds to one value of the high nibble.
 */
constexpr char trailingBytesForUTF8[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x30
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xC0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xD0
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xE0
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, // 0xF0
};
130
/*
 *  Utility routine to tell whether a sequence of bytes is legal UTF-8.
 *
 *  'length' must be the sequence length implied by the first byte; when not calling this from
 *  one of the ConvertUTF8to* routines it can be computed as
 *      length = trailingBytesForUTF8[*source] + 1;
 *  and the sequence is illegal right away if there aren't that many bytes available (this
 *  routine reads 'length' bytes starting at 'source' without checking for that).
 *
 *  Any length outside 1..4 yields false: the Unicode definition of UTF-8 goes up to 4-byte
 *  sequences.
 */
bool isLegalUTF8_ (const char8_t* source, int length)
{
    if (length < 1 || length > 4) {
        return false;
    }
    // The 4th and 3rd bytes (when present) must be ordinary continuation bytes.
    for (int idx = length - 1; idx >= 2; --idx) {
        const char8_t cont = source[idx];
        if (cont < 0x80 || cont > 0xBF) {
            return false;
        }
    }
    if (length >= 2) {
        // The permitted range of the 2nd byte depends on the lead byte.
        const char8_t second = source[1];
        if (second > 0xBF) {
            return false;
        }
        switch (static_cast<unsigned char> (*source)) {
            case 0xE0:
                if (second < 0xA0) {
                    return false;
                }
                break;
            case 0xED:
                if (second > 0x9F) {
                    return false;
                }
                break;
            case 0xF0:
                if (second < 0x90) {
                    return false;
                }
                break;
            case 0xF4:
                if (second > 0x8F) {
                    return false;
                }
                break;
            default:
                if (second < 0x80) {
                    return false;
                }
                break;
        }
    }
    // Finally validate the lead byte itself: 0x80..0xC1 and anything above 0xF4 never start a sequence.
    const char8_t lead = *source;
    if (lead >= 0x80 && lead < 0xC2) {
        return false;
    }
    return lead <= 0xF4;
}
190
191 inline ConversionResult ConvertUTF8toUTF16_ (const char8_t** sourceStart, const char8_t* sourceEnd, char16_t** targetStart,
192 char16_t* targetEnd, optional<char32_t> missingCharacterReplacement)
193 {
194 ConversionResult result = conversionOK;
195 const char8_t* source = *sourceStart;
196 char16_t* target = *targetStart;
197 auto addMissing = [&] () {
198 if (Character{*missingCharacterReplacement}.IsSurrogatePair ()) {
199 auto p = Character{*missingCharacterReplacement}.GetSurrogatePair ();
200 *target++ = p.first;
201 *target++ = p.second;
202 }
203 else {
204 *target++ = static_cast<char16_t> (*missingCharacterReplacement);
205 }
206 };
207 while (source < sourceEnd) {
208 char32_t ch = 0;
209 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
210 if (source + extraBytesToRead >= sourceEnd) {
211 result = sourceExhausted;
212 break;
213 }
214 if (!isLegalUTF8_ (source, extraBytesToRead + 1)) {
215 if (missingCharacterReplacement) {
216 AssertNotImplemented (); // @todo - not hard - but not done
217 }
218 else {
219 result = sourceIllegal;
220 break;
221 }
222 }
223 /*
224 * The cases all fall through. See "Note A" below.
225 */
226 switch (extraBytesToRead) {
227 case 5:
228 ch += *source++;
229 ch <<= 6; /* remember, illegal UTF-8 */
230 case 4:
231 ch += *source++;
232 ch <<= 6; /* remember, illegal UTF-8 */
233 case 3:
234 ch += *source++;
235 ch <<= 6;
236 case 2:
237 ch += *source++;
238 ch <<= 6;
239 case 1:
240 ch += *source++;
241 ch <<= 6;
242 case 0:
243 ch += *source++;
244 }
245 ch -= offsetsFromUTF8[extraBytesToRead];
246
247 if (target >= targetEnd) [[unlikely]] {
248 source -= (extraBytesToRead + 1); /* Back up source pointer! */
249 result = targetExhausted;
250 break;
251 }
252 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
253 /* UTF-16 surrogate values are illegal in UTF-32 */
254 if (ch >= Character::kUNICODESurrogate_High_Start && ch <= Character::kUNICODESurrogate_Low_End) {
255 if (missingCharacterReplacement) {
256 addMissing ();
257 }
258 else {
259 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
260 result = sourceIllegal;
261 break;
262 }
263 }
264 else {
265 *target++ = (char16_t)ch; /* normal case */
266 }
267 }
268 else if (ch > UNI_MAX_UTF16) {
269 if (missingCharacterReplacement) {
270 addMissing ();
271 }
272 else {
273 result = sourceIllegal;
274 source -= (extraBytesToRead + 1); /* return to the start */
275 break; /* Bail out; shouldn't continue */
276 }
277 }
278 else {
279 /* target is a character in range 0xFFFF - 0x10FFFF. */
280 if (target + 1 >= targetEnd) {
281 source -= (extraBytesToRead + 1); /* Back up source pointer! */
282 result = targetExhausted;
283 break;
284 }
285 ch -= halfBase;
286 *target++ = (char16_t)((ch >> halfShift) + Character::kUNICODESurrogate_High_Start);
287 *target++ = (char16_t)((ch & halfMask) + Character::kUNICODESurrogate_Low_Start);
288 }
289 }
290 *sourceStart = source;
291 *targetStart = target;
292 return result;
293 }
294 inline ConversionResult ConvertUTF16toUTF8_ (const char16_t** sourceStart, const char16_t* sourceEnd, char8_t** targetStart,
295 char8_t* targetEnd, optional<char32_t> missingCharacterReplacement)
296 {
297 ConversionResult result = conversionOK;
298 const char16_t* source = *sourceStart;
299 char8_t* target = *targetStart;
300 while (source < sourceEnd) {
301 char32_t ch;
302 unsigned short bytesToWrite = 0;
303 constexpr char32_t byteMask = 0xBF;
304 constexpr char32_t byteMark = 0x80;
305 const char16_t* oldSource = source; /* In case we have to back up because of target overflow. */
306 ch = *source++;
307 /* If we have a surrogate pair, convert to char32_t first. */
308 if (ch >= Character::kUNICODESurrogate_High_Start && ch <= Character::kUNICODESurrogate_High_End) [[unlikely]] {
309 /* If the 16 bits following the high surrogate are in the source buffer... */
310 if (source < sourceEnd) [[likely]] {
311 char32_t ch2 = *source;
312 /* If it's a low surrogate, convert to char32_t. */
313 if (ch2 >= Character::kUNICODESurrogate_Low_Start && ch2 <= Character::kUNICODESurrogate_Low_End) {
314 ch = ((ch - Character::kUNICODESurrogate_High_Start) << halfShift) + (ch2 - Character::kUNICODESurrogate_Low_Start) + halfBase;
315 ++source;
316 }
317 else if (missingCharacterReplacement == nullopt) { /* it's an unpaired high surrogate */
318 --source; /* return to the illegal value itself */
319 result = sourceIllegal;
320 break;
321 }
322 }
323 else { /* We don't have the 16 bits following the high surrogate. */
324 --source; /* return to the high surrogate */
325 result = sourceExhausted;
326 break;
327 }
328 }
329 else if (missingCharacterReplacement == nullopt) {
330 /* UTF-16 surrogate values are illegal in UTF-32 */
331 if (ch >= Character::kUNICODESurrogate_Low_Start && ch <= Character::kUNICODESurrogate_Low_End) {
332 --source; /* return to the illegal value itself */
333 result = sourceIllegal;
334 break;
335 }
336 }
337 /* Figure out how many bytes the result will require */
338 if (ch < (char32_t)0x80) {
339 bytesToWrite = 1;
340 }
341 else if (ch < (char32_t)0x800) {
342 bytesToWrite = 2;
343 }
344 else if (ch < (char32_t)0x10000) {
345 bytesToWrite = 3;
346 }
347 else if (ch < (char32_t)0x110000) {
348 bytesToWrite = 4;
349 }
350 else {
351 bytesToWrite = 3;
352 Assert (missingCharacterReplacement.has_value ()); // I THINK sb caught above if 'strict' mode
353 ch = *missingCharacterReplacement;
354 }
355
356 target += bytesToWrite;
357 if (target > targetEnd) {
358 source = oldSource; /* Back up source pointer! */
359 target -= bytesToWrite;
360 result = targetExhausted;
361 break;
362 }
363 switch (bytesToWrite) { /* note: everything falls through. */
364 case 4:
365 *--target = (char8_t)((ch | byteMark) & byteMask);
366 ch >>= 6;
367 case 3:
368 *--target = (char8_t)((ch | byteMark) & byteMask);
369 ch >>= 6;
370 case 2:
371 *--target = (char8_t)((ch | byteMark) & byteMask);
372 ch >>= 6;
373 case 1:
374 *--target = (char8_t)(ch | firstByteMark[bytesToWrite]);
375 }
376 target += bytesToWrite;
377 }
378 *sourceStart = source;
379 *targetStart = target;
380 return result;
381 }
382 DISABLE_COMPILER_MSC_WARNING_START (4701) // potentially uninitialized local variable 'ch' used (WRONG cuz if we get into loop, initialized
383 DISABLE_COMPILER_GCC_WARNING_START ("GCC diagnostic ignored \"-Wmaybe-uninitialized\""); // potentially uninitialized local variable 'ch' used (WRONG cuz if we get into loop, initialized
384 inline ConversionResult ConvertUTF16toUTF32_ (const char16_t** sourceStart, const char16_t* sourceEnd, char32_t** targetStart,
385 char32_t* targetEnd, optional<char32_t> missingCharacterReplacement)
386 {
387 ConversionResult result = conversionOK;
388 const char16_t* source = *sourceStart;
389 char32_t* target = *targetStart;
390 char32_t ch, ch2;
391 while (source < sourceEnd) {
392 const char16_t* oldSource = source; /* In case we have to back up because of target overflow. */
393 ch = *source++;
394 /* If we have a surrogate pair, convert to UTF32 first. */
395 if (ch >= Character::kUNICODESurrogate_High_Start && ch <= Character::kUNICODESurrogate_High_End) [[unlikely]] {
396 /* If the 16 bits following the high surrogate are in the source buffer... */
397 if (source < sourceEnd) {
398 ch2 = *source;
399 /* If it's a low surrogate, convert to UTF32. */
400 if (ch2 >= Character::kUNICODESurrogate_Low_Start && ch2 <= Character::kUNICODESurrogate_Low_End) {
401 ch = ((ch - Character::kUNICODESurrogate_High_Start) << halfShift) + (ch2 - Character::kUNICODESurrogate_Low_Start) + halfBase;
402 ++source;
403 }
404 else if (missingCharacterReplacement == nullopt) { /* it's an unpaired high surrogate */
405 --source; /* return to the illegal value itself */
406 result = sourceIllegal;
407 break;
408 }
409 }
410 else { /* We don't have the 16 bits following the high surrogate. */
411 --source; /* return to the high surrogate */
412 result = sourceExhausted;
413 break;
414 }
415 }
416 else if (missingCharacterReplacement == nullopt) {
417 /* UTF-16 surrogate values are illegal in UTF-32 */
418 if (ch >= Character::kUNICODESurrogate_Low_Start && ch <= Character::kUNICODESurrogate_Low_End) {
419 --source; /* return to the illegal value itself */
420 result = sourceIllegal;
421 break;
422 }
423 }
424 if (target >= targetEnd) {
425 source = oldSource; /* Back up source pointer! */
426 result = targetExhausted;
427 break;
428 }
429 *target++ = ch;
430 }
431 *sourceStart = source;
432 *targetStart = target;
433 if (result == sourceIllegal) {
434 using namespace Characters::Literals;
435 DbgTrace ("ConvertUTF16toUTF32 illegal seq 0x{:x},0x{:x}"_f, static_cast<int> (ch), static_cast<int> (ch2));
436 }
437 return result;
438 }
439 DISABLE_COMPILER_MSC_WARNING_END (4701)
440 DISABLE_COMPILER_GCC_WARNING_END ("GCC diagnostic ignored \"-Wmaybe-uninitialized\""); // potentially uninitialized local variable 'ch' used (WRONG cuz if we get into loop, initialized
441 inline ConversionResult ConvertUTF32toUTF16_ (const char32_t** sourceStart, const char32_t* sourceEnd, char16_t** targetStart,
442 char16_t* targetEnd, optional<char32_t> missingCharacterReplacement)
443 {
444 ConversionResult result = conversionOK;
445 const char32_t* source = *sourceStart;
446 char16_t* target = *targetStart;
447 auto addMissing = [&] () {
448 if (Character{*missingCharacterReplacement}.IsSurrogatePair ()) {
449 auto p = Character{*missingCharacterReplacement}.GetSurrogatePair ();
450 *target++ = p.first;
451 *target++ = p.second;
452 }
453 else {
454 *target++ = static_cast<char16_t> (*missingCharacterReplacement);
455 }
456 };
457 while (source < sourceEnd) {
458 char32_t ch;
459 if (target >= targetEnd) {
460 result = targetExhausted;
461 break;
462 }
463 ch = *source++;
464 if (ch <= UNI_MAX_BMP) [[likely]] { /* Target is a character <= 0xFFFF */
465 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
466 if (ch >= Character::kUNICODESurrogate_High_Start && ch <= Character::kUNICODESurrogate_Low_End) [[unlikely]] {
467 if (missingCharacterReplacement == nullopt) {
468 --source; /* return to the illegal value itself */
469 result = sourceIllegal;
470 break;
471 }
472 else {
473 addMissing ();
474 }
475 }
476 else {
477 *target++ = (char16_t)ch; /* normal case */
478 }
479 }
480 else if (ch > UNI_MAX_LEGAL_UTF32) {
481 if (missingCharacterReplacement) {
482 addMissing ();
483 }
484 else {
485 result = sourceIllegal;
486 }
487 }
488 else {
489 /* target is a character in range 0xFFFF - 0x10FFFF. */
490 if (target + 1 >= targetEnd) {
491 --source; /* Back up source pointer! */
492 result = targetExhausted;
493 break;
494 }
495 ch -= halfBase;
496 *target++ = (char16_t)((ch >> halfShift) + Character::kUNICODESurrogate_High_Start);
497 *target++ = (char16_t)((ch & halfMask) + Character::kUNICODESurrogate_Low_Start);
498 }
499 }
500 *sourceStart = source;
501 *targetStart = target;
502 return result;
503 }
504 inline ConversionResult ConvertUTF8toUTF32_ (const char8_t** sourceStart, const char8_t* sourceEnd, char32_t** targetStart,
505 char32_t* targetEnd, optional<char32_t> missingCharacterReplacement)
506 {
507 ConversionResult result = conversionOK;
508 const char8_t* source = *sourceStart;
509 char32_t* target = *targetStart;
510 while (source < sourceEnd) {
511 char32_t ch = 0;
512 unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
513 if (source + extraBytesToRead >= sourceEnd) [[unlikely]] {
514 result = sourceExhausted;
515 break;
516 }
517 if (!isLegalUTF8_ (source, extraBytesToRead + 1)) {
518 if (missingCharacterReplacement) {
519 AssertNotImplemented (); // @todo - not hard - but not done
520 }
521 else {
522 result = sourceIllegal;
523 break;
524 }
525 }
526 /*
527 * The cases all fall through. See "Note A" below.
528 */
529 switch (extraBytesToRead) {
530 case 5:
531 ch += *source++;
532 ch <<= 6;
533 case 4:
534 ch += *source++;
535 ch <<= 6;
536 case 3:
537 ch += *source++;
538 ch <<= 6;
539 case 2:
540 ch += *source++;
541 ch <<= 6;
542 case 1:
543 ch += *source++;
544 ch <<= 6;
545 case 0:
546 ch += *source++;
547 }
548 ch -= offsetsFromUTF8[extraBytesToRead];
549
550 if (target >= targetEnd) [[unlikely]] {
551 source -= (extraBytesToRead + 1); /* Back up the source pointer! */
552 result = targetExhausted;
553 break;
554 }
555 if (ch <= UNI_MAX_LEGAL_UTF32) [[likely]] {
556 /*
557 * UTF-16 surrogate values are illegal in UTF-32, and anything
558 * over Plane 17 (> 0x10FFFF) is illegal.
559 */
560 if (ch >= Character::kUNICODESurrogate_High_Start && ch <= Character::kUNICODESurrogate_Low_End) {
561 if (missingCharacterReplacement) {
562 *target++ = *missingCharacterReplacement;
563 }
564 else {
565 source -= (extraBytesToRead + 1); /* return to the illegal value itself */
566 result = sourceIllegal;
567 break;
568 }
569 }
570 else {
571 *target++ = ch;
572 }
573 }
574 else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
575 if (missingCharacterReplacement) {
576 *target++ = *missingCharacterReplacement;
577 }
578 else {
579 result = sourceIllegal;
580 break;
581 }
582 }
583 }
584 *sourceStart = source;
585 *targetStart = target;
586 return result;
587 }
588 inline ConversionResult ConvertUTF32toUTF8_ (const char32_t** sourceStart, const char32_t* sourceEnd, char8_t** targetStart,
589 char8_t* targetEnd, optional<char32_t> missingCharacterReplacement)
590 {
591 ConversionResult result = conversionOK;
592 const char32_t* source = *sourceStart;
593 char8_t* target = *targetStart;
594 while (source < sourceEnd) {
595 char32_t ch;
596 unsigned short bytesToWrite = 0;
597 const char32_t byteMask = 0xBF;
598 const char32_t byteMark = 0x80;
599 ch = *source++;
600 if (missingCharacterReplacement == nullopt) {
601 /* UTF-16 surrogate values are illegal in UTF-32 */
602 if (ch >= Character::kUNICODESurrogate_High_Start && ch <= Character::kUNICODESurrogate_Low_End) [[unlikely]] {
603 --source; /* return to the illegal value itself */
604 result = sourceIllegal;
605 break;
606 }
607 }
608 /*
609 * Figure out how many bytes the result will require. Turn any
610 * illegally large UTF32 things (> Plane 17) into replacement chars.
611 */
612 if (ch < (char32_t)0x80) {
613 bytesToWrite = 1;
614 }
615 else if (ch < (char32_t)0x800) {
616 bytesToWrite = 2;
617 }
618 else if (ch < (char32_t)0x10000) {
619 bytesToWrite = 3;
620 }
621 else if (ch <= UNI_MAX_LEGAL_UTF32) {
622 bytesToWrite = 4;
623 }
624 else {
625 if (missingCharacterReplacement) {
626 ch = *missingCharacterReplacement;
627 bytesToWrite = 3; // @todo WRONG - must get right number for this character
628 }
629 else {
630 result = sourceIllegal;
631 break;
632 }
633 }
634
635 target += bytesToWrite;
636 if (target > targetEnd) {
637 --source; /* Back up source pointer! */
638 target -= bytesToWrite;
639 result = targetExhausted;
640 break;
641 }
642 switch (bytesToWrite) { /* note: everything falls through. */
643 case 4:
644 *--target = (char8_t)((ch | byteMark) & byteMask);
645 ch >>= 6;
646 case 3:
647 *--target = (char8_t)((ch | byteMark) & byteMask);
648 ch >>= 6;
649 case 2:
650 *--target = (char8_t)((ch | byteMark) & byteMask);
651 ch >>= 6;
652 case 1:
653 *--target = (char8_t)(ch | firstByteMark[bytesToWrite]);
654 }
655 target += bytesToWrite;
656 }
657 *sourceStart = source;
658 *targetStart = target;
659 return result;
660 }
661
662 }
663}
664
665namespace {
666 namespace UTFConvert_codecvtSupport_ {
667 inline UTFConvert::ConversionStatusFlag cvt_stdcodecvt_results_ (int i)
668 {
669 switch (i) {
670 case codecvt_base::ok:
671 return UTFConvert::ConversionStatusFlag::ok;
672 case codecvt_base::error:
673 return UTFConvert::ConversionStatusFlag::sourceIllegal;
674 case codecvt_base::partial:
675 return UTFConvert::ConversionStatusFlag::sourceExhausted; // not quite - could be target exhausted?
676 case codecvt_base::noconv:
677 return UTFConvert::ConversionStatusFlag::sourceIllegal; // not quite
678 default:
679 Assert (false);
680 return UTFConvert::ConversionStatusFlag::sourceIllegal;
681 }
682 }
683 inline UTFConvert::ConversionStatusFlag ConvertUTF8toUTF16_codecvt_ (const char8_t** sourceStart, const char8_t* sourceEnd,
684 char16_t** targetStart, char16_t* targetEnd)
685 {
686 DISABLE_COMPILER_MSC_WARNING_START (4996)
687 static const deletable_facet_<codecvt<char16_t, char8_t, mbstate_t>> cvt;
688 mbstate_t ignoredMBState{};
689 const char8_t* sourceCursor = *sourceStart;
690 char16_t* outCursor = *targetStart;
691 codecvt_base::result rr = cvt.in (ignoredMBState, *sourceStart, sourceEnd, sourceCursor, *targetStart, targetEnd, outCursor);
692 *sourceStart = reinterpret_cast<const char8_t*> (sourceCursor);
693 *targetStart = outCursor;
694 return cvt_stdcodecvt_results_ (rr);
695 DISABLE_COMPILER_MSC_WARNING_END (4996)
696 }
697 inline UTFConvert::ConversionStatusFlag ConvertUTF16toUTF8_codecvt_ (const char16_t** sourceStart, const char16_t* sourceEnd,
698 char8_t** targetStart, char8_t* targetEnd)
699 {
700 DISABLE_COMPILER_MSC_WARNING_START (4996)
701 static const deletable_facet_<codecvt<char16_t, char8_t, mbstate_t>> cvt;
702 mbstate_t ignoredMBState{};
703 const char16_t* sourceCursor = *sourceStart;
704 char8_t* outCursor = *targetStart;
705 codecvt_base::result rr = cvt.out (ignoredMBState, *sourceStart, sourceEnd, sourceCursor, *targetStart, targetEnd, outCursor);
706 *sourceStart = reinterpret_cast<const char16_t*> (sourceCursor);
707 *targetStart = reinterpret_cast<char8_t*> (outCursor);
708 return cvt_stdcodecvt_results_ (rr);
709 DISABLE_COMPILER_MSC_WARNING_END (4996)
710 }
711 inline UTFConvert::ConversionStatusFlag ConvertUTF8toUTF32_codecvt_ (const char8_t** sourceStart, const char8_t* sourceEnd,
712 char32_t** targetStart, char32_t* targetEnd)
713 {
714 DISABLE_COMPILER_MSC_WARNING_START (4996)
715 static const deletable_facet_<codecvt<char32_t, char8_t, mbstate_t>> cvt;
716 mbstate_t ignoredState{};
717
718 const char8_t* sourceCursor = *sourceStart;
719 char32_t* outCursor = *targetStart;
720 codecvt_base::result rr = cvt.in (ignoredState, *sourceStart, sourceEnd, sourceCursor, *targetStart, targetEnd, outCursor);
721 *sourceStart = reinterpret_cast<const char8_t*> (sourceCursor);
722 *targetStart = outCursor;
723 return cvt_stdcodecvt_results_ (rr);
724 DISABLE_COMPILER_MSC_WARNING_END (4996)
725 }
726 inline UTFConvert::ConversionStatusFlag ConvertUTF32toUTF8_codecvt_ (const char32_t** sourceStart, const char32_t* sourceEnd,
727 char8_t** targetStart, char8_t* targetEnd)
728 {
729 DISABLE_COMPILER_MSC_WARNING_START (4996)
730 static const deletable_facet_<codecvt<char32_t, char8_t, mbstate_t>> cvt;
731 mbstate_t ignoredState{};
732 const char32_t* sourceCursor = *sourceStart;
733 char8_t* outCursor = *targetStart;
734 codecvt_base::result rr = cvt.out (ignoredState, *sourceStart, sourceEnd, sourceCursor, *targetStart, targetEnd, outCursor);
735 *sourceStart = reinterpret_cast<const char32_t*> (sourceCursor);
736 *targetStart = reinterpret_cast<char8_t*> (outCursor);
737 return cvt_stdcodecvt_results_ (rr);
738 DISABLE_COMPILER_MSC_WARNING_END (4996)
739 }
740 }
741}
742
743/*
744 ********************************************************************************
745 ***************************** Characters::UTFConvert ***************************
746 ********************************************************************************
747 */
748namespace {
749 using ConversionResultWithStatus = Characters::UTFConvert::ConversionResultWithStatus;
750 using ConversionStatusFlag = Characters::UTFConvert::ConversionStatusFlag;
751
752 template <typename IN_T, typename OUT_T, regular_invocable<const IN_T**, const IN_T*, OUT_T**, OUT_T*, optional<char32_t>> FUN2DO_REAL_WORK>
753 inline auto ConvertQuietly_StroikaPortable_helper_ (optional<Character> invalidCharacterReplacement, span<const IN_T> source,
754 span<OUT_T> target, FUN2DO_REAL_WORK&& realWork) -> ConversionResultWithStatus
755 {
756 using namespace UTFConvert_libutfxx_;
757 const IN_T* sourceStart = source.data ();
758 const IN_T* sourceEnd = sourceStart + source.size ();
759 OUT_T* targetStart = target.data ();
760 OUT_T* targetEnd = targetStart + target.size ();
761 // convert replacement character to target character set, and then pass that
762 ConversionResult r =
763 realWork (&sourceStart, sourceEnd, &targetStart, targetEnd,
764 invalidCharacterReplacement.has_value () ? invalidCharacterReplacement->As<char32_t> () : optional<char32_t>{});
765 return ConversionResultWithStatus{
766 {static_cast<size_t> (sourceStart - source.data ()), static_cast<size_t> (targetStart - target.data ())}, cvt_ (r)};
767 }
768}
769auto UTFConvert::ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement, span<const char8_t> source, span<char16_t> target)
770 -> ConversionResultWithStatus
771{
772#if qCompilerAndStdLib_arm_ubsan_callDirectFunInsteadOfThruLamdba_Buggy
773 if (Debug::kBuiltWithUndefinedBehaviorSanitizer) {
774 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target,
775 [] (const char8_t** sourceStart, const char8_t* sourceEnd, char16_t** targetStart,
776 char16_t* targetEnd, optional<char32_t> missingCharacterReplacement) {
777 return UTFConvert_libutfxx_::ConvertUTF8toUTF16_ (
778 sourceStart, sourceEnd, targetStart, targetEnd, missingCharacterReplacement);
779 });
780 }
781#endif
782 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target, UTFConvert_libutfxx_::ConvertUTF8toUTF16_);
783}
784auto UTFConvert::ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement, span<const char8_t> source, span<char32_t> target)
785 -> ConversionResultWithStatus
786{
787#if qCompilerAndStdLib_arm_ubsan_callDirectFunInsteadOfThruLamdba_Buggy
788 if (Debug::kBuiltWithUndefinedBehaviorSanitizer) {
789 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target,
790 [] (const char8_t** sourceStart, const char8_t* sourceEnd, char32_t** targetStart,
791 char32_t* targetEnd, optional<char32_t> missingCharacterReplacement) {
792 return UTFConvert_libutfxx_::ConvertUTF8toUTF32_ (
793 sourceStart, sourceEnd, targetStart, targetEnd, missingCharacterReplacement);
794 });
795 }
796#endif
797 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target, UTFConvert_libutfxx_::ConvertUTF8toUTF32_);
798}
799auto UTFConvert::ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement, span<const char16_t> source,
800 span<char32_t> target) -> ConversionResultWithStatus
801{
802#if qCompilerAndStdLib_arm_ubsan_callDirectFunInsteadOfThruLamdba_Buggy
803 if (Debug::kBuiltWithUndefinedBehaviorSanitizer) {
804 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target,
805 [] (const char16_t** sourceStart, const char16_t* sourceEnd, char32_t** targetStart,
806 char32_t* targetEnd, optional<char32_t> missingCharacterReplacement) {
807 return UTFConvert_libutfxx_::ConvertUTF16toUTF32_ (
808 sourceStart, sourceEnd, targetStart, targetEnd, missingCharacterReplacement);
809 });
810 }
811#endif
812 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target, UTFConvert_libutfxx_::ConvertUTF16toUTF32_);
813}
814auto UTFConvert::ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement, span<const char32_t> source,
815 span<char16_t> target) -> ConversionResultWithStatus
816{
817#if qCompilerAndStdLib_arm_ubsan_callDirectFunInsteadOfThruLamdba_Buggy
818 if (Debug::kBuiltWithUndefinedBehaviorSanitizer) {
819 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target,
820 [] (const char32_t** sourceStart, const char32_t* sourceEnd, char16_t** targetStart,
821 char16_t* targetEnd, optional<char32_t> missingCharacterReplacement) {
822 return UTFConvert_libutfxx_::ConvertUTF32toUTF16_ (
823 sourceStart, sourceEnd, targetStart, targetEnd, missingCharacterReplacement);
824 });
825 }
826#endif
827 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target, UTFConvert_libutfxx_::ConvertUTF32toUTF16_);
828}
829auto UTFConvert::ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement, span<const char32_t> source, span<char8_t> target)
830 -> ConversionResultWithStatus
831{
832#if qCompilerAndStdLib_arm_ubsan_callDirectFunInsteadOfThruLamdba_Buggy
833 if (Debug::kBuiltWithUndefinedBehaviorSanitizer) {
834 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target,
835 [] (const char32_t** sourceStart, const char32_t* sourceEnd, char8_t** targetStart,
836 char8_t* targetEnd, optional<char32_t> missingCharacterReplacement) {
837 return UTFConvert_libutfxx_::ConvertUTF32toUTF8_ (
838 sourceStart, sourceEnd, targetStart, targetEnd, missingCharacterReplacement);
839 });
840 }
841#endif
842 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target, UTFConvert_libutfxx_::ConvertUTF32toUTF8_);
843}
844auto UTFConvert::ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement, span<const char16_t> source, span<char8_t> target)
845 -> ConversionResultWithStatus
846{
847#if qCompilerAndStdLib_arm_ubsan_callDirectFunInsteadOfThruLamdba_Buggy
848 if (Debug::kBuiltWithUndefinedBehaviorSanitizer) {
849 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target,
850 [] (const char16_t** sourceStart, const char16_t* sourceEnd, char8_t** targetStart,
851 char8_t* targetEnd, optional<char32_t> missingCharacterReplacement) {
852 return UTFConvert_libutfxx_::ConvertUTF16toUTF8_ (
853 sourceStart, sourceEnd, targetStart, targetEnd, missingCharacterReplacement);
854 });
855 }
856#endif
857 return ConvertQuietly_StroikaPortable_helper_ (invalidCharacterReplacement, source, target, UTFConvert_libutfxx_::ConvertUTF16toUTF8_);
858}
859
860namespace {
861 template <typename IN_T, typename OUT_T, typename FUN2DO_REAL_WORK>
862 inline auto ConvertQuietly_codeCvt_helper_ (span<const IN_T> source, const span<OUT_T> target, FUN2DO_REAL_WORK&& realWork) -> ConversionResultWithStatus
863 {
864 using namespace UTFConvert_codecvtSupport_;
865 const IN_T* sourceStart = reinterpret_cast<const IN_T*> (source.data ());
866 const IN_T* sourceEnd = sourceStart + source.size ();
867 OUT_T* targetStart = reinterpret_cast<OUT_T*> (target.data ());
868 OUT_T* targetEnd = targetStart + target.size ();
869 ConversionStatusFlag r = realWork (&sourceStart, sourceEnd, &targetStart, targetEnd);
870 if (r == ConversionStatusFlag::ok) {
871 return ConversionResultWithStatus{{static_cast<size_t> (sourceStart - reinterpret_cast<const IN_T*> (source.data ())),
872 static_cast<size_t> (targetStart - reinterpret_cast<const OUT_T*> (target.data ()))},
873 ConversionStatusFlag::ok};
874 }
875 else {
876 return ConversionResultWithStatus{{0, 0}, r};
877 }
878 }
879}
880auto UTFConvert::ConvertQuietly_codeCvt_ (span<const char8_t> source, span<char16_t> target) -> ConversionResultWithStatus
881{
882 return ConvertQuietly_codeCvt_helper_ (source, target, UTFConvert_codecvtSupport_::ConvertUTF8toUTF16_codecvt_);
883}
884auto UTFConvert::ConvertQuietly_codeCvt_ (span<const char16_t> source, span<char8_t> target) -> ConversionResultWithStatus
885{
886 return ConvertQuietly_codeCvt_helper_ (source, target, UTFConvert_codecvtSupport_::ConvertUTF16toUTF8_codecvt_);
887}
888auto UTFConvert::ConvertQuietly_codeCvt_ (span<const char8_t> source, span<char32_t> target) -> ConversionResultWithStatus
889{
890 return ConvertQuietly_codeCvt_helper_ (source, target, UTFConvert_codecvtSupport_::ConvertUTF8toUTF32_codecvt_);
891}
892auto UTFConvert::ConvertQuietly_codeCvt_ (span<const char32_t> source, span<char8_t> target) -> ConversionResultWithStatus
893{
894 return ConvertQuietly_codeCvt_helper_ (source, target, UTFConvert_codecvtSupport_::ConvertUTF32toUTF8_codecvt_);
895}
896
897void UTFConvert::Throw (ConversionStatusFlag cr, size_t errorAtSourceOffset)
898{
899 switch (cr) {
901 static const auto kException_ = Execution::RuntimeErrorException{"Invalid UNICODE source string (incomplete UTF character)"sv};
902 Execution::Throw (kException_);
903 }
906 }
907 default:
909 }
910}
#define AssertNotImplemented()
Definition Assertions.h:401
#define RequireNotReached()
Definition Assertions.h:385
#define AssertNotReached()
Definition Assertions.h:355
#define DbgTrace
Definition Trace.h:309
An error occurred encoding or decoding a character
constexpr pair< char16_t, char16_t > GetSurrogatePair() const
ConversionStatusFlag
used for ConvertQuietly
Definition UTFConvert.h:276
void Throw(T &&e2Throw)
identical to builtin C++ 'throw' except that it does helpful, type dependent DbgTrace() messages firs...
Definition Throw.inl:43