Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
UTFConvert.h
Go to the documentation of this file.
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#ifndef _Stroika_Foundation_Characters_UTFConvert_h_
5#define _Stroika_Foundation_Characters_UTFConvert_h_ 1
6
7#include "Stroika/Foundation/StroikaPreComp.h"
8
9#include <optional>
10#include <span>
11
12#if __has_include("boost/locale/encoding_utf.hpp")
13#include <boost/locale/encoding_utf.hpp>
14#endif
15
17
18/**
19 * \file
20 * This module is designed to provide mappings between various UTF encodings of UNICODE characters.
21 */
22
24
25 /**
26 * \brief list of external UNICODE character encodings, for file IO (eDEFAULT = eUTF8)
27 *
28 * \note - UTF-7 **not** supported because very few places support it/ever used it, and
29 * https://en.wikipedia.org/wiki/UTF-7 says it's obsolete. So don't bother.
30 */
32 eUTF8,
33 eUTF16_BE,
34 eUTF16_LE,
35 eUTF16 = std::endian::native == std::endian::big ? eUTF16_BE : eUTF16_LE,
36 eUTF32_BE,
37 eUTF32_LE,
38 eUTF32 = std::endian::native == std::endian::big ? eUTF32_BE : eUTF32_LE,
39
40 eDEFAULT = eUTF8,
41 };
42
43 /**
44 * \brief UTFConvert is designed to provide mappings between various UTF encodings of UNICODE characters.
45 *
46 * This area of C++ is a confusingly broken mess. It's pretty simple, and well defined, but painful to do
47 * with std::codecvt variants...
48 *
49 * Available (plausible) implementations:
50 * o std C++ code_cvt (on windows, slow)
51 * o Boost locale utf_to_utf (untested so not sure about this)
52 * o Windows API (appears most performant)
53 * o Stroika portable implementation, based on libutfxx (slow but portable, and works)
54 * o nemtrif/utfcpp (haven't tried yet)
55 * o simdutf (allegedly fastest, but haven't tried yet)
56 *
57 * Design Choices:
58 * o Could have API to COMPUTE size of output buffer. But that's as much work to compute as actually doing the conversion (generally close).
59 * So - instead - have ComputeTargetBufferSize () API, which quickly computes a reasonable buffer size, and just
60 * assert we never run out of space. Not a great plan, but probably pretty good, most of the time.
61 *
62 * API setup so the compute-buf-size routine COULD walk the source and compute the exact needed size, without changing API.
63 * o Invalid character handling - defaults to generating exception, but can specify options.fInvalidCharacterReplacement in CTOR.
64 *
65 * \note Byte Order Markers
66 * UTFConvert does NOT support byte order marks (BOM) - for that - see Streams::BinaryToText::Reader, and Streams::TextToBinary::Writer or TextConvert
67 * The reason is - the conversion methods are templated on the character TYPE (char8_t, char16_t, etc.), and this doesn't work well with
68 * dynamically detecting the character type at runtime.
69 *
70 * \note about mbstate_t
71 * mbstate_t is used by the std::codecvt APIs and nothing else, and seems opaque with no obvious use, so it is just
72 * not used in this API, and faked when needed for codecvt.
73 *
74 * Web Pages/ Specs:
75 * o https://en.wikipedia.org/wiki/UTF-8
76 * o https://en.wikipedia.org/wiki/UTF-16
77 *
78 * Though you can construct your own UTFConvert with different options, a typical application will just use
79 * \code
80 * UTFConvert::kThe
81 * \endcode
82 */
83 class UTFConvert final {
84 public:
85 /** \brief Options controlling a UTFConvert instance: how invalid input characters are handled, and which conversion implementation is preferred.
86 */
87 struct Options {
88 /**
89 * if fInvalidCharacterReplacement is nullopt (the default) - throw on invalid characters, and
90 * otherwise use the value provided in fInvalidCharacterReplacement as the replacement.
91 *
92 * \see kDefaultMissingReplacementCharacter
93 */
94 optional<Character> fInvalidCharacterReplacement;
95
96 /**
97 * Sensible replacement character, code point 0xFFFD (a suitable value for fInvalidCharacterReplacement)
98 * 'UNI_REPLACEMENT_CHAR' from https://github.com/codebrainz/libutfxx/blob/master/utf/ConvertUTF.h
99 */
100 static inline constexpr Character kDefaultMissingReplacementCharacter = Character{(char32_t)0x0000FFFD};
101
102 /**
103 * Different implementations of UTF character conversion (some enumerators exist only when the corresponding library/platform is available)
104 */
105 enum class Implementation {
106
107 // Based on libutfxx
108 eStroikaPortable,
109
110#if __has_include("boost/locale/encoding_utf.hpp")
111 // untested/unimplemented so far
112 eBoost_Locale,
113#endif
114
115#if qStroika_Foundation_Common_Platform_Windows
116 // Seems fastest for windows
117 eWindowsAPIWide2FromMultibyte,
118#endif
119
120 // CPPREFERENCE docs on std::codecvt somewhat confusing about what is deprecated, but it appears the interface as a whole is still viable as of C++20
121 // and just several sub-interfaces are deprecated -- LGP 2023-06-28
122 // https://cplusplus.github.io/LWG/issue3767
123 eCodeCVT,
124
125 // @todo LIBS TO LOOK AT
126 // https://github.com/nemtrif/utfcpp
127 // https://github.com/simdutf/simdutf ((probably best/fastest - so try))
128 };
129
130 /**
131 * \note that since not all implementations support all APIs, this is just a hint. Other implementations may be
132 * used as needed.
133 */
134 optional<Implementation> fPreferredImplementation;
135 };
136
137 public:
138 /**
139 * As of Stroika v3.0d2, if options.fInvalidCharacterReplacement specified, options.fPreferredImplementation must be null or eStroikaPortable.
140 * That limitation could be lifted in the future.
141 */
142#if qCompilerAndStdLib_DefaultMemberInitializerNeededEnclosingForDefaultFunArg_Buggy
143 constexpr UTFConvert ();
144 constexpr UTFConvert (const Options& options);
145#else
146 constexpr UTFConvert (const Options& options = Options{});
147#endif
148
149 public:
150 /**
151 */
152 constexpr Options GetOptions () const;
153
154 public:
155 /**
156 * Quickly compute the buffer size needed for a call to Convert (or ConvertSpan)
157 *
158 * This will frequently (greatly) over-estimate the amount of space needed but it will always produce a sufficient answer without much computation.
159 *
160 * \note buffer size NOT in 'bytes' but in units of 'TO' - so char32_t, or char8_t, or whatever.
161 *
162 * \note future implementations might do more work to compute a sometimes smaller, less wasteful buffer size. Maybe
163 * look at size, and if small just overestimate, but if input span is large, maybe worth the trouble and count
164 * multibyte characters?
165 *
166 * For the overload taking a size, and not the actual FROM span, this computes the upper bound size required.
167 *
168 * @See ConvertQuietly ()
169 * @See Convert ()
170 * @aliases used to be called QuickComputeConversionOutputBufferSize
171 */
172 template <IUNICODECanUnambiguouslyConvertTo TO, IUNICODECanUnambiguouslyConvertFrom FROM>
173 static constexpr size_t ComputeTargetBufferSize (span<const FROM> src)
174 requires (not is_const_v<TO>);
175 template <IUNICODECanUnambiguouslyConvertTo TO, IUNICODECanUnambiguouslyConvertFrom FROM>
176 static constexpr size_t ComputeTargetBufferSize (span<FROM> src)
177 requires (not is_const_v<TO>);
178 template <IUNICODECanUnambiguouslyConvertTo TO, IUNICODECanUnambiguouslyConvertFrom FROM>
179 static constexpr size_t ComputeTargetBufferSize (size_t srcSize);
180
181 public:
182 /**
183 * Check if each character in the span fits in a 2-byte encoding (ie no UTF-16 surrogate pairs)
184 */
185 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
186 static constexpr bool AllFitsInTwoByteEncoding (span<const CHAR_T> s) noexcept;
187
188 public:
189 /**
190 * \brief Convert between UTF-N encoded (including the special case of ASCII, and Latin1) character spans (e.g. UTF8 to UTF32), throw on failure, resulting span<>.
191 *
192 * Compared with the Convert () API, this loses information (number of source characters consumed).
193 * Not a general purpose API. But very frequently this is all you need, for the next stage, a new span,
194 * and for that case, this saves a little typing.
195 *
196 * NOTE - the returned span is ALWAYS a (not necessarily proper) sub-span of its 'target' argument
197 *
198 * \par Example Usage
199 * \code
200 * StackBuffer<wchar_t> buf{Memory::eUninitialized, UTFConvert::ComputeTargetBufferSize<wchar_t> (src)};
201 * span<wchar_t> spanOfTargetBufferUsed = UTFConvert::kThe.ConvertSpan (src, span{buf});
202 * return String{spanOfTargetBufferUsed};
203 * \endcode
204 */
205 template <IUNICODECanUnambiguouslyConvertFrom SRC_T, IUNICODECanUnambiguouslyConvertTo TRG_T>
206 nonvirtual span<TRG_T> ConvertSpan (span<const SRC_T> source, span<TRG_T> target) const
207 requires (not is_const_v<TRG_T>);
208 template <IUNICODECanUnambiguouslyConvertFrom SRC_T, IUNICODECanUnambiguouslyConvertTo TRG_T>
209 nonvirtual span<TRG_T> ConvertSpan (span<SRC_T> source, span<TRG_T> target) const
210 requires (not is_const_v<TRG_T>);
211
212 public:
213 /**
214 * Result of Convert() call - saying how much of the source was consumed, and how many units of the target were produced.
215 * units depend on the call, char8_ts, or char16_ts, or char32_ts.
216 */
218 size_t fSourceConsumed{};
219 size_t fTargetProduced{};
220 };
221
222 public:
223 /**
224 * \brief Convert between UTF-N encoded strings/spans (including the special case of ASCII, and Latin1) (e.g. UTF8 to UTF32), throw on failure
225 *
226 * \note Failures
227 * Failures can be because of illegal input code-point or source exhausted. If you want to deal with partial characters,
228 * you must use the ConvertQuietly API.
229 *
230 * For overloads taking a target span:
231 * \pre size of target span must be at least as large as specified by ComputeTargetBufferSize
232 *
233 * Wrapper on ConvertQuietly, that throws when bad source data input, and asserts out when bad target size (insufficient for buffer).
234 *
235 * Variations from char8_t are overloaded to optionally take a multibyteConversionState parameter.
236 *
237 * The types
238 * o char
239 * o wchar_t
240 * are mapped to the appropriate above type.
241 *
242 * Source and target spans can be of any IUNICODECanUnambiguouslyConvertFrom character type (but source const and target non-const)
243 * (or basic_string of said)...
244 *
245 * \par Example Usage
246 * \code
247 * StackBuffer<wchar_t> buf{Memory::eUninitialized, UTFConvert::ComputeTargetBufferSize<wchar_t> (src)};
248 * auto result = UTFConvert::kThe.Convert (src, span{buf});
249 * return String{buf.begin (), buf.begin () + result.fTargetProduced}; // OR better yet see ConvertSpan
250 * \endcode
251 *
252 * @see ConvertQuietly for span overloads
253 *
254 * String overloads are simple wrappers on the span code but with simpler to use arguments:
255 * \par Example Usage
256 * \code
257 * wstring wide_fred = UTFConvert::kThe.Convert<wstring> (u8"fred");
258 * u16string u16_fred = UTFConvert::kThe.Convert<u16string> (U"fred");
259 * \endcode
260 */
261 template <IUNICODECanUnambiguouslyConvertFrom SRC_T, IUNICODECanUnambiguouslyConvertTo TRG_T>
262 nonvirtual ConversionResult Convert (span<const SRC_T> source, span<TRG_T> target) const;
263 template <IUNICODECanUnambiguouslyConvertFrom SRC_T, IUNICODECanUnambiguouslyConvertTo TRG_T>
264 nonvirtual ConversionResult Convert (span<SRC_T> source, span<TRG_T> target) const;
265 template <typename TO, typename FROM>
266 nonvirtual TO Convert (const FROM& from) const
267 requires ((same_as<TO, string> or same_as<TO, wstring> or same_as<TO, u8string> or same_as<TO, u16string> or same_as<TO, u32string>) and
268 (same_as<FROM, string> or same_as<FROM, wstring> or same_as<FROM, u8string> or same_as<FROM, u16string> or same_as<FROM, u32string>));
269
270 public:
271 /**
272 * \brief used for ConvertQuietly
273 *
274 * \note no need to have status code for 'targetExhausted' because we assert error in that case. DONT DO IT.
275 */
277 /**
278 * conversion successful
279 */
280 ok,
281
282 /**
283 * partial character in source, but hit end
284 */
285 sourceExhausted,
286
287 /**
288 * Source sequence is illegal/malformed - only generated if fOptions.fInvalidCharacterReplacement == nullopt
289 */
290 sourceIllegal
291 };
292
293 public:
294 /** \brief ConversionResult extended with a ConversionStatusFlag; this is the type returned by ConvertQuietly ().
295 */
296 struct ConversionResultWithStatus : ConversionResult {
297 ConversionStatusFlag fStatus{}; // value-initialized when not explicitly set
298 };
299
300 public:
301 /**
302 * \brief Convert UTF encoded (char8_t, char16_t, char32_t, char, wchar_t, ASCII, Latin1) characters to and from each other without format exceptions (still may raise memory exceptions if not enough space)
303 *
304 * \see Convert () above for details. This only differs from Convert, in that it returns a result flag instead
305 * of throwing on errors.
306 *
307 * \note - possible error status values include 'illegal source', and 'source exhausted'. Source exhausted isn't always an
308 * error, but it is more often than not, so it's treated as an error, and you must special case handling if you want
309 * to treat otherwise.
310 *
311 * So - ConvertQuietly () of many characters, but where the LAST character is incomplete WILL convert all the data up to the last
312 * character, return the number of characters consumed and produced, but ALSO indicate the source exhausted status - not OK.
313 *
314 * In case of errors, the return value still indicates how many characters were consumed before the error occurred, and the
315 * target produced before the error occurred.
316 *
317 * Source and target spans can be of any IUNICODECanUnambiguouslyConvertFrom character type (but source const and target non-const)
318 *
319 * \pre target.size () >= ComputeTargetBufferSize<TRG_T> (source)
320 *
321 * \note, if given illegal UTF-8, or illegal ASCII source input, this will either return
322 * with fStatus==sourceIllegal (if fOptions.fInvalidCharacterReplacement == nullopt), or will just use that
323 * fInvalidCharacterReplacement character, and treat this as not an error.
324 *
325 * \note multibyteConversionState is often ignored, but since some implementations may use it, it is required (to allow
326 * interface as a whole to always work without knowing which implementations require it).
327 */
328 template <IUNICODECanUnambiguouslyConvertFrom SRC_T, IUNICODECanUnambiguouslyConvertTo TRG_T>
329 nonvirtual ConversionResultWithStatus ConvertQuietly (span<const SRC_T> source, span<TRG_T> target) const
330 requires (not is_const_v<TRG_T>);
331
332 public:
333 /**
334 * See what the given offset in the source text translates to in the target text
335 *
336 * For example, if you are translating UTF32 text to UTF8 text, the 3rd character in
337 * UTF32 text would start at offset 3, but in the corresponding UTF8 text it might
338 * start at offset 6.
339 */
340 template <IUNICODECanUnambiguouslyConvertTo TRG_T, IUNICODECanUnambiguouslyConvertFrom SRC_T>
341 nonvirtual size_t ConvertOffset (span<const SRC_T> source, size_t srcIndex) const;
342
343 public:
344 /**
345 * Given a span, return the number of code-point units in the (full UNICODE) character, or return nullopt if the span of characters is invalid or incomplete
346 *
347 * if argument span empty, or insufficient to complete a character, this returns nullopt.
348 *
349 * For example, if CHAR_T == char32_t, or Character, this returns 1.
350 * If CHAR_T is ASCII, this returns 1.
351 */
352 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
353 static constexpr optional<size_t> NextCharacter (span<const CHAR_T> s);
354
355 public:
356 /**
357 * Given a span of UTF-encoded characters, return the number of characters (unicode code points) in the span, or nullopt if any character is incomplete/invalid
358 * (should we throw or skip or ???) - not sure
359 *
360 * \note for 'char' - the characters are ASSUMED/REQUIRED to be ASCII
361 */
362 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
363 static constexpr optional<size_t> ComputeCharacterLength (span<const CHAR_T> s);
364
365 public:
366 /**
367 * \brief Nearly always use this default UTFConvert.
368 */
369 static const UTFConvert kThe;
370
371 public:
372 static void Throw (ConversionStatusFlag cr, size_t errorAtSourceOffset);
373
374 private:
375 // find same size, and then remove_const, and then add back const
376 template <typename SRC_OF_CONSTNESS_T, typename TYPE_T>
377 using AddConstIfMatching_ = conditional_t<is_const_v<SRC_OF_CONSTNESS_T>, add_const_t<TYPE_T>, TYPE_T>;
378 template <typename TYPE_T>
379 using MapSizes_ =
380 conditional_t<sizeof (TYPE_T) == 1, conditional_t<same_as<TYPE_T, Latin1>, TYPE_T, char8_t>, conditional_t<sizeof (TYPE_T) == 2, char16_t, char32_t>>;
381 template <typename TYPE_T>
382 using CompatibleT_ = AddConstIfMatching_<TYPE_T, MapSizes_<TYPE_T>>;
383
384 private:
385 // need generic way to convert char to char8_t, and wchar_t to char16_t or char32_t, Character etc
386 template <IUNICODECanUnambiguouslyConvertFrom FromT>
387 static constexpr span<CompatibleT_<FromT>> ConvertToPrimitiveSpan_ (span<FromT> f);
388
389 private:
390 Options fOriginalOptions_;
391 Options fUsingOptions;
392
393#if qStroika_Foundation_Common_Platform_Windows
394 private:
395 static ConversionResultWithStatus ConvertQuietly_Win32_ (span<const char8_t> source, span<char16_t> target);
396 static ConversionResultWithStatus ConvertQuietly_Win32_ (span<const char16_t> source, span<char8_t> target);
397#endif
398
399 private:
400 static ConversionResultWithStatus ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement,
401 span<const char8_t> source, span<char16_t> target);
402 static ConversionResultWithStatus ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement,
403 span<const char8_t> source, span<char32_t> target);
404 static ConversionResultWithStatus ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement,
405 span<const char16_t> source, span<char32_t> target);
406 static ConversionResultWithStatus ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement,
407 span<const char32_t> source, span<char16_t> target);
408 static ConversionResultWithStatus ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement,
409 span<const char32_t> source, span<char8_t> target);
410 static ConversionResultWithStatus ConvertQuietly_StroikaPortable_ (optional<Character> invalidCharacterReplacement,
411 span<const char16_t> source, span<char8_t> target);
412
413#if __has_include("boost/locale/encoding_utf.hpp")
414 private:
415 static ConversionResultWithStatus ConvertQuietly_boost_locale_ (span<const char8_t> source, const span<char16_t> target);
416#endif
417
418 private:
419 static ConversionResultWithStatus ConvertQuietly_codeCvt_ (span<const char8_t> source, span<char16_t> target);
420 static ConversionResultWithStatus ConvertQuietly_codeCvt_ (span<const char8_t> source, span<char32_t> target);
421 static ConversionResultWithStatus ConvertQuietly_codeCvt_ (span<const char16_t> source, span<char8_t> target);
422 static ConversionResultWithStatus ConvertQuietly_codeCvt_ (span<const char32_t> source, span<char8_t> target);
423
424 private:
425 static void ThrowIf_ (ConversionStatusFlag cr, size_t errorAtSourceOffset);
426
427 //********************** DEPRECATED BELOW ****************************
428 public:
429 enum [[deprecated ("Since Stroika v3.0d1, use class UTFConvert")]] LEGACY_ConversionResult {
430 conversionOK, /* conversion successful */
431 sourceExhausted, /* partial character in source, but hit end */
432 targetExhausted, /* insufficient room in target for conversion */
433 sourceIllegal /* source sequence is illegal/malformed */
434 }; // status codes returned by the deprecated pointer-based ConvertQuietly overload declared in this class
435 enum [[deprecated ("Since Stroika v3.0d1, use class UTFConvert")]] ConversionFlags {
436 strictConversion = 0,
437 lenientConversion
438 }; // parameter type of the deprecated pointer-based Convert/ConvertQuietly overloads; their bodies in this header never read the value
439
440 /**
441 */
442 [[deprecated ("Since Stroika v3.0d1, could support, but not clearly any reason")]] static bool IsLegalUTF8Sequence (const char* source,
443 const char* sourceEnd);
444 [[deprecated ("Since Stroika v3.0d1, could support, but not clearly any reason")]] static bool IsLegalUTF8Sequence (const char8_t* source,
445 const char8_t* sourceEnd);
446
447 template <typename FROM, typename TO>
448 [[deprecated ("Since Stroika v3.0d1, use class UTFConvert")]] static inline size_t
449 QuickComputeConversionOutputBufferSize (const FROM* sourceStart, const FROM* sourceEnd)
450 { // Deprecated pointer-pair wrapper. NOTE: template arguments here are <FROM, TO>, the reverse of ComputeTargetBufferSize<TO, FROM>.
451 return UTFConvert::ComputeTargetBufferSize<TO> (span<const FROM>{sourceStart, sourceEnd}); // views [sourceStart, sourceEnd) as a span; result is in units of TO
452 }
453 DISABLE_COMPILER_MSC_WARNING_START (4996);
454 DISABLE_COMPILER_GCC_WARNING_START ("GCC diagnostic ignored \"-Wdeprecated-declarations\"");
455 DISABLE_COMPILER_CLANG_WARNING_START ("clang diagnostic ignored \"-Wdeprecated-declarations\"");
456 template <typename FROM, typename TO>
457 [[deprecated ("Since Stroika v3.0d1, use class UTFConvert::kThe")]] static LEGACY_ConversionResult
458 ConvertQuietly (const FROM** sourceStart, const FROM* sourceEnd, TO** targetStart, TO* targetEnd, [[maybe_unused]] ConversionFlags flags)
459 { // Deprecated pointer-pair adapter over kThe.ConvertQuietly (span, span); 'flags' is accepted for source compatibility but not consulted.
460 auto r = UTFConvert::kThe.ConvertQuietly (span{*sourceStart, sourceEnd}, span{*targetStart, targetEnd});
461 *sourceStart += r.fSourceConsumed; // result is a struct (not a tuple): advance the caller's cursors via its named fields
462 *targetStart += r.fTargetProduced;
463 switch (r.fStatus) { // map the new status flag onto the legacy enum (targetExhausted is never produced here)
464 case ConversionStatusFlag::ok:
465 return LEGACY_ConversionResult::conversionOK;
466 case ConversionStatusFlag::sourceExhausted:
467 return LEGACY_ConversionResult::sourceExhausted;
468 case ConversionStatusFlag::sourceIllegal:
469 return LEGACY_ConversionResult::sourceIllegal;
470 default:
471 AssertNotReached (); // all ConversionStatusFlag enumerators are handled above
472 return LEGACY_ConversionResult::sourceIllegal;
473 }
474 }
475 template <typename FROM, typename TO>
476 [[deprecated ("Since Stroika v3.0d1, use class UTFConvert::kThe")]] static inline void
477 Convert (const FROM** sourceStart, const FROM* sourceEnd, TO** targetStart, TO* targetEnd, ConversionFlags /*flags*/)
478 { // Deprecated pointer-pair adapter over kThe.Convert (span, span); on return *sourceStart / *targetStart are advanced past what was consumed / produced.
479 RequireNotNull (sourceStart);
480 RequireNotNull (targetStart);
481 Require ((static_cast<size_t> (targetEnd - *targetStart) >= QuickComputeConversionOutputBufferSize<FROM, TO> (*sourceStart, sourceEnd)));
482 // Convert () returns a ConversionResult struct (not a tuple), so read its named fields rather than using get<N>.
483 auto r = UTFConvert::kThe.Convert (span{*sourceStart, sourceEnd}, span{*targetStart, targetEnd});
484 *sourceStart += r.fSourceConsumed;
485 *targetStart += r.fTargetProduced;
486 }
487 DISABLE_COMPILER_MSC_WARNING_END (4996);
488 DISABLE_COMPILER_GCC_WARNING_END ("GCC diagnostic ignored \"-Wdeprecated-declarations\"");
489 DISABLE_COMPILER_CLANG_WARNING_END ("clang diagnostic ignored \"-Wdeprecated-declarations\"");
490 };
491
492 /**
493 * This is a function that takes a span of bytes, and an OPTIONAL mbstate_t (TBD), and targetBuffer, translates into targetBuffer, and returns the changes.
494 * This utility wrapper function is meant to capture what you can easily put together from a (configured or default) UTFConvert,
495 * but in a form more easily used/consumed by the BinaryToText::Reader code.
496 *
497 * @todo NEED EXAMPLE OR TO LOSE THIS... -- LGP 2023-09-12
498 */
499 template <typename OUTPUT_CHAR_T>
500 using UTFCodeConverter = function<UTFConvert::ConversionResult (span<const byte> source, span<OUTPUT_CHAR_T> targetBuffer)>;
501
502}
503
504/*
505 ********************************************************************************
506 ***************************** Implementation Details ***************************
507 ********************************************************************************
508 */
509#include "UTFConvert.inl"
510
511#endif /*_Stroika_Foundation_Characters_UTFConvert_h_*/
#define RequireNotNull(p)
Definition Assertions.h:347
#define AssertNotReached()
Definition Assertions.h:355
UTFConvert is designed to provide mappings between various UTF encodings of UNICODE characters.
Definition UTFConvert.h:83
nonvirtual ConversionResult Convert(span< const SRC_T > source, span< TRG_T > target) const
Convert between UTF-N encoded strings/spans (including the special case of ASCII, and Latin1) (e....
static const UTFConvert kThe
Nearly always use this default UTFConvert.
Definition UTFConvert.h:369
nonvirtual ConversionResultWithStatus ConvertQuietly(span< const SRC_T > source, span< TRG_T > target) const
Convert UTF encoded (char8_t, char16_t, char32_t, char, wchar_t, ASCII, Latin1) characters to from ea...
static constexpr bool AllFitsInTwoByteEncoding(span< const CHAR_T > s) noexcept
ConversionStatusFlag
used for ConvertQuietly
Definition UTFConvert.h:276
static constexpr optional< size_t > ComputeCharacterLength(span< const CHAR_T > s)
static constexpr optional< size_t > NextCharacter(span< const CHAR_T > s)
nonvirtual span< TRG_T > ConvertSpan(span< const SRC_T > source, span< TRG_T > target) const
Convert between UTF-N encoded (including the special case of ASCII, and Latin1) character spans (e....
nonvirtual size_t ConvertOffset(span< const SRC_T > source, size_t srcIndex) const
static constexpr size_t ComputeTargetBufferSize(span< const FROM > src)
UnicodeExternalEncodings
list of external UNICODE character encodings, for file IO (eDEFAULT = eUTF8)
Definition UTFConvert.h:31
function< UTFConvert::ConversionResult(span< const byte > source, span< OUTPUT_CHAR_T > targetBuffer)> UTFCodeConverter
Definition UTFConvert.h:500