4#if qStroika_Foundation_Common_Platform_Windows
11#include "Stroika/Foundation/Memory/Common.h"
// Helper taking an optional<T> by const reference and returning a T; it is invoked elsewhere in
// this file as Private_::ValueOf_ on the optional fPreferredImplementation setting.
// NOTE(review): only the signature is visible in this listing - how an empty optional is handled
// cannot be determined from here.
19 inline T ValueOf_ (
const optional<T>& t)
// Compiled only when the qCompilerAndStdLib_DefaultMemberInitializerNeededEnclosingForDefaultFunArg_Buggy
// workaround is active: a constructor that delegates to the Options-taking constructor, passing a
// default-constructed Options.
// NOTE(review): the constructor's signature line is not visible in this listing - presumably the
// no-argument constructor; confirm against the full source.
31#if qCompilerAndStdLib_DefaultMemberInitializerNeededEnclosingForDefaultFunArg_Buggy
33 : UTFConvert{Options{}}
37 constexpr UTFConvert::UTFConvert (
const Options& options)
38 : fOriginalOptions_{options}
39 , fUsingOptions{options}
41#if qStroika_Foundation_Common_Platform_Windows
42 if (fUsingOptions.fPreferredImplementation == nullopt and options.fInvalidCharacterReplacement == nullopt) {
43 fUsingOptions.fPreferredImplementation = Options::Implementation::eWindowsAPIWide2FromMultibyte;
46 if (fUsingOptions.fPreferredImplementation == nullopt) {
47 fUsingOptions.fPreferredImplementation = Options::Implementation::eStroikaPortable;
49 if (options.fInvalidCharacterReplacement) {
53 Require (fUsingOptions.fPreferredImplementation == Options::Implementation::eStroikaPortable);
56 inline constexpr auto UTFConvert::GetOptions () const -> Options
58 return fOriginalOptions_;
// Inspects the leading code unit(s) of a span (named `s` below) of CHAR_T and produces an
// optional<size_t>; the failure paths visible here return an empty optional.
// NOTE(review): the signature line and several statements are not visible in this listing. Other code
// in this file calls NextCharacter (span) and treats the result as an optional code-unit count, which
// matches this body - confirm against the full source.
61 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
65 return optional<size_t>{};
// ASCII and Latin1 share a single compile-time branch (its body is not visible here).
68 if constexpr (same_as<CHAR_T, ASCII> or same_as<CHAR_T, Latin1>) {
// UTF-8 branch: the sequence is classified from the high bits of its first byte.
71 else if constexpr (same_as<CHAR_T, char8_t>) {
74 uint8_t firstByte =
static_cast<uint8_t
> (*i);
// NOTE(review): the patterns tested below (0b0, 0b110, 0b1110, 0b11110) look like the UTF-8 lead-byte
// prefixes for 1-, 2-, 3- and 4-byte sequences; the exact bit-range convention of
// Memory::BitSubstring is defined outside this listing - confirm.
75 if (Memory::BitSubstring (firstByte, 7, 8) == 0b0) {
80 if (Memory::BitSubstring (firstByte, 5, 8) == 0b110) {
// Empty optional when iterator `i` has reached the end of `s`; otherwise a length of 2.
81 return i == s.end () ? optional<size_t>{} : 2;
86 if (Memory::BitSubstring (firstByte, 4, 8) == 0b1110) {
// Same end-of-span check, length of 3.
87 return i == s.end () ? optional<size_t>{} : 3;
92 if (Memory::BitSubstring (firstByte, 3, 8) == 0b11110) {
// Same end-of-span check, length of 4.
93 return i == s.end () ? optional<size_t>{} : 4;
// Separate compile-time branches for 2-byte and 4-byte code units (bodies not visible here).
98 else if constexpr (
sizeof (CHAR_T) == 2) {
105 else if constexpr (
sizeof (CHAR_T) == 4) {
// Steps through span `s` by repeatedly asking NextCharacter about the remainder starting at offset `i`.
// NOTE(review): the signature line, the loop body and the return statements are not visible in this
// listing. Other code in this file calls ComputeCharacterLength (src) and tests the result for
// validity, which suggests this is that function - confirm against the full source.
111 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
// 4-byte code units get their own compile-time branch (body not visible here).
114 if constexpr (
sizeof (CHAR_T) == 4) {
// The loop runs for as long as NextCharacter yields a value for the remaining subspan.
120 while (optional<size_t> nOctets =
NextCharacter (s.subspan (i))) {
// Inside the loop: reaching exactly the end of the span is marked as the unlikely case.
123 if (i == s.size ()) [[unlikely]] {
// After the loop: having consumed the whole span is marked as the likely case.
127 if (s.size () == i) [[likely]] {
// Compile-time dispatch on the code-unit widths of FROM and TO.
// NOTE(review): the signature line and the value produced by each branch are not visible in this
// listing. A sibling overload ends with ComputeTargetBufferSize<TO, FROM> (src.size ()), which
// presumably resolves to this function - confirm against the full source.
135 template <IUNICODECanUnambiguouslyConvertTo TO, IUNICODECanUnambiguouslyConvertFrom FROM>
// Equal-width source and target code units.
138 if constexpr (
sizeof (FROM) ==
sizeof (TO)) {
139 if constexpr (same_as<FROM, TO>) {
// Latin1 -> char8_t is singled out even though both are one byte wide.
142 if constexpr (same_as<FROM, Latin1> and same_as<TO, char8_t>) {
// Remaining cases are keyed first on the source width, then on the target width.
147 if constexpr (
sizeof (FROM) == 1) {
152 else if constexpr (
sizeof (FROM) == 2) {
153 if constexpr (
sizeof (TO) == 1) {
// NOTE(review): presumably the alternative to the 1-byte-target branch above (2-byte source, 4-byte
// target); the enclosing else is not visible here.
162 Require (
sizeof (TO) == 4);
166 else if constexpr (
sizeof (FROM) == 4) {
167 if constexpr (
sizeof (TO) == 1) {
172 else if constexpr (
sizeof (TO) == 2) {
// Variant that is given the actual source data (`src`). For some width combinations it looks at the
// data itself; the final visible statement falls back to the size-only computation
// ComputeTargetBufferSize<TO, FROM> (src.size ()).
// NOTE(review): the signature line and the values returned from the inner branches are not visible in
// this listing.
184 template <IUNICODECanUnambiguouslyConvertTo TO, IUNICODECanUnambiguouslyConvertFrom FROM>
186 requires (not is_const_v<TO>)
// Data inspection is only considered when source and target code-unit widths differ.
196 if constexpr (
sizeof (FROM) !=
sizeof (TO)) {
197 if constexpr (
sizeof (FROM) == 1) {
200 if constexpr (
sizeof (TO) == 4) {
// Only bother when the straightforward estimate (one TO per source unit) would exceed
// Memory::kStackBuffer_TargetInlineByteBufferSize.
201 if (src.size () *
sizeof (TO) > Memory::kStackBuffer_TargetInlineByteBufferSize) {
// Proceeds only if ComputeCharacterLength produced a value for src.
202 if (
auto i = ComputeCharacterLength (src)) {
208 else if constexpr (
sizeof (FROM) == 4) {
209 if constexpr (
sizeof (TO) == 1) {
// Same threshold test, using 4 target units per source unit as the estimate.
214 if (src.size () * 4 > Memory::kStackBuffer_TargetInlineByteBufferSize) {
// Per-character ASCII test (the enclosing loop is not visible here).
218 if (isascii (
static_cast<char32_t> (c))) {
// Fallback: size-only computation based on the number of source code units.
230 return ComputeTargetBufferSize<TO, FROM> (src.size ());
// Forwarding overload: re-expresses `src` through Memory::ConstSpan and delegates to
// ComputeTargetBufferSize<TO> on the result.
// NOTE(review): the signature line is not visible in this listing - presumably the overload taking a
// span of non-const FROM; confirm against the full source.
232 template <IUNICODECanUnambiguouslyConvertTo TO, IUNICODECanUnambiguouslyConvertFrom FROM>
234 requires (not is_const_v<TO>)
236 return ComputeTargetBufferSize<TO> (Memory::ConstSpan (src));
// Scans span `s`, with a separate compile-time branch per character type, flagging values above 0xd7ff
// (or, for UTF-8, sequences NextCharacter cannot size or that are longer than 2 code units).
// NOTE(review): the signature line, loop headers and return statements are not visible in this
// listing; the checks are consistent with a predicate such as AllFitsInTwoByteEncoding - confirm the
// name and the returned values against the full source.
238 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
// ASCII / Latin1 are dispatched first (branch body not visible here).
241 if constexpr (same_as<CHAR_T, ASCII> or same_as<CHAR_T, Latin1>) {
// Character: compare each character code against 0xd7ff; exceeding it is the unlikely case.
248 if constexpr (same_as<remove_cv_t<CHAR_T>,
Character>) {
250 if (c.GetCharacterCode () > 0xd7ff) [[unlikely]] {
255 else if constexpr (same_as<remove_cv_t<CHAR_T>,
Latin1>) {
// UTF-8: walk the buffer with raw pointers, sizing each sequence via NextCharacter.
258 else if constexpr (same_as<remove_cv_t<CHAR_T>,
char8_t>) {
259 const char8_t* b = s.data ();
260 const char8_t* e = b + s.size ();
// No increment in the for-header: the advance of `i` happens in the (not visible) loop body.
261 for (
const char8_t* i = b; i < e;) {
262 auto n = NextCharacter (span<const char8_t>{i, e});
// Unsizable sequence, or one occupying more than 2 code units.
263 if (not n.has_value () or *n > 2) [[unlikely]] {
// Remaining character types: compare the value as unsigned against 0xd7ff.
271 if (
static_cast<make_unsigned_t<CHAR_T>
> (c) > 0xd7ff) [[unlikely]] {
// Throwing front-end over ConvertQuietly: performs the conversion from `source` into `target`, then
// hands the resulting status and the number of source units consumed to ThrowIf_.
// NOTE(review): the statement that returns the ConversionResult is not visible in this listing.
278 template <IUNICODECanUnambiguouslyConvertFrom SRC_T, IUNICODECanUnambiguouslyConvertTo TRG_T>
279 inline auto UTFConvert::Convert (span<const SRC_T> source, span<TRG_T> target)
const -> ConversionResult
// Precondition: target must be at least as large as ComputeTargetBufferSize<TRG_T> (source).
281 Require ((target.size () >= ComputeTargetBufferSize<TRG_T> (source)));
282 ConversionResultWithStatus result = ConvertQuietly (source, target);
// fSourceConsumed is passed along as the source offset at which an error is reported.
283 ThrowIf_ (result.fStatus, result.fSourceConsumed);
286 template <IUNICODECanUnambiguouslyConvertFrom SRC_T, IUNICODECanUnambiguouslyConvertTo TRG_T>
287 inline auto UTFConvert::Convert (span<SRC_T> source, span<TRG_T> target)
const -> ConversionResult
289 return Convert (Memory::ConstSpan (source), target);
// String-to-string conversion between the std string types listed in the requires-clause
// (string, wstring, u8string, u16string, u32string), implemented on top of the span-based Convert.
// NOTE(review): the signature line and the body of the same-type branch are not visible in this listing.
291 template <
typename TO,
typename FROM>
293 requires ((same_as<TO, string> or same_as<TO, wstring> or same_as<TO, u8string> or same_as<TO, u16string> or same_as<TO, u32string>) and
294 (same_as<FROM, string> or same_as<FROM, wstring> or same_as<FROM, u8string> or same_as<FROM, u16string> or same_as<FROM, u32string>))
// Identical source and target types get their own branch.
296 if constexpr (same_as<TO, FROM>) {
// Size a temporary buffer from the source contents; it is created without initializing its elements.
300 size_t cvtBufSize = ComputeTargetBufferSize<typename TO::value_type> (span{from});
301 Memory::StackBuffer<typename TO::value_type> buf{Memory::eUninitialized, cvtBufSize};
// Build the result from the buffer start plus element 1 of the conversion result.
// NOTE(review): element 1 is presumably fTargetProduced (the second field named in the designated
// initializers used elsewhere in this file) - confirm.
302 return TO{buf.begin (), get<1> (Convert (span{from}, span{buf}))};
// Runs Convert (source, target) and returns the leading part of `target` that was actually written:
// a span starting at target.data () whose length is the fTargetProduced count reported by Convert.
// NOTE(review): the signature line is not visible in this listing.
305 template <IUNICODECanUnambiguouslyConvertFrom SRC_T, IUNICODECanUnambiguouslyConvertTo TRG_T>
307 requires (not is_const_v<TRG_T>)
309 return span{target.data (),
Convert (source, target).fTargetProduced};
// Forwarding overload: re-expresses `source` through Memory::ConstSpan and delegates to ConvertSpan.
// NOTE(review): the signature line is not visible in this listing - presumably the overload taking a
// span of non-const SRC_T; confirm against the full source.
311 template <IUNICODECanUnambiguouslyConvertFrom SRC_T, IUNICODECanUnambiguouslyConvertTo TRG_T>
313 requires (not is_const_v<TRG_T>)
315 return ConvertSpan (Memory::ConstSpan (source), target);
// Non-throwing conversion of `source` into `target`, reporting a ConversionResultWithStatus.
// Trivial cases (identical types, identical underlying primitive types, Latin1 sources) are handled
// inline; everything else is dispatched on the effective fPreferredImplementation, with the portable
// Stroika implementation as the final fallback.
// NOTE(review): the signature line and a number of statements (branch bodies, closing braces, #endif
// lines) are not visible in this listing; the visible text is kept as-is apart from the fix noted in
// the eCodeCVT case.
317 template <IUNICODECanUnambiguouslyConvertFrom SRC_T, IUNICODECanUnambiguouslyConvertTo TRG_T>
319 requires (not is_const_v<TRG_T>)
// Precondition: target must be at least as large as ComputeTargetBufferSize<TRG_T> (source).
321 Require ((target.size () >= ComputeTargetBufferSize<TRG_T> (source)));
// Element types of the spans produced by ConvertToPrimitiveSpan_ for source and target.
322 using PRIMITIVE_SRC_T =
typename decltype (this->ConvertToPrimitiveSpan_ (source))::value_type;
323 using PRIMITIVE_TRG_T =
typename decltype (this->ConvertToPrimitiveSpan_ (target))::value_type;
// Same type on both sides: plain copy, consumed == produced == source.size ().
324 if constexpr (same_as<SRC_T, TRG_T>) {
325 Memory::CopyBytes (source, target);
326 return ConversionResultWithStatus{{.fSourceConsumed = source.size (), .fTargetProduced = source.size ()}, ConversionStatusFlag::ok};
// Different types that map to the same (wider than one byte) primitive type: copy after viewing the
// source as a span of const TRG_T.
328 else if constexpr (same_as<PRIMITIVE_SRC_T, PRIMITIVE_TRG_T> and
sizeof (PRIMITIVE_SRC_T) != 1) {
329 static_assert (not same_as<SRC_T, TRG_T>);
330 static_assert (
sizeof (SRC_T) ==
sizeof (TRG_T));
331 Memory::CopyBytes (Memory::SpanBytesCast<span<const TRG_T>> (source), target);
332 return ConversionResultWithStatus{{.fSourceConsumed = source.size (), .fTargetProduced = source.size ()}, ConversionStatusFlag::ok};
// Latin1 sources are converted inline.
334 else if constexpr (same_as<SRC_T, Latin1>) {
335 if constexpr (same_as<TRG_T, char8_t>) {
337 char8_t* outPtr = target.data ();
338 for (
const SRC_T ch : source) {
// Two-byte UTF-8 form: lead byte 0xc0 | (ch >> 6), continuation byte 0x80 | low six bits.
// NOTE(review): the condition selecting this path for a given character is not visible here.
343 *outPtr++ = 0xc0 | ch >> 6;
344 *outPtr++ = 0x80 | (ch & 0x3f);
347 Assert (outPtr <= target.data () + target.size ());
// Produced count is the distance written, which may exceed the number of source characters.
348 return ConversionResultWithStatus{
349 {.fSourceConsumed = source.size (), .fTargetProduced =
static_cast<size_t> (outPtr - target.data ())}, ConversionStatusFlag::ok};
// Latin1 to other targets: element-wise copy.
353 Memory::CopySpanData (source, target);
354 return ConversionResultWithStatus{{.fSourceConsumed = source.size (), .fTargetProduced = source.size ()}, ConversionStatusFlag::ok};
// General case: dispatch on the implementation chosen at construction time.
358 switch (Private_::ValueOf_ (fUsingOptions.fPreferredImplementation)) {
359 case Options::Implementation::eStroikaPortable: {
360 return ConvertQuietly_StroikaPortable_ (fUsingOptions.fInvalidCharacterReplacement, ConvertToPrimitiveSpan_ (source),
361 ConvertToPrimitiveSpan_ (target));
363#if qStroika_Foundation_Common_Platform_Windows
// The Win32 path is only taken between 1-byte and 2-byte code units.
364 case Options::Implementation::eWindowsAPIWide2FromMultibyte: {
365 if constexpr ((
sizeof (SRC_T) == 1 and
sizeof (TRG_T) == 2) or (
sizeof (SRC_T) == 2 and
sizeof (TRG_T) == 1)) {
366 return ConvertQuietly_Win32_ (ConvertToPrimitiveSpan_ (source), ConvertToPrimitiveSpan_ (target));
370#if __has_include("boost/locale/encoding_utf.hpp")
// The boost path is only taken for char8_t -> char16_t.
371 case Options::Implementation::eBoost_Locale: {
372 if constexpr (same_as<SRC_T, char8_t> and same_as<TRG_T, char16_t>) {
373 return ConvertQuietly_boost_locale_ (ConvertToPrimitiveSpan_ (source), ConvertToPrimitiveSpan_ (target));
// The codecvt path covers char16_t/char32_t -> char8_t and char8_t -> char16_t/char32_t.
377 case Options::Implementation::eCodeCVT: {
378 if constexpr ((same_as<SRC_T, char16_t> or same_as<SRC_T, char32_t>) and same_as<TRG_T, char8_t>) {
379 return ConvertQuietly_codeCvt_ (source, target);
// FIX: the second alternative must test the TARGET type. It previously tested SRC_T against char32_t,
// which can never hold together with SRC_T == char8_t, so char8_t -> char32_t silently skipped this path.
// NOTE(review): assumes a ConvertQuietly_codeCvt_ overload for (span<const char8_t>, span<char32_t>)
// is declared, mirroring the char32_t -> char8_t case above - confirm in the class declaration.
381 if constexpr (same_as<SRC_T, char8_t> and (same_as<TRG_T, char16_t> or same_as<TRG_T, char32_t>)) {
382 return ConvertQuietly_codeCvt_ (source, target);
// Fallback when the selected implementation does not handle this type combination.
386 return ConvertQuietly_StroikaPortable_ (fUsingOptions.fInvalidCharacterReplacement, ConvertToPrimitiveSpan_ (source),
387 ConvertToPrimitiveSpan_ (target));
// Maps an index into `source` to the corresponding index in the converted output: the first srcIndex
// source units are converted into a scratch buffer and the number of target units produced is returned.
// NOTE(review): the signature line is not visible in this listing; the name ConvertOffset is taken
// from the accompanying declaration summary - confirm.
390 template <IUNICODECanUnambiguouslyConvertTo TRG_T, IUNICODECanUnambiguouslyConvertFrom SRC_T>
393 static_assert (not is_const_v<TRG_T>);
// Precondition: the index may equal source.size () but not exceed it.
396 Require (srcIndex <= source.size ());
// Prefix of the source up to (not including) srcIndex.
397 span<const SRC_T> fakeSrc{source.begin (), srcIndex};
// Scratch output sized for that prefix; its contents are discarded, only the count matters.
398 Memory::StackBuffer<TRG_T> fakeOut{ComputeTargetBufferSize<TRG_T> (fakeSrc)};
399 ConversionResult r =
Convert (fakeSrc, span{fakeOut});
400 return r.fTargetProduced;
402 template <IUNICODECanUnambiguouslyConvertFrom FromT>
403 constexpr auto UTFConvert::ConvertToPrimitiveSpan_ (span<FromT> f) -> span<CompatibleT_<FromT>>
405 return span{(CompatibleT_<FromT>*)f.data (), f.size ()};
407#if qStroika_Foundation_Common_Platform_Windows
408 inline auto UTFConvert::ConvertQuietly_Win32_ (span<const char8_t> source, span<char16_t> target) -> ConversionResultWithStatus
410 if (source.begin () == source.end ()) {
411 return ConversionResultWithStatus{{0, 0}, ConversionStatusFlag::ok};
414 int srcLen =
static_cast<int> (source.size ());
415 int trgLen =
static_cast<int> (target.size ());
416 int convertedLength = ::MultiByteToWideChar (CP_UTF8, 0,
reinterpret_cast<const char*
> (source.data ()), srcLen,
417 reinterpret_cast<WCHAR*
> (&*target.begin ()), trgLen);
418 return ConversionResultWithStatus{{
static_cast<size_t> (srcLen),
419 static_cast<size_t> (convertedLength)},
420 convertedLength == 0 ? ConversionStatusFlag::sourceIllegal : ConversionStatusFlag::ok};
423 inline auto UTFConvert::ConvertQuietly_Win32_ (span<const char16_t> source, span<char8_t> target) -> ConversionResultWithStatus
425 if (source.begin () == source.end ()) {
426 return ConversionResultWithStatus{{0, 0}, ConversionStatusFlag::ok};
429 int srcLen =
static_cast<int> (source.size ());
430 int trgLen =
static_cast<int> (target.size ());
431 int convertedLength = ::WideCharToMultiByte (CP_UTF8, 0,
reinterpret_cast<const WCHAR*
> (source.data ()), srcLen,
432 reinterpret_cast<char*
> (target.data ()), trgLen,
nullptr,
nullptr);
433 return ConversionResultWithStatus{{
static_cast<size_t> (srcLen),
434 static_cast<size_t> (convertedLength)},
435 convertedLength == 0 ? ConversionStatusFlag::sourceIllegal : ConversionStatusFlag::ok};
// Turns a conversion status into an exception by calling Throw (cr, errorAtSourceOffset).
// NOTE(review): the lines deciding when Throw is reached are not visible in this listing - presumably
// only for statuses other than ok; confirm against the full source.
439 inline void UTFConvert::ThrowIf_ (ConversionStatusFlag cr,
size_t errorAtSourceOffset)
445 Throw (cr, errorAtSourceOffset);
448#if __has_include("boost/locale/encoding_utf.hpp")
// UTF-8 -> UTF-16 conversion through boost::locale::conv::utf_to_utf (this code sits under the
// __has_include("boost/locale/encoding_utf.hpp") guard on the preceding line).
449 inline auto UTFConvert::ConvertQuietly_boost_locale_ (span<const char8_t> source,
const span<char16_t> target) -> ConversionResultWithStatus
// Empty input: report {0, 0} / ok without calling boost.
451 if (source.empty ()) {
452 return ConversionResultWithStatus{{0, 0}, ConversionStatusFlag::ok};
// Copy the source into an owning string so that c_str () can be handed to utf_to_utf.
454 basic_string<char8_t> src = basic_string<char8_t>{source.data (), source.size ()};
455 u16string r = boost::locale::conv::utf_to_utf<char16_t> (src.c_str ());
// Copy the converted text into the caller's buffer; the status reported on this path is always ok.
456 Memory::CopyBytes (span<char16_t>{r}, target);
457 return ConversionResultWithStatus{{source.size (), r.size ()}, ConversionStatusFlag::ok};
// NOTE(review): the remaining lines follow an unconditional-looking return and use names (CharIn,
// CharOut, begin, end, conversion_error) not declared in the visible part of this function. They look
// like an excerpt of boost's own utf_to_utf implementation kept for reference, possibly disabled by a
// preprocessor guard or comment that is not visible in this listing - confirm against the full source.
460 char16_t* p = target.begin ();
462 c=utf::utf_traits<char8_t>::template decode<char8_t const *>(begin,end);
463 if(c==utf::illegal || c==utf::incomplete) {
468 utf::utf_traits<char16_t>::template encode(c,p);
473 std::basic_string<CharOut> result;
474 result.reserve(end-begin);
475 typedef std::back_insert_iterator<std::basic_string<CharOut> > inserter_type;
476 inserter_type inserter(result);
479 c=utf::utf_traits<CharIn>::template decode<CharIn const *>(begin,end);
480 if(c==utf::illegal || c==utf::incomplete) {
482 throw conversion_error();
485 utf::utf_traits<CharOut>::template encode<inserter_type>(c,inserter);
491 return ConversionResultWithStatus{{0, 0}, ConversionStatusFlag::ok};
#define AssertNotImplemented()
#define AssertNotReached()
Profile Convert(const VariantValue &v)
nonvirtual ConversionResult Convert(span< const SRC_T > source, span< TRG_T > target) const
Convert between UTF-N encoded strings/spans (including the special case of ASCII, and Latin1) (e....
static const UTFConvert kThe
Nearly always use this default UTFConvert.
nonvirtual ConversionResultWithStatus ConvertQuietly(span< const SRC_T > source, span< TRG_T > target) const
Convert UTF encoded (char8_t, char16_t, char32_t, char, wchar_t, ASCII, Latin1) characters to/from ea...
static constexpr bool AllFitsInTwoByteEncoding(span< const CHAR_T > s) noexcept
static constexpr optional< size_t > ComputeCharacterLength(span< const CHAR_T > s)
static constexpr optional< size_t > NextCharacter(span< const CHAR_T > s)
nonvirtual span< TRG_T > ConvertSpan(span< const SRC_T > source, span< TRG_T > target) const
Convert between UTF-N encoded (including the special case of ASCII, and Latin1) character spans (e....
nonvirtual size_t ConvertOffset(span< const SRC_T > source, size_t srcIndex) const
static constexpr size_t ComputeTargetBufferSize(span< const FROM > src)
constexpr UTFConvert(const Options &options=Options{})