Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
CodeCvt.inl
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include <bit>
5
11
13
14 namespace Private_ {
15 // crazy - from https://en.cppreference.com/w/cpp/locale/codecvt
16 DISABLE_COMPILER_MSC_WARNING_START (4996)
17 template <typename FACET>
18 struct deletable_facet_ : FACET {
19 template <typename... Args>
20 deletable_facet_ (Args&&... args)
21 : FACET{forward<Args> (args)...}
22 {
23 }
24 ~deletable_facet_ () = default;
25 };
26 DISABLE_COMPILER_MSC_WARNING_END (4996)
27 void ThrowErrorConvertingBytes2Characters_ (size_t nSrcCharsWhereError);
28 void ThrowErrorConvertingCharacters2Bytes_ (size_t nSrcCharsWhereError);
29 void ThrowCodePageNotSupportedException_ (CodePage cp);
30 void ThrowCharsetNotSupportedException_ (const Charset& charset);
31 void ThrowInvalidCharacterProvidedDoesntFitWithProvidedCodeCvt_ ();
32 string AsNarrowSDKString_ (const String& s);
33 }
34
35 /*
36 ********************************************************************************
37 ***************************** CodeCvt<CHAR_T>::IRep ****************************
38 ********************************************************************************
39 */
40 template <IUNICODECanAlwaysConvertTo CHAR_T>
41 size_t CodeCvt<CHAR_T>::IRep::_Bytes2Characters (span<const byte> from) const
42 {
43 Memory::StackBuffer<CHAR_T> to{this->ComputeTargetCharacterBufferSize (from)};
44 return this->Bytes2Characters (&from, span{to}).size ();
45 }
46 template <IUNICODECanAlwaysConvertTo CHAR_T>
47 size_t CodeCvt<CHAR_T>::IRep::_Characters2Bytes (span<const CHAR_T> from) const
48 {
49 Memory::StackBuffer<byte> to{this->ComputeTargetByteBufferSize (from)};
50 return this->Characters2Bytes (from, span{to}).size ();
51 }
52
53 /*
54 ********************************************************************************
55 *********************** CodeCvt<CHAR_T>::UTFConvertRep_ ************************
56 ********************************************************************************
57 */
58 template <IUNICODECanAlwaysConvertTo CHAR_T>
59#if qCompilerAndStdLib_template_second_concept_Buggy
60 template <typename SERIALIZED_CHAR_T>
61#else
62 template <IUNICODECanAlwaysConvertTo SERIALIZED_CHAR_T>
63#endif
64 struct CodeCvt<CHAR_T>::UTFConvertRep_ : CodeCvt<CHAR_T>::IRep {
65 using ConversionResult = UTFConvert::ConversionResult;
66 using ConversionResultWithStatus = UTFConvert::ConversionResultWithStatus;
67 using ConversionStatusFlag = UTFConvert::ConversionStatusFlag;
68 UTFConvertRep_ (const Options& o)
69 : fCodeConverter_{o.fInvalidCharacterReplacement
70 ? UTFConvert{UTFConvert::Options{.fInvalidCharacterReplacement = *o.fInvalidCharacterReplacement}}
71 : UTFConvert::kThe}
72 {
73 }
74 virtual Options GetOptions () const override
75 {
76 return Options{.fInvalidCharacterReplacement = fCodeConverter_.GetOptions ().fInvalidCharacterReplacement};
77 }
78 virtual span<CHAR_T> Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const override
79 {
80 RequireNotNull (from);
81 Require (to.size () >= ComputeTargetCharacterBufferSize (*from));
82 span<const SERIALIZED_CHAR_T> serializedFrom = ReinterpretBytes_ (*from);
83 Assert (serializedFrom.size_bytes () <= from->size ()); // note - serializedFrom could be smaller than from in byte-span
84 ConversionResultWithStatus r = fCodeConverter_.ConvertQuietly (serializedFrom, to);
85 if (r.fStatus == ConversionStatusFlag::sourceIllegal) {
86 UTFConvert::Throw (r.fStatus, r.fSourceConsumed);
87 }
88 *from = from->subspan (r.fSourceConsumed * sizeof (SERIALIZED_CHAR_T)); // from updated to remaining data, if any
89 return to.subspan (0, r.fTargetProduced); // point ACTUAL copied data
90 }
91 virtual span<byte> Characters2Bytes (span<const CHAR_T> from, span<byte> to) const override
92 {
93 Require (to.size () >= ComputeTargetByteBufferSize (from));
94 span<SERIALIZED_CHAR_T> serializedTo = ReinterpretBytes_ (to);
95 ConversionResult r = fCodeConverter_.Convert (from, serializedTo); // cannot have sourceExhausted here so no need to call ConvertQuietly
96 Require (r.fSourceConsumed == from.size ()); // always use all input characters
97 return to.subspan (0, r.fTargetProduced * sizeof (SERIALIZED_CHAR_T)); // point ACTUAL copied data
98 }
99 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override
100 {
101 if (const size_t* i = get_if<size_t> (&src)) {
102 return UTFConvert::ComputeTargetBufferSize<CHAR_T, SERIALIZED_CHAR_T> (*i / sizeof (SERIALIZED_CHAR_T));
103 }
104 else {
105 return UTFConvert::ComputeTargetBufferSize<CHAR_T> (ReinterpretBytes_ (get<span<const byte>> (src)));
106 }
107 }
108 virtual size_t ComputeTargetByteBufferSize (variant<span<const CHAR_T>, size_t> src) const override
109 {
110 if (const size_t* i = get_if<size_t> (&src)) {
111 return UTFConvert::ComputeTargetBufferSize<SERIALIZED_CHAR_T, CHAR_T> (*i) * sizeof (SERIALIZED_CHAR_T);
112 }
113 else {
114 return UTFConvert::ComputeTargetBufferSize<SERIALIZED_CHAR_T> (get<span<const CHAR_T>> (src)) * sizeof (SERIALIZED_CHAR_T);
115 }
116 }
117 /*
118 * essentially 'cast' from bytes to from SERIALIZED_CHAR_T (could be char8_t, char16_t or whatever works with UTFConvert)
119 */
120 static span<const SERIALIZED_CHAR_T> ReinterpretBytes_ (span<const byte> s)
121 {
122 return span<const SERIALIZED_CHAR_T>{reinterpret_cast<const SERIALIZED_CHAR_T*> (s.data ()), s.size () / sizeof (SERIALIZED_CHAR_T)};
123 }
124 static span<SERIALIZED_CHAR_T> ReinterpretBytes_ (span<byte> s)
125 {
126 return span<SERIALIZED_CHAR_T>{reinterpret_cast<SERIALIZED_CHAR_T*> (s.data ()), s.size () / sizeof (SERIALIZED_CHAR_T)};
127 }
128 UTFConvert fCodeConverter_;
129 };
130
131 /*
132 ********************************************************************************
133 ********************* CodeCvt<CHAR_T>::Latin1ConvertRep_ ***********************
134 ********************************************************************************
135 */
136 template <IUNICODECanAlwaysConvertTo CHAR_T>
137 struct CodeCvt<CHAR_T>::Latin1ConvertRep_ : CodeCvt<CHAR_T>::IRep {
138 using ConversionResult = UTFConvert::ConversionResult;
139 using ConversionResultWithStatus = UTFConvert::ConversionResultWithStatus;
140 using ConversionStatusFlag = UTFConvert::ConversionStatusFlag;
141 Latin1ConvertRep_ (const Options& o)
142 : fCodeConverter_{o.fInvalidCharacterReplacement
143 ? UTFConvert{UTFConvert::Options{.fInvalidCharacterReplacement = *o.fInvalidCharacterReplacement}}
144 : UTFConvert::kThe}
145 {
146 }
147 virtual Options GetOptions () const override
148 {
149 return Options{.fInvalidCharacterReplacement = fCodeConverter_.GetOptions ().fInvalidCharacterReplacement};
150 }
151 virtual span<CHAR_T> Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const override
152 {
153 RequireNotNull (from);
154 Require (to.size () >= ComputeTargetCharacterBufferSize (*from));
155 span<const Latin1> serializedFrom = ReinterpretBytes_ (*from);
156 Assert (serializedFrom.size_bytes () <= from->size ()); // note - serializedFrom could be smaller than from in bytespan
157 ConversionResultWithStatus r = fCodeConverter_.ConvertQuietly (serializedFrom, to);
158 if (r.fStatus == ConversionStatusFlag::sourceIllegal) {
159 UTFConvert::Throw (r.fStatus, r.fSourceConsumed);
160 }
161 *from = from->subspan (r.fSourceConsumed); // from updated to remaining data, if any
162 return to.subspan (0, r.fTargetProduced); // point ACTUAL copied data
163 }
164 virtual span<byte> Characters2Bytes ([[maybe_unused]] span<const CHAR_T> from, [[maybe_unused]] span<byte> to) const override
165 {
166 RequireNotReached (); // doesn't work in general, so disallow
167 return span<byte>{};
168 }
169 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override
170 {
171 if (const size_t* i = get_if<size_t> (&src)) {
172 return UTFConvert::ComputeTargetBufferSize<CHAR_T, Latin1> (*i / sizeof (Latin1));
173 }
174 else {
175 return UTFConvert::ComputeTargetBufferSize<CHAR_T> (ReinterpretBytes_ (get<span<const byte>> (src)));
176 }
177 }
178 virtual size_t ComputeTargetByteBufferSize ([[maybe_unused]] variant<span<const CHAR_T>, size_t> src) const override
179 {
180 RequireNotReached (); // doesn't work in general, so disallow
181 return 0;
182 }
183 /*
184 * essentially 'cast' from bytes to from Latin1 (could be char8_t, char16_t or whatever works with UTFConvert)
185 */
186 static span<const Latin1> ReinterpretBytes_ (span<const byte> s)
187 {
188 return span<const Latin1>{reinterpret_cast<const Latin1*> (s.data ()), s.size () / sizeof (Latin1)};
189 }
190 static span<Latin1> ReinterpretBytes_ (span<byte> s)
191 {
192 return span<Latin1>{reinterpret_cast<Latin1*> (s.data ()), s.size () / sizeof (Latin1)};
193 }
194 UTFConvert fCodeConverter_;
195 };
196
197 /*
198 ********************************************************************************
199 ****************** CodeCvt<CHAR_T>::UTFConvertSwappedRep_ **********************
200 ********************************************************************************
201 */
202 template <IUNICODECanAlwaysConvertTo CHAR_T>
203#if qCompilerAndStdLib_template_second_concept_Buggy
204 template <typename SERIALIZED_CHAR_T>
205#else
206 template <IUNICODECanAlwaysConvertTo SERIALIZED_CHAR_T>
207#endif
208 struct CodeCvt<CHAR_T>::UTFConvertSwappedRep_ : UTFConvertRep_<SERIALIZED_CHAR_T> {
209 using inherited = UTFConvertRep_<SERIALIZED_CHAR_T>;
210 UTFConvertSwappedRep_ (const Options& o)
211 : inherited{o}
212 {
213 }
214 virtual span<CHAR_T> Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const override
215 {
216 RequireNotNull (from);
217 Require (to.size () >= this->ComputeTargetCharacterBufferSize (*from));
218 auto r = inherited::Bytes2Characters (from, to);
219 for (CHAR_T& i : to) {
220 if constexpr (same_as<CHAR_T, Character>) {
221 i = Character{Common::StdCompat::byteswap (i.template As<char32_t> ())};
222 }
223 else {
224 i = Common::StdCompat::byteswap (i);
225 }
226 }
227 return r;
228 }
229 virtual span<byte> Characters2Bytes (span<const CHAR_T> from, span<byte> to) const override
230 {
231 Require (to.size () >= this->ComputeTargetByteBufferSize (from));
233 for (CHAR_T& i : buf) {
234 if constexpr (same_as<CHAR_T, Character>) {
235 i = Character{Common::StdCompat::byteswap (i.template As<char32_t> ())};
236 }
237 else {
238 i = Common::StdCompat::byteswap (i);
239 }
240 }
241 return inherited::Characters2Bytes (span<const CHAR_T>{buf.begin (), buf.size ()}, to);
242 }
243 };
244
245 /*
246 ********************************************************************************
247 *********************** CodeCvt<CHAR_T>::UTF2UTFRep_ ***************************
248 ********************************************************************************
249 */
250 /*
251 * Utility rep to wrap some kind of rep along with (optional) UTFConvert, to complete
252 * conversion from bytes to/from desired rep generally through some intermediary rep.
253 *
254 * NOTE - this code allows INTERMEDIATE_CHAR_T == CHAR_T special case, and is optimized to do
255 * nothing for that case (or should be - maybe needs a bit more tweaking of implementation for that to be fully true).
256 */
257 template <IUNICODECanAlwaysConvertTo CHAR_T>
258#if qCompilerAndStdLib_template_second_concept_Buggy
259 template <typename INTERMEDIATE_CHAR_T>
260#else
261 template <IUNICODECanAlwaysConvertTo INTERMEDIATE_CHAR_T>
262#endif
263 struct CodeCvt<CHAR_T>::UTF2UTFRep_ : CodeCvt<CHAR_T>::IRep {
264 using ConversionResultWithStatus = UTFConvert::ConversionResultWithStatus;
265 using ConversionStatusFlag = UTFConvert::ConversionStatusFlag;
266 UTF2UTFRep_ (const CodeCvt<INTERMEDIATE_CHAR_T>& origCodeCvt)
267 requires (sizeof (CHAR_T) == sizeof (INTERMEDIATE_CHAR_T))
268 : fBytesVSIntermediateCvt_{origCodeCvt}
269 {
270 }
271 UTF2UTFRep_ (const CodeCvt<INTERMEDIATE_CHAR_T>& origCodeCvt, const UTFConvert& secondStep = {})
272 requires (sizeof (CHAR_T) != sizeof (INTERMEDIATE_CHAR_T))
273 : fBytesVSIntermediateCvt_{origCodeCvt}
274 , fIntermediateVSFinalCHARCvt_{secondStep}
275 {
276 }
277 virtual Options GetOptions () const override
278 {
279 // Not 100% right cuz ignoring - fIntermediateVSFinalCHARCvt_ - LGP - 2023-08-07
280 return Options{.fInvalidCharacterReplacement = fBytesVSIntermediateCvt_.GetOptions ().fInvalidCharacterReplacement};
281 }
282 virtual span<CHAR_T> Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const override
283 {
284 RequireNotNull (from);
285 Require (to.size () >= ComputeTargetCharacterBufferSize (*from) or to.size () >= this->_Bytes2Characters (*from));
286 if constexpr (sizeof (CHAR_T) == sizeof (INTERMEDIATE_CHAR_T)) {
287 return span<CHAR_T>{to.begin (),
288 fBytesVSIntermediateCvt_.Bytes2Characters (from, Memory::SpanBytesCast<span<INTERMEDIATE_CHAR_T>> (to)).size ()};
289 }
290 else {
291 /*
292 * Big picture: fBytesVSIntermediateCvt_ goes bytes -> INTERMEDIATE_CHAR_T, so we use it first.
293 *
294 * BUT - trick - even if we successfully do first conversion (bytes -> INTERMEDIATE_CHAR_T) - we might still get a split
295 * char on the second conversion (RARE). If so - we need to backup in 'from' - to avoid this. Just allege we consumed less. This MIGHT -
296 * in extreme cases - go all the way back to zero.
297 */
298 while (true) {
299 // Because we KNOW everything will fit (disallow target exhausted), we can allocate a temporary buffer for the intermediate state, and be done with
300 // it by the end of this routine (stay stateless)
301 Memory::StackBuffer<INTERMEDIATE_CHAR_T> intermediateBuf{fBytesVSIntermediateCvt_.ComputeTargetCharacterBufferSize (*from)};
302 span<const INTERMEDIATE_CHAR_T> intermediateSpan = fBytesVSIntermediateCvt_.Bytes2Characters (from, intermediateBuf); // shortens 'from' if needed
303
304 // then use fIntermediateVSFinalCHARCvt_ to perform final mapping INTERMEDIATE_CHAR_T -> CHAR_T
305 ConversionResultWithStatus cr = fIntermediateVSFinalCHARCvt_.ConvertQuietly (intermediateSpan, to);
306 switch (cr.fStatus) {
307 case ConversionStatusFlag::sourceIllegal:
308 UTFConvert::Throw (cr.fStatus, cr.fSourceConsumed);
309 case ConversionStatusFlag::sourceExhausted:
310 // TRICKY - if we have at least one character output, then we need to back out bytes 'from' - til this doesn't happen
311 if (not from->empty ()) {
312 *from = from->subspan (0, from->size () - 1);
313 continue; // 'goto try again'
314 }
315 else {
316 return span<CHAR_T>{}; // no update to 'from' since we consumed no characters
317 }
318 case ConversionStatusFlag::ok:
319 return to.subspan (0, cr.fTargetProduced);
320 default:
322 return span<CHAR_T>{};
323 }
324 }
325 }
326 }
327 virtual span<byte> Characters2Bytes (span<const CHAR_T> from, span<byte> to) const override
328 {
329 Require (to.size () >= ComputeTargetByteBufferSize (from) or to.size () >= this->_Characters2Bytes (from));
330 if constexpr (sizeof (CHAR_T) == sizeof (INTERMEDIATE_CHAR_T)) {
331 return fBytesVSIntermediateCvt_.Characters2Bytes (Memory::SpanBytesCast<span<const INTERMEDIATE_CHAR_T>> (from), to);
332 }
333 else {
334 /*
335 * Because we KNOW everything will fit, we can allocate a temporary buffer for the intermediate state, and be done with
336 * it by the end of this routine (stay stateless)
337 */
338 Memory::StackBuffer<INTERMEDIATE_CHAR_T> intermediateBuf{
339 fIntermediateVSFinalCHARCvt_.template ComputeTargetBufferSize<INTERMEDIATE_CHAR_T> (from)};
340
341 /*
342 * first translate to something usable by fBytesVSIntermediateCvt_
343 */
344 span<INTERMEDIATE_CHAR_T> intermediateSpan =
345 fIntermediateVSFinalCHARCvt_.ConvertSpan (from, span<INTERMEDIATE_CHAR_T>{intermediateBuf.data (), intermediateBuf.size ()});
346
347 // Then use fBytesVSIntermediateCvt_, no need to track anything in intermediateBuf, we require all used, no partials etc.
348 return fBytesVSIntermediateCvt_.Characters2Bytes (intermediateSpan, to);
349 }
350 }
351 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override
352 {
353 size_t intermediateCharCntMax = [&] () {
354 if (const size_t* i = get_if<size_t> (&src)) {
355 return fBytesVSIntermediateCvt_.ComputeTargetCharacterBufferSize (*i);
356 }
357 else {
358 return fBytesVSIntermediateCvt_.ComputeTargetCharacterBufferSize (get<span<const byte>> (src));
359 }
360 }();
361 if constexpr (sizeof (CHAR_T) == sizeof (INTERMEDIATE_CHAR_T)) {
362 return intermediateCharCntMax;
363 }
364 else {
365 return fIntermediateVSFinalCHARCvt_.template ComputeTargetBufferSize<INTERMEDIATE_CHAR_T, CHAR_T> (intermediateCharCntMax);
366 }
367 }
368 virtual size_t ComputeTargetByteBufferSize (variant<span<const CHAR_T>, size_t> src) const override
369 {
370 size_t intermediateCharCntMax = [&] () {
371 if constexpr (sizeof (CHAR_T) == sizeof (INTERMEDIATE_CHAR_T)) {
372 if (const size_t* i = get_if<size_t> (&src)) {
373 return *i;
374 }
375 else {
376 return get<span<const CHAR_T>> (src).size ();
377 }
378 }
379 else {
380 if (const size_t* i = get_if<size_t> (&src)) {
381 return fIntermediateVSFinalCHARCvt_.template ComputeTargetBufferSize<INTERMEDIATE_CHAR_T, CHAR_T> (*i);
382 }
383 else {
384 return fIntermediateVSFinalCHARCvt_.template ComputeTargetBufferSize<INTERMEDIATE_CHAR_T> (get<span<const CHAR_T>> (src));
385 }
386 }
387 }();
388 return fBytesVSIntermediateCvt_.ComputeTargetByteBufferSize (intermediateCharCntMax);
389 }
390 CodeCvt<INTERMEDIATE_CHAR_T> fBytesVSIntermediateCvt_;
391 conditional_t<sizeof (CHAR_T) != sizeof (INTERMEDIATE_CHAR_T), UTFConvert, byte> fIntermediateVSFinalCHARCvt_; // would like to remove field if sizeof ==, but not sure how (void doesnt work)
392 };
393
394 /*
395 * This is crazy complicated because codecvt goes out of its way to be hard to copy, hard to move, but with
396 * a little care, can be made to work with unique_ptr.
397 *
398 * Also, std::codecvt doesn't natively support fInvalidCharacterReplacement, so we have to support manually.
399 */
400 template <IUNICODECanAlwaysConvertTo CHAR_T>
401 template <typename STD_CODE_CVT_T>
402 struct CodeCvt<CHAR_T>::CodeCvt_WrapStdCodeCvt_ : CodeCvt<CHAR_T>::IRep {
403 unique_ptr<STD_CODE_CVT_T> fCodeCvt_;
404 optional<Character> fInvalidCharacterReplacement_;
405 optional<span<byte>> fInvalidCharacterReplacementBytes_;
406 using extern_type = typename STD_CODE_CVT_T::extern_type;
407 extern_type fInvalidCharacterReplacementBytesBuf[8]; // WAG at sufficient size, but sb enuf
408 static_assert (same_as<CHAR_T, typename STD_CODE_CVT_T::intern_type>);
409#if qCompilerAndStdLib_arm_asan_FaultStackUseAfterScope_Buggy
410 Stroika_Foundation_Debug_ATTRIBUTE_NO_SANITIZE_ADDRESS
411#endif
412 CodeCvt_WrapStdCodeCvt_ (const Options& options, unique_ptr<STD_CODE_CVT_T>&& codeCvt)
413 : fCodeCvt_{move (codeCvt)}
414 , fInvalidCharacterReplacement_{options.fInvalidCharacterReplacement}
415 {
416 if (fInvalidCharacterReplacement_) {
417 mbstate_t ignoredMBState{};
418 Memory::StackBuffer<CHAR_T> tmpBuf;
419 span<const CHAR_T> invalCharPartlyEncode = fInvalidCharacterReplacement_->As<CHAR_T> (&tmpBuf);
420 const CHAR_T* ignoreCharsConsumed = nullptr;
421 extern_type* bytesInvalChar = fInvalidCharacterReplacementBytesBuf;
422 DISABLE_COMPILER_MSC_WARNING_START (4996)
423 auto r =
424 fCodeCvt_->out (ignoredMBState, invalCharPartlyEncode.data (), invalCharPartlyEncode.data () + invalCharPartlyEncode.size (),
425 ignoreCharsConsumed, fInvalidCharacterReplacementBytesBuf,
426 fInvalidCharacterReplacementBytesBuf + Memory::NEltsOf (fInvalidCharacterReplacementBytesBuf), bytesInvalChar);
427 DISABLE_COMPILER_MSC_WARNING_END (4996)
428 if (r == STD_CODE_CVT_T::ok) {
429 fInvalidCharacterReplacementBytes_ = as_writable_bytes (
430 span{fInvalidCharacterReplacementBytesBuf}.subspan (0, bytesInvalChar - fInvalidCharacterReplacementBytesBuf));
431 }
432 else {
433 Private_::ThrowInvalidCharacterProvidedDoesntFitWithProvidedCodeCvt_ ();
434 }
435 }
436 }
437 virtual Options GetOptions () const override
438 {
439 return Options{.fInvalidCharacterReplacement = fInvalidCharacterReplacement_};
440 }
441 virtual span<CHAR_T> Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const override
442 {
443 RequireNotNull (from);
444 Require (to.size () >= ComputeTargetCharacterBufferSize (*from));
445 const extern_type* _First1 = reinterpret_cast<const extern_type*> (from->data ());
446 const extern_type* _Last1 = _First1 + from->size ();
447 const extern_type* _Mid1 = _First1; // DOUBLE CHECK SPEC - NOT SURE IF THIS IS USED ON INPUT
448 CHAR_T* _First2 = to.data ();
449 CHAR_T* _Last2 = _First2 + to.size ();
450 CHAR_T* _Mid2 = _First2; // DOUBLE CHECK SPEC - NOT SURE IF THIS IS USED ON INPUT
451 mbstate_t ignoredMBState{};
452 size_t bytesDone = 0;
453 size_t charsDone = 0;
454 continueWith:
455 auto r = fCodeCvt_->in (ignoredMBState, _First1 + bytesDone, _Last1, _Mid1, _First2 + charsDone, _Last2, _Mid2);
456 if (r == STD_CODE_CVT_T::partial) {
457 *from = from->subspan (charsDone + static_cast<size_t> (_Mid2 - _First2)); // reference remaining bytes, could be partial character at end of multibyte sequence
458 Assert (from->size () != 0);
459 }
460 else if (r != STD_CODE_CVT_T::ok) {
461 if (fInvalidCharacterReplacement_) {
462 bytesDone = _Mid1 - _First1 + 1; // skip one byte and try again (no idea how many bytes would have been best to skip)
463 charsDone = _Mid2 - _First2;
464
465 Memory::StackBuffer<CHAR_T> badCharTmpBuf;
466 span<const CHAR_T> badCharReplaceSpan = fInvalidCharacterReplacement_->As<CHAR_T> (&badCharTmpBuf);
467 span<CHAR_T> copied = Memory::CopyBytes (badCharReplaceSpan, span{&_First2[charsDone], _Last2});
468 Assert (copied.size () >= 0);
469 charsDone += copied.size ();
470 Assert (charsDone <= to.size ());
471 goto continueWith;
472 }
473 else {
474 Private_::ThrowErrorConvertingBytes2Characters_ (_Mid1 - _First1);
475 }
476 }
477 else {
478 Require (_Mid1 == _Last1);
479 *from = span<const byte>{}; // used all input
480 }
481 return to.subspan (0, _Mid2 - _First2); // point ACTUAL copied data
482 }
483 virtual span<byte> Characters2Bytes (span<const CHAR_T> from, span<byte> to) const override
484 {
485 Require (to.size () >= ComputeTargetByteBufferSize (from));
486 const CHAR_T* _First1 = from.data ();
487 const CHAR_T* _Last1 = _First1 + from.size ();
488 const CHAR_T* _Mid1 = _First1; // DOUBLE CHECK SPEC - NOT SURE IF THIS IS USED ON INPUT
489 extern_type* _First2 = reinterpret_cast<extern_type*> (to.data ());
490 extern_type* _Last2 = _First2 + to.size ();
491 extern_type* _Mid2 = _First2; // DOUBLE CHECK SPEC - NOT SURE IF THIS IS USED ON INPUT
492 mbstate_t ignoredMBState{};
493 size_t charsDone = 0;
494 size_t bytesDone = 0;
495 continueWith:
496 auto r = fCodeCvt_->out (ignoredMBState, _First1 + charsDone, _Last1, _Mid1, _First2 + bytesDone, _Last2, _Mid2);
497 if (r != STD_CODE_CVT_T::ok) {
498 if (fInvalidCharacterReplacement_) {
499 charsDone = _Mid1 - _First1 + 1; // skip one character and try again
500 bytesDone = _Mid2 - _First2;
501 memcpy (_First2 + bytesDone, fInvalidCharacterReplacementBytes_->data (), fInvalidCharacterReplacementBytes_->size ());
502 bytesDone += fInvalidCharacterReplacementBytes_->size ();
503 goto continueWith;
504 }
505 else {
506 Private_::ThrowErrorConvertingCharacters2Bytes_ (_Mid1 - _First1);
507 }
508 }
509 Require (_Mid1 == _Last1); // used all input
510 return to.subspan (0, _Mid2 - _First2); // point ACTUAL copied data
511 }
512 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override
513 {
514 // at most one character per byte, and std::codecvt doesn't appear to offer API to compute better
515 if (const size_t* i = get_if<size_t> (&src)) {
516 return *i;
517 }
518 else {
519 return get<span<const byte>> (src).size ();
520 }
521 }
522 virtual size_t ComputeTargetByteBufferSize (variant<span<const CHAR_T>, size_t> src) const override
523 {
524 if (const size_t* i = get_if<size_t> (&src)) {
525 return (*i) * fCodeCvt_->max_length ();
526 }
527 else {
528 // std::codecvt doesn't appear to provide an API to compute needed buffer length (just the reverse -
529 // for a buffer length, how many bytes consumed).
530 return get<span<const CHAR_T>> (src).size () * fCodeCvt_->max_length ();
531 }
532 }
533 };
534
535 namespace Private_ {
536 // a lot of old, important character sets can be represented this way (like old PC character sets for non-asian languages)
537 struct BuiltinSingleByteTableCodePageRep_ final : CodeCvt<char16_t>::IRep {
538 BuiltinSingleByteTableCodePageRep_ (CodePage cp, optional<Character> invalidCharacterReplacement);
539 virtual ~BuiltinSingleByteTableCodePageRep_ () = default;
540 virtual CodeCvt<char16_t>::Options GetOptions () const override
541 {
542 optional<char16_t> invalRepChar;
543 if (fInvalidCharacterReplacementByte_ != nullopt) {
544 char16_t x;
545 auto byteSpan = span{&*fInvalidCharacterReplacementByte_, 1};
546 (void)this->Bytes2Characters (&byteSpan, span{&x, 1});
547 invalRepChar = x;
548 }
549 return CodeCvt<char16_t>::Options{.fInvalidCharacterReplacement = invalRepChar};
550 }
551 virtual span<char16_t> Bytes2Characters (span<const byte>* from, span<char16_t> to) const override;
552 virtual span<byte> Characters2Bytes (span<const char16_t> from, span<byte> to) const override;
553 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override;
554 virtual size_t ComputeTargetByteBufferSize (variant<span<const char16_t>, size_t> src) const override;
555 const char16_t* fMap_;
556 optional<byte> fInvalidCharacterReplacementByte_;
557 };
558#if qStroika_Foundation_Common_Platform_Windows
559 struct WindowsNative_ final : CodeCvt<char16_t>::IRep {
560 constexpr WindowsNative_ (CodePage cp)
561 : fCodePage_{cp}
562 {
563 }
564 virtual ~WindowsNative_ () = default;
565 virtual CodeCvt<char16_t>::Options GetOptions () const override
566 {
567 return {};
568 }
569 virtual span<char16_t> Bytes2Characters (span<const byte>* from, span<char16_t> to) const override;
570 virtual span<byte> Characters2Bytes (span<const char16_t> from, span<byte> to) const override;
571 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override;
572 virtual size_t ComputeTargetByteBufferSize (variant<span<const char16_t>, size_t> src) const override;
573 CodePage fCodePage_;
574 };
575#endif
576 }
577
578 /*
579 ********************************************************************************
580 ************************* CodeCvt<CHAR_T>::Options *****************************
581 ********************************************************************************
582 */
583 template <IUNICODECanAlwaysConvertTo CHAR_T>
584 template <qCompilerAndStdLib_ConstraintDiffersInTemplateRedeclaration_BWA (IUNICODECanAlwaysConvertTo) FROM_CHAR_T_OPTIONS>
585 constexpr inline auto CodeCvt<CHAR_T>::Options::New (typename CodeCvt<FROM_CHAR_T_OPTIONS>::Options o) -> Options
586 {
587 return Options{.fInvalidCharacterReplacement = o.fInvalidCharacterReplacement};
588 }
589
590 /*
591 ********************************************************************************
592 ******************************* CodeCvt<CHAR_T> ********************************
593 ********************************************************************************
594 */
595 template <IUNICODECanAlwaysConvertTo CHAR_T>
596 inline CodeCvt<CHAR_T>::CodeCvt (const Options& options)
597 : fRep_{make_shared<UTFConvertRep_<char8_t>> (options)} // default, is to serialize to UTF-8
598 {
599 }
600 template <IUNICODECanAlwaysConvertTo CHAR_T>
601 inline CodeCvt<CHAR_T>::CodeCvt (const locale& l, const Options& options)
602 {
604 if constexpr (same_as<CHAR_T, wchar_t>) {
605 *this = mkFromStdCodeCvt<codecvt_byname<wchar_t, char, mbstate_t>> (options, l.name ());
606 }
607 else if constexpr (same_as<CHAR_T, char16_t> or same_as<CHAR_T, char32_t>) {
608 *this = mkFromStdCodeCvt<codecvt_byname<CHAR_T, char8_t, mbstate_t>> (options, l.name ());
609 }
610 else if constexpr (same_as<CHAR_T, Character>) {
611 fRep_ = make_shared<UTF2UTFRep_<char32_t>> (CodeCvt<char32_t>::mkFromStdCodeCvt<codecvt_byname<char32_t, char8_t, mbstate_t>> (
612 CodeCvt<char32_t>::Options::New<CHAR_T> (options), l.name ()));
613 }
614 else {
615 // CHAR_T COULD be UTF-8, but not clear if/why that would be useful.
617 }
618 DISABLE_COMPILER_MSC_WARNING_END (4996)
619 }
620 template <IUNICODECanAlwaysConvertTo CHAR_T>
621 CodeCvt<CHAR_T>::CodeCvt (const Charset& charset, const Options& options)
622 {
623 if (charset == WellKnownCharsets::kISO_8859_1) {
624 fRep_ = make_shared<Latin1ConvertRep_> (options);
625 }
626 else if (charset == WellKnownCharsets::kUTF8) {
627 *this = CodeCvt<CHAR_T>{UnicodeExternalEncodings::eUTF8};
628 }
629 else if (same_as<CHAR_T, Character>) {
631 // best hope is to treat it as a locale name, and hope its found
632 fRep_ = make_shared<UTF2UTFRep_<char32_t>> (CodeCvt<char32_t>::mkFromStdCodeCvt<codecvt_byname<char32_t, char8_t, mbstate_t>> (
633 CodeCvt<char32_t>::Options::New<CHAR_T> (options), charset.AsNarrowSDKString ()));
634 DISABLE_COMPILER_MSC_WARNING_END (4996)
635 }
636 else {
637 Private_::ThrowCharsetNotSupportedException_ (charset);
638 }
639 }
640 template <IUNICODECanAlwaysConvertTo CHAR_T>
641 CodeCvt<CHAR_T>::CodeCvt (UnicodeExternalEncodings e, const Options& options)
642 : fRep_{}
643 {
644 switch (e) {
645 case UnicodeExternalEncodings::eUTF8:
646 fRep_ = make_shared<UTFConvertRep_<char8_t>> (options);
647 break;
648 case UnicodeExternalEncodings::eUTF16_BE:
649 case UnicodeExternalEncodings::eUTF16_LE:
650 if (e == UnicodeExternalEncodings::eUTF16) {
651 fRep_ = make_shared<UTFConvertRep_<char16_t>> (options);
652 }
653 else {
654 fRep_ = make_shared<UTFConvertSwappedRep_<char16_t>> (options);
655 }
656 break;
657 case UnicodeExternalEncodings::eUTF32_BE:
658 case UnicodeExternalEncodings::eUTF32_LE:
659 if (e == UnicodeExternalEncodings::eUTF32) {
660 fRep_ = make_shared<UTFConvertRep_<char32_t>> (options);
661 }
662 else {
663 fRep_ = make_shared<UTFConvertSwappedRep_<char32_t>> (options);
664 }
665 break;
666 default:
668 }
669 }
670 template <IUNICODECanAlwaysConvertTo CHAR_T>
671 CodeCvt<CHAR_T>::CodeCvt (span<const byte>* guessFormatFrom, const optional<CodeCvt>& useElse, const Options& options)
672 : fRep_{}
673 {
674 RequireNotNull (guessFormatFrom);
675 Require (useElse == nullopt or useElse->GetOptions ().fInvalidCharacterReplacement == options.fInvalidCharacterReplacement);
676 if (optional<tuple<UnicodeExternalEncodings, size_t>> r = ReadByteOrderMark (*guessFormatFrom)) {
677 *guessFormatFrom = guessFormatFrom->subspan (get<size_t> (*r));
678 fRep_ = CodeCvt{get<UnicodeExternalEncodings> (*r), options}.fRep_;
679 }
680 else {
681 fRep_ = useElse ? useElse->fRep_ : CodeCvt{options}.fRep_;
682 }
683 }
684 template <IUNICODECanAlwaysConvertTo CHAR_T>
685 CodeCvt<CHAR_T>::CodeCvt (CodePage cp, const Options& options)
686 : fRep_{}
687 {
688 // A few we have builtin table converters for (BuiltinSingleByteTableCodePageRep_);
689 // a few are just UTF, and we can convert those.
690 // On windows, we can delegate to WindowsNative_
691 // else give up and throw not supported code page.
692 switch (cp) {
693 case WellKnownCodePages::kANSI:
694 case WellKnownCodePages::kMAC:
695 case WellKnownCodePages::kPC:
696 case WellKnownCodePages::kPCA:
697 case WellKnownCodePages::kGreek:
698 case WellKnownCodePages::kTurkish:
699 case WellKnownCodePages::kHebrew:
700 case WellKnownCodePages::kArabic:
701 fRep_ = make_shared<UTF2UTFRep_<char16_t>> (
702 CodeCvt<char16_t> (make_shared<Private_::BuiltinSingleByteTableCodePageRep_> (cp, options.fInvalidCharacterReplacement)));
703 break;
704 case WellKnownCodePages::kUTF8:
705 fRep_ = make_shared<UTFConvertRep_<char8_t>> (options);
706 break;
707 case WellKnownCodePages::kUNICODE_WIDE:
708 fRep_ = make_shared<UTFConvertRep_<char16_t>> (options);
709 break;
710 case WellKnownCodePages::kUNICODE_WIDE_BIGENDIAN:
711 fRep_ = make_shared<UTFConvertSwappedRep_<char16_t>> (options);
712 break;
713 default:
714#if qStroika_Foundation_Common_Platform_Windows
715 if (options.fInvalidCharacterReplacement) {
716 Private_::ThrowCodePageNotSupportedException_ (cp); // WindowsNative_ doesn't support fInvalidCharacterReplacement
717 }
718 fRep_ = make_shared<UTF2UTFRep_<char16_t>> (CodeCvt<char16_t> (make_shared<Private_::WindowsNative_> (cp)));
719 break;
720#else
721 Private_::ThrowCodePageNotSupportedException_ (cp);
722#endif
723 }
724 }
725 template <IUNICODECanAlwaysConvertTo CHAR_T>
726 template <IUNICODECanAlwaysConvertTo INTERMEDIATE_CHAR_T>
727 inline CodeCvt<CHAR_T>::CodeCvt (const CodeCvt<INTERMEDIATE_CHAR_T>& basedOn)
728 : fRep_{make_shared<UTF2UTFRep_<INTERMEDIATE_CHAR_T>> (basedOn)}
729 {
730 }
731 template <IUNICODECanAlwaysConvertTo CHAR_T>
732 inline CodeCvt<CHAR_T>::CodeCvt (const shared_ptr<IRep>& rep)
733 : fRep_{(RequireExpression (rep != nullptr), rep)}
734 {
735 }
736 template <IUNICODECanAlwaysConvertTo CHAR_T>
737 template <IStdCodeCVT STD_CODECVT, typename... ARGS>
738 inline CodeCvt<CHAR_T> CodeCvt<CHAR_T>::mkFromStdCodeCvt (const Options& options, ARGS... args)
739 requires (same_as<CHAR_T, typename STD_CODECVT::intern_type>)
740 {
741 auto u = make_unique<Private_::deletable_facet_<STD_CODECVT>> (forward<ARGS> (args)...);
742 return CodeCvt<CHAR_T>{make_shared<CodeCvt_WrapStdCodeCvt_<Private_::deletable_facet_<STD_CODECVT>>> (options, move (u))};
743 }
744 template <IUNICODECanAlwaysConvertTo CHAR_T>
745 inline auto CodeCvt<CHAR_T>::GetOptions () const -> Options
746 {
747 return fRep_->GetOptions ();
748 }
749 template <IUNICODECanAlwaysConvertTo CHAR_T>
750 inline auto CodeCvt<CHAR_T>::Bytes2Characters (span<const byte> from) const -> size_t
751 {
752 Memory::StackBuffer<CHAR_T> to{ComputeTargetCharacterBufferSize (from)};
753 return fRep_->Bytes2Characters (&from, span{to}).size ();
754 }
755 template <IUNICODECanAlwaysConvertTo CHAR_T>
756 inline auto CodeCvt<CHAR_T>::Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const -> span<CHAR_T>
757 {
758 RequireNotNull (from);
759 AssertNotNull (fRep_);
760 Require (to.size () >= ComputeTargetCharacterBufferSize (*from) or to.size () >= Bytes2Characters (*from)); // ComputeTargetCharacterBufferSize cheaper to compute
761 auto r = fRep_->Bytes2Characters (from, to);
762 Ensure (from->size () < 10); // can only contain bytes for a partial character so must be small, typically one or two or zero
763 WeakAssert (from->size () <= 2);
764 return r;
765 }
766 template <IUNICODECanAlwaysConvertTo CHAR_T>
767 inline auto CodeCvt<CHAR_T>::Bytes2Characters (span<const byte> from, span<CHAR_T> to) const -> span<CHAR_T>
768 {
769 AssertNotNull (fRep_);
770 Require (to.size () >= ComputeTargetCharacterBufferSize (from) or to.size () >= Bytes2Characters (from)); // ComputeTargetCharacterBufferSize cheaper to compute
771 size_t origSize = from.size ();
772 auto result = fRep_->Bytes2Characters (&from, to);
773 if (not from.empty ()) {
774 Private_::ThrowErrorConvertingBytes2Characters_ (origSize - from.size ());
775 }
776 return result;
777 }
778 template <IUNICODECanAlwaysConvertTo CHAR_T>
779 inline auto CodeCvt<CHAR_T>::Characters2Bytes (span<const CHAR_T> from) const -> size_t
780 {
781 Memory::StackBuffer<byte> to{ComputeTargetByteBufferSize (from)};
782 return fRep_->Characters2Bytes (from, span{to}).size ();
783 }
784 template <IUNICODECanAlwaysConvertTo CHAR_T>
785 inline auto CodeCvt<CHAR_T>::Characters2Bytes (span<const CHAR_T> from, span<byte> to) const -> span<byte>
786 {
787 AssertNotNull (fRep_);
788 Require (to.size () >= ComputeTargetByteBufferSize (from) or to.size () >= Characters2Bytes (from)); // ComputeTargetByteBufferSize cheaper to compute
789 return fRep_->Characters2Bytes (from, to);
790 }
791 template <IUNICODECanAlwaysConvertTo CHAR_T>
792 inline size_t CodeCvt<CHAR_T>::ComputeTargetCharacterBufferSize (span<const byte> src) const
793 {
794 return fRep_->ComputeTargetCharacterBufferSize (src);
795 }
796 template <IUNICODECanAlwaysConvertTo CHAR_T>
797 inline size_t CodeCvt<CHAR_T>::ComputeTargetCharacterBufferSize (size_t srcSize) const
798 {
799 return fRep_->ComputeTargetCharacterBufferSize (srcSize);
800 }
801 template <IUNICODECanAlwaysConvertTo CHAR_T>
802 inline size_t CodeCvt<CHAR_T>::ComputeTargetByteBufferSize (span<const CHAR_T> src) const
803 {
804 return fRep_->ComputeTargetByteBufferSize (src);
805 }
806 template <IUNICODECanAlwaysConvertTo CHAR_T>
807 inline size_t CodeCvt<CHAR_T>::ComputeTargetByteBufferSize (size_t srcSize) const
808 {
809 return fRep_->ComputeTargetByteBufferSize (srcSize);
810 }
811 template <IUNICODECanAlwaysConvertTo CHAR_T>
812 template <constructible_from<const CHAR_T*, const CHAR_T*> STRINGISH>
813 STRINGISH CodeCvt<CHAR_T>::Bytes2String (span<const byte> from) const
814 {
815 size_t origSize = from.size ();
816 Memory::StackBuffer<CHAR_T> buf{this->ComputeTargetCharacterBufferSize (from)};
817 span<CHAR_T> r = this->Bytes2Characters (&from, span{buf});
818 if (not from.empty ()) {
819 Private_::ThrowErrorConvertingBytes2Characters_ (origSize - from.size ());
820 }
821 return STRINGISH{r.data (), r.data () + r.size ()};
822 }
823 template <IUNICODECanAlwaysConvertTo CHAR_T>
824 template <constructible_from<const byte*, const byte*> BLOBISH>
825 BLOBISH CodeCvt<CHAR_T>::String2Bytes (span<const CHAR_T> from) const
826 {
827 Memory::StackBuffer<byte> buf{Memory::eUninitialized, this->ComputeTargetByteBufferSize (from)};
828 const span<const byte> r = this->Characters2Bytes (from, span{buf});
829 if constexpr (same_as<BLOBISH, string>) {
830 return string{reinterpret_cast<const char*> (r.data ()), reinterpret_cast<const char*> (r.data ()) + r.size ()};
831 }
832 else {
833 return BLOBISH{r.data (), r.data () + r.size ()};
834 }
835 }
836
837}
#define AssertNotNull(p)
Definition Assertions.h:333
#define AssertNotImplemented()
Definition Assertions.h:401
#define RequireNotReached()
Definition Assertions.h:385
#define RequireNotNull(p)
Definition Assertions.h:347
#define WeakAssert(c)
A WeakAssert() is for things that aren't guaranteed to be true, but are overwhelmingly likely to be true.
Definition Assertions.h:438
#define RequireExpression(c)
Definition Assertions.h:267
#define AssertNotReached()
Definition Assertions.h:355
static CodeCvt mkFromStdCodeCvt(const Options &options={}, ARGS... args)
nonvirtual STRINGISH Bytes2String(span< const byte > from) const
nonvirtual size_t Bytes2Characters(span< const byte > from) const
convert span byte (external serialized format) parameters to characters (like std::codecvt<>::in () -...
Definition CodeCvt.inl:750
CodeCvt(const Options &options=Options{})
Definition CodeCvt.inl:596
nonvirtual BLOBISH String2Bytes(span< const CHAR_T > from) const
Logically halfway between std::array and std::vector; Smart 'direct memory array' - which when needed...
constexpr optional< tuple< UnicodeExternalEncodings, size_t > > ReadByteOrderMark(span< const byte > d) noexcept
UnicodeExternalEncodings
list of external UNICODE character encodings, for file IO (eDEFAULT = eUTF8)
Definition UTFConvert.h:31