Stroika Library 3.0d21
 
Loading...
Searching...
No Matches
CodeCvt.inl
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include <bit>
5
11
13
14 namespace Private_ {
15 // crazy - from https://en.cppreference.com/w/cpp/locale/codecvt
16 DISABLE_COMPILER_MSC_WARNING_START (4996)
17 template <typename FACET>
18 struct deletable_facet_ : FACET {
19 template <typename... Args>
20 deletable_facet_ (Args&&... args)
21 : FACET{forward<Args> (args)...}
22 {
23 }
24 ~deletable_facet_ () = default;
25 };
26 DISABLE_COMPILER_MSC_WARNING_END (4996)
27 void ThrowErrorConvertingBytes2Characters_ (size_t nSrcCharsWhereError);
28 void ThrowErrorConvertingCharacters2Bytes_ (size_t nSrcCharsWhereError);
29 void ThrowCodePageNotSupportedException_ (CodePage cp);
30 void ThrowCharsetNotSupportedException_ (const Charset& charset);
31 void ThrowInvalidCharacterProvidedDoesntFitWithProvidedCodeCvt_ ();
32 }
33
34 /*
35 ********************************************************************************
36 ***************************** CodeCvt<CHAR_T>::IRep ****************************
37 ********************************************************************************
38 */
39 template <IUNICODECanAlwaysConvertTo CHAR_T>
40 size_t CodeCvt<CHAR_T>::IRep::_Bytes2Characters (span<const byte> from) const
41 {
42 Memory::StackBuffer<CHAR_T> to{this->ComputeTargetCharacterBufferSize (from)};
43 return this->Bytes2Characters (&from, span{to}).size ();
44 }
45 template <IUNICODECanAlwaysConvertTo CHAR_T>
46 size_t CodeCvt<CHAR_T>::IRep::_Characters2Bytes (span<const CHAR_T> from) const
47 {
48 Memory::StackBuffer<byte> to{this->ComputeTargetByteBufferSize (from)};
49 return this->Characters2Bytes (from, span{to}).size ();
50 }
51
52 /*
53 ********************************************************************************
54 *********************** CodeCvt<CHAR_T>::UTFConvertRep_ ************************
55 ********************************************************************************
56 */
57 template <IUNICODECanAlwaysConvertTo CHAR_T>
58#if qCompilerAndStdLib_template_second_concept_Buggy
59 template <typename SERIALIZED_CHAR_T>
60#else
61 template <IUNICODECanAlwaysConvertTo SERIALIZED_CHAR_T>
62#endif
63 struct CodeCvt<CHAR_T>::UTFConvertRep_ : CodeCvt<CHAR_T>::IRep {
64 using ConversionResult = UTFConvert::ConversionResult;
65 using ConversionResultWithStatus = UTFConvert::ConversionResultWithStatus;
66 using ConversionStatusFlag = UTFConvert::ConversionStatusFlag;
67 UTFConvertRep_ (const Options& o)
68 : fCodeConverter_{o.fInvalidCharacterReplacement
69 ? UTFConvert{UTFConvert::Options{.fInvalidCharacterReplacement = *o.fInvalidCharacterReplacement}}
70 : UTFConvert::kThe}
71 {
72 }
73 virtual Options GetOptions () const override
74 {
75 return Options{.fInvalidCharacterReplacement = fCodeConverter_.GetOptions ().fInvalidCharacterReplacement};
76 }
77 virtual span<CHAR_T> Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const override
78 {
79 RequireNotNull (from);
80 Require (to.size () >= ComputeTargetCharacterBufferSize (*from));
81 span<const SERIALIZED_CHAR_T> serializedFrom = ReinterpretBytes_ (*from);
82 Assert (serializedFrom.size_bytes () <= from->size ()); // note - serializedFrom could be smaller than from in byte-span
83 ConversionResultWithStatus r = fCodeConverter_.ConvertQuietly (serializedFrom, to);
84 if (r.fStatus == ConversionStatusFlag::sourceIllegal) {
85 UTFConvert::Throw (r.fStatus, r.fSourceConsumed);
86 }
87 *from = from->subspan (r.fSourceConsumed * sizeof (SERIALIZED_CHAR_T)); // from updated to remaining data, if any
88 return to.subspan (0, r.fTargetProduced); // point ACTUAL copied data
89 }
90 virtual span<byte> Characters2Bytes (span<const CHAR_T> from, span<byte> to) const override
91 {
92 Require (to.size () >= ComputeTargetByteBufferSize (from));
93 span<SERIALIZED_CHAR_T> serializedTo = ReinterpretBytes_ (to);
94 ConversionResult r = fCodeConverter_.Convert (from, serializedTo); // cannot have sourceExhausted here so no need to call ConvertQuietly
95 Require (r.fSourceConsumed == from.size ()); // always use all input characters
96 return to.subspan (0, r.fTargetProduced * sizeof (SERIALIZED_CHAR_T)); // point ACTUAL copied data
97 }
98 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override
99 {
100 if (const size_t* i = get_if<size_t> (&src)) {
101 return UTFConvert::ComputeTargetBufferSize<CHAR_T, SERIALIZED_CHAR_T> (*i / sizeof (SERIALIZED_CHAR_T));
102 }
103 else {
104 return UTFConvert::ComputeTargetBufferSize<CHAR_T> (ReinterpretBytes_ (get<span<const byte>> (src)));
105 }
106 }
107 virtual size_t ComputeTargetByteBufferSize (variant<span<const CHAR_T>, size_t> src) const override
108 {
109 if (const size_t* i = get_if<size_t> (&src)) {
110 return UTFConvert::ComputeTargetBufferSize<SERIALIZED_CHAR_T, CHAR_T> (*i) * sizeof (SERIALIZED_CHAR_T);
111 }
112 else {
113 return UTFConvert::ComputeTargetBufferSize<SERIALIZED_CHAR_T> (get<span<const CHAR_T>> (src)) * sizeof (SERIALIZED_CHAR_T);
114 }
115 }
116 /*
117 * essentially 'cast' from bytes to from SERIALIZED_CHAR_T (could be char8_t, char16_t or whatever works with UTFConvert)
118 */
119 static span<const SERIALIZED_CHAR_T> ReinterpretBytes_ (span<const byte> s)
120 {
121 return span<const SERIALIZED_CHAR_T>{reinterpret_cast<const SERIALIZED_CHAR_T*> (s.data ()), s.size () / sizeof (SERIALIZED_CHAR_T)};
122 }
123 static span<SERIALIZED_CHAR_T> ReinterpretBytes_ (span<byte> s)
124 {
125 return span<SERIALIZED_CHAR_T>{reinterpret_cast<SERIALIZED_CHAR_T*> (s.data ()), s.size () / sizeof (SERIALIZED_CHAR_T)};
126 }
127 UTFConvert fCodeConverter_;
128 };
129
130 /*
131 ********************************************************************************
132 ********************* CodeCvt<CHAR_T>::Latin1ConvertRep_ ***********************
133 ********************************************************************************
134 */
135 template <IUNICODECanAlwaysConvertTo CHAR_T>
136 struct CodeCvt<CHAR_T>::Latin1ConvertRep_ : CodeCvt<CHAR_T>::IRep {
137 using ConversionResult = UTFConvert::ConversionResult;
138 using ConversionResultWithStatus = UTFConvert::ConversionResultWithStatus;
139 using ConversionStatusFlag = UTFConvert::ConversionStatusFlag;
140 Latin1ConvertRep_ (const Options& o)
141 : fCodeConverter_{o.fInvalidCharacterReplacement
142 ? UTFConvert{UTFConvert::Options{.fInvalidCharacterReplacement = *o.fInvalidCharacterReplacement}}
143 : UTFConvert::kThe}
144 {
145 }
146 virtual Options GetOptions () const override
147 {
148 return Options{.fInvalidCharacterReplacement = fCodeConverter_.GetOptions ().fInvalidCharacterReplacement};
149 }
150 virtual span<CHAR_T> Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const override
151 {
152 RequireNotNull (from);
153 Require (to.size () >= ComputeTargetCharacterBufferSize (*from));
154 span<const Latin1> serializedFrom = ReinterpretBytes_ (*from);
155 Assert (serializedFrom.size_bytes () <= from->size ()); // note - serializedFrom could be smaller than from in bytespan
156 ConversionResultWithStatus r = fCodeConverter_.ConvertQuietly (serializedFrom, to);
157 if (r.fStatus == ConversionStatusFlag::sourceIllegal) {
158 UTFConvert::Throw (r.fStatus, r.fSourceConsumed);
159 }
160 *from = from->subspan (r.fSourceConsumed); // from updated to remaining data, if any
161 return to.subspan (0, r.fTargetProduced); // point ACTUAL copied data
162 }
163 virtual span<byte> Characters2Bytes ([[maybe_unused]] span<const CHAR_T> from, [[maybe_unused]] span<byte> to) const override
164 {
165 RequireNotReached (); // doesn't work in general, so disallow
166 return span<byte>{};
167 }
168 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override
169 {
170 if (const size_t* i = get_if<size_t> (&src)) {
171 return UTFConvert::ComputeTargetBufferSize<CHAR_T, Latin1> (*i / sizeof (Latin1));
172 }
173 else {
174 return UTFConvert::ComputeTargetBufferSize<CHAR_T> (ReinterpretBytes_ (get<span<const byte>> (src)));
175 }
176 }
177 virtual size_t ComputeTargetByteBufferSize ([[maybe_unused]] variant<span<const CHAR_T>, size_t> src) const override
178 {
179 RequireNotReached (); // doesn't work in general, so disallow
180 return 0;
181 }
182 /*
183 * essentially 'cast' from bytes to from Latin1 (could be char8_t, char16_t or whatever works with UTFConvert)
184 */
185 static span<const Latin1> ReinterpretBytes_ (span<const byte> s)
186 {
187 return span<const Latin1>{reinterpret_cast<const Latin1*> (s.data ()), s.size () / sizeof (Latin1)};
188 }
189 static span<Latin1> ReinterpretBytes_ (span<byte> s)
190 {
191 return span<Latin1>{reinterpret_cast<Latin1*> (s.data ()), s.size () / sizeof (Latin1)};
192 }
193 UTFConvert fCodeConverter_;
194 };
195
196 /*
197 ********************************************************************************
198 ****************** CodeCvt<CHAR_T>::UTFConvertSwappedRep_ **********************
199 ********************************************************************************
200 */
201 template <IUNICODECanAlwaysConvertTo CHAR_T>
202#if qCompilerAndStdLib_template_second_concept_Buggy
203 template <typename SERIALIZED_CHAR_T>
204#else
205 template <IUNICODECanAlwaysConvertTo SERIALIZED_CHAR_T>
206#endif
207 struct CodeCvt<CHAR_T>::UTFConvertSwappedRep_ : UTFConvertRep_<SERIALIZED_CHAR_T> {
208 using inherited = UTFConvertRep_<SERIALIZED_CHAR_T>;
209 UTFConvertSwappedRep_ (const Options& o)
210 : inherited{o}
211 {
212 }
213 virtual span<CHAR_T> Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const override
214 {
215 RequireNotNull (from);
216 Require (to.size () >= this->ComputeTargetCharacterBufferSize (*from));
217 auto r = inherited::Bytes2Characters (from, to);
218 for (CHAR_T& i : to) {
219 if constexpr (same_as<CHAR_T, Character>) {
220 i = Character{Common::StdCompat::byteswap (i.template As<char32_t> ())};
221 }
222 else {
223 i = Common::StdCompat::byteswap (i);
224 }
225 }
226 return r;
227 }
228 virtual span<byte> Characters2Bytes (span<const CHAR_T> from, span<byte> to) const override
229 {
230 Require (to.size () >= this->ComputeTargetByteBufferSize (from));
231 Memory::StackBuffer<CHAR_T> buf{from};
232 for (CHAR_T& i : buf) {
233 if constexpr (same_as<CHAR_T, Character>) {
234 i = Character{Common::StdCompat::byteswap (i.template As<char32_t> ())};
235 }
236 else {
237 i = Common::StdCompat::byteswap (i);
238 }
239 }
240 return inherited::Characters2Bytes (span<const CHAR_T>{buf.begin (), buf.size ()}, to);
241 }
242 };
243
244 /*
245 ********************************************************************************
246 *********************** CodeCvt<CHAR_T>::UTF2UTFRep_ ***************************
247 ********************************************************************************
248 */
249 /*
250 * Utility rep to wrap some kind of rep along with (optional) UTFConvert, to complete
251 * conversion from bytes to/from desired rep generally through some intermediary rep.
252 *
253 * NOTE - this code allows INTERMEDIATE_CHAR_T == CHAR_T special case, and is optimized to do
254 * nothing for that case (or should be - maybe needs a bit more tweaking of implementation for that to be fully true).
255 */
256 template <IUNICODECanAlwaysConvertTo CHAR_T>
257#if qCompilerAndStdLib_template_second_concept_Buggy
258 template <typename INTERMEDIATE_CHAR_T>
259#else
260 template <IUNICODECanAlwaysConvertTo INTERMEDIATE_CHAR_T>
261#endif
262 struct CodeCvt<CHAR_T>::UTF2UTFRep_ : CodeCvt<CHAR_T>::IRep {
263 using ConversionResultWithStatus = UTFConvert::ConversionResultWithStatus;
264 using ConversionStatusFlag = UTFConvert::ConversionStatusFlag;
265 UTF2UTFRep_ (const CodeCvt<INTERMEDIATE_CHAR_T>& origCodeCvt)
266 requires (sizeof (CHAR_T) == sizeof (INTERMEDIATE_CHAR_T))
267 : fBytesVSIntermediateCvt_{origCodeCvt}
268 {
269 }
270 UTF2UTFRep_ (const CodeCvt<INTERMEDIATE_CHAR_T>& origCodeCvt, const UTFConvert& secondStep = {})
271 requires (sizeof (CHAR_T) != sizeof (INTERMEDIATE_CHAR_T))
272 : fBytesVSIntermediateCvt_{origCodeCvt}
273 , fIntermediateVSFinalCHARCvt_{secondStep}
274 {
275 }
276 virtual Options GetOptions () const override
277 {
278 // Not 100% right cuz ignoring - fIntermediateVSFinalCHARCvt_ - LGP - 2023-08-07
279 return Options{.fInvalidCharacterReplacement = fBytesVSIntermediateCvt_.GetOptions ().fInvalidCharacterReplacement};
280 }
281 virtual span<CHAR_T> Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const override
282 {
283 RequireNotNull (from);
284 Require (to.size () >= ComputeTargetCharacterBufferSize (*from) or to.size () >= this->_Bytes2Characters (*from));
285 if constexpr (sizeof (CHAR_T) == sizeof (INTERMEDIATE_CHAR_T)) {
286 return span<CHAR_T>{to.begin (),
287 fBytesVSIntermediateCvt_.Bytes2Characters (from, Memory::SpanBytesCast<span<INTERMEDIATE_CHAR_T>> (to)).size ()};
288 }
289 else {
290 /*
291 * Big picture: fBytesVSIntermediateCvt_ goes bytes -> INTERMEDIATE_CHAR_T, so we use it first.
292 *
293 * BUT - trick - even if we successfully do first conversion (bytes -> INTERMEDIATE_CHAR_T) - we might still get a split
294 * char on the second conversion (RARE). If so - we need to backup in 'from' - to avoid this. Just allege we consumed less. This MIGHT -
295 * in extreme cases - go all the way back to zero.
296 */
297 while (true) {
298 // Because we KNOW everything will fit (disallow target exhausted), we can allocate a temporary buffer for the intermediate state, and be done with
299 // it by the end of this routine (stay stateless)
300 Memory::StackBuffer<INTERMEDIATE_CHAR_T> intermediateBuf{fBytesVSIntermediateCvt_.ComputeTargetCharacterBufferSize (*from)};
301 span<const INTERMEDIATE_CHAR_T> intermediateSpan = fBytesVSIntermediateCvt_.Bytes2Characters (from, intermediateBuf); // shortens 'from' if needed
302
303 // then use fIntermediateVSFinalCHARCvt_ to perform final mapping INTERMEDIATE_CHAR_T -> CHAR_T
304 ConversionResultWithStatus cr = fIntermediateVSFinalCHARCvt_.ConvertQuietly (intermediateSpan, to);
305 switch (cr.fStatus) {
306 case ConversionStatusFlag::sourceIllegal:
307 UTFConvert::Throw (cr.fStatus, cr.fSourceConsumed);
308 case ConversionStatusFlag::sourceExhausted:
309 // TRICKY - if we have at least one character output, then we need to back out bytes 'from' - til this doesn't happen
310 if (not from->empty ()) {
311 *from = from->subspan (0, from->size () - 1);
312 continue; // 'goto try again'
313 }
314 else {
315 return span<CHAR_T>{}; // no update to 'from' since we consumed no characters
316 }
317 case ConversionStatusFlag::ok:
318 return to.subspan (0, cr.fTargetProduced);
319 default:
321 return span<CHAR_T>{};
322 }
323 }
324 }
325 }
326 virtual span<byte> Characters2Bytes (span<const CHAR_T> from, span<byte> to) const override
327 {
328 Require (to.size () >= ComputeTargetByteBufferSize (from) or to.size () >= this->_Characters2Bytes (from));
329 if constexpr (sizeof (CHAR_T) == sizeof (INTERMEDIATE_CHAR_T)) {
330 return fBytesVSIntermediateCvt_.Characters2Bytes (Memory::SpanBytesCast<span<const INTERMEDIATE_CHAR_T>> (from), to);
331 }
332 else {
333 /*
334 * Because we KNOW everything will fit, we can allocate a temporary buffer for the intermediate state, and be done with
335 * it by the end of this routine (stay stateless)
336 */
337 Memory::StackBuffer<INTERMEDIATE_CHAR_T> intermediateBuf{
338 fIntermediateVSFinalCHARCvt_.template ComputeTargetBufferSize<INTERMEDIATE_CHAR_T> (from)};
339
340 /*
341 * first translate to something usable by fBytesVSIntermediateCvt_
342 */
343 span<INTERMEDIATE_CHAR_T> intermediateSpan =
344 fIntermediateVSFinalCHARCvt_.ConvertSpan (from, span<INTERMEDIATE_CHAR_T>{intermediateBuf.data (), intermediateBuf.size ()});
345
346 // Then use fBytesVSIntermediateCvt_, no need to track anything in intermediateBuf, we require all used, no partials etc.
347 return fBytesVSIntermediateCvt_.Characters2Bytes (intermediateSpan, to);
348 }
349 }
350 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override
351 {
352 size_t intermediateCharCntMax = [&] () {
353 if (const size_t* i = get_if<size_t> (&src)) {
354 return fBytesVSIntermediateCvt_.ComputeTargetCharacterBufferSize (*i);
355 }
356 else {
357 return fBytesVSIntermediateCvt_.ComputeTargetCharacterBufferSize (get<span<const byte>> (src));
358 }
359 }();
360 if constexpr (sizeof (CHAR_T) == sizeof (INTERMEDIATE_CHAR_T)) {
361 return intermediateCharCntMax;
362 }
363 else {
364 return fIntermediateVSFinalCHARCvt_.template ComputeTargetBufferSize<INTERMEDIATE_CHAR_T, CHAR_T> (intermediateCharCntMax);
365 }
366 }
367 virtual size_t ComputeTargetByteBufferSize (variant<span<const CHAR_T>, size_t> src) const override
368 {
369 size_t intermediateCharCntMax = [&] () {
370 if constexpr (sizeof (CHAR_T) == sizeof (INTERMEDIATE_CHAR_T)) {
371 if (const size_t* i = get_if<size_t> (&src)) {
372 return *i;
373 }
374 else {
375 return get<span<const CHAR_T>> (src).size ();
376 }
377 }
378 else {
379 if (const size_t* i = get_if<size_t> (&src)) {
380 return fIntermediateVSFinalCHARCvt_.template ComputeTargetBufferSize<INTERMEDIATE_CHAR_T, CHAR_T> (*i);
381 }
382 else {
383 return fIntermediateVSFinalCHARCvt_.template ComputeTargetBufferSize<INTERMEDIATE_CHAR_T> (get<span<const CHAR_T>> (src));
384 }
385 }
386 }();
387 return fBytesVSIntermediateCvt_.ComputeTargetByteBufferSize (intermediateCharCntMax);
388 }
389 CodeCvt<INTERMEDIATE_CHAR_T> fBytesVSIntermediateCvt_;
390 conditional_t<sizeof (CHAR_T) != sizeof (INTERMEDIATE_CHAR_T), UTFConvert, byte> fIntermediateVSFinalCHARCvt_; // would like to remove field if sizeof ==, but not sure how (void doesnt work)
391 };
392
393 /*
394 * This is crazy complicated because codecvt goes out of its way to be hard to copy, hard to move, but with
395 * a little care, can be made to work with unique_ptr.
396 *
397 * Also, std::codecvt doesn't natively support fInvalidCharacterReplacement, so we have to support manually.
398 */
399 template <IUNICODECanAlwaysConvertTo CHAR_T>
400 template <typename STD_CODE_CVT_T>
401 struct CodeCvt<CHAR_T>::CodeCvt_WrapStdCodeCvt_ : CodeCvt<CHAR_T>::IRep {
402 unique_ptr<STD_CODE_CVT_T> fCodeCvt_;
403 optional<Character> fInvalidCharacterReplacement_;
404 optional<span<byte>> fInvalidCharacterReplacementBytes_;
405 using extern_type = typename STD_CODE_CVT_T::extern_type;
406 extern_type fInvalidCharacterReplacementBytesBuf[8]; // WAG at sufficient size, but sb enuf
407 static_assert (same_as<CHAR_T, typename STD_CODE_CVT_T::intern_type>);
408#if qCompilerAndStdLib_arm_asan_FaultStackUseAfterScope_Buggy
409 Stroika_Foundation_Debug_ATTRIBUTE_NO_SANITIZE_ADDRESS
410#endif
411 CodeCvt_WrapStdCodeCvt_ (const Options& options, unique_ptr<STD_CODE_CVT_T>&& codeCvt)
412 : fCodeCvt_{move (codeCvt)}
413 , fInvalidCharacterReplacement_{options.fInvalidCharacterReplacement}
414 {
415 if (fInvalidCharacterReplacement_) {
416 mbstate_t ignoredMBState{};
417 Memory::StackBuffer<CHAR_T> tmpBuf;
418 span<const CHAR_T> invalCharPartlyEncode = fInvalidCharacterReplacement_->As<CHAR_T> (&tmpBuf);
419 const CHAR_T* ignoreCharsConsumed = nullptr;
420 extern_type* bytesInvalChar = fInvalidCharacterReplacementBytesBuf;
421 DISABLE_COMPILER_MSC_WARNING_START (4996)
422 auto r = fCodeCvt_->out (ignoredMBState, invalCharPartlyEncode.data (),
423 invalCharPartlyEncode.data () + invalCharPartlyEncode.size (), ignoreCharsConsumed,
424 fInvalidCharacterReplacementBytesBuf,
425 fInvalidCharacterReplacementBytesBuf + size (fInvalidCharacterReplacementBytesBuf), bytesInvalChar);
426 DISABLE_COMPILER_MSC_WARNING_END (4996)
427 if (r == STD_CODE_CVT_T::ok) {
428 fInvalidCharacterReplacementBytes_ = as_writable_bytes (
429 span{fInvalidCharacterReplacementBytesBuf}.subspan (0, bytesInvalChar - fInvalidCharacterReplacementBytesBuf));
430 }
431 else {
432 Private_::ThrowInvalidCharacterProvidedDoesntFitWithProvidedCodeCvt_ ();
433 }
434 }
435 }
436 virtual Options GetOptions () const override
437 {
438 return Options{.fInvalidCharacterReplacement = fInvalidCharacterReplacement_};
439 }
440 virtual span<CHAR_T> Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const override
441 {
442 RequireNotNull (from);
443 Require (to.size () >= ComputeTargetCharacterBufferSize (*from));
444 const extern_type* _First1 = reinterpret_cast<const extern_type*> (from->data ());
445 const extern_type* _Last1 = _First1 + from->size ();
446 const extern_type* _Mid1 = _First1; // DOUBLE CHECK SPEC - NOT SURE IF THIS IS USED ON INPUT
447 CHAR_T* _First2 = to.data ();
448 CHAR_T* _Last2 = _First2 + to.size ();
449 CHAR_T* _Mid2 = _First2; // DOUBLE CHECK SPEC - NOT SURE IF THIS IS USED ON INPUT
450 mbstate_t ignoredMBState{};
451 size_t bytesDone = 0;
452 size_t charsDone = 0;
453 continueWith:
454 auto r = fCodeCvt_->in (ignoredMBState, _First1 + bytesDone, _Last1, _Mid1, _First2 + charsDone, _Last2, _Mid2);
455 if (r == STD_CODE_CVT_T::partial) {
456 *from = from->subspan (charsDone + static_cast<size_t> (_Mid2 - _First2)); // reference remaining bytes, could be partial character at end of multibyte sequence
457 Assert (from->size () != 0);
458 }
459 else if (r != STD_CODE_CVT_T::ok) {
460 if (fInvalidCharacterReplacement_) {
461 bytesDone = _Mid1 - _First1 + 1; // skip one byte and try again (no idea how many bytes would have been best to skip)
462 charsDone = _Mid2 - _First2;
463
464 Memory::StackBuffer<CHAR_T> badCharTmpBuf;
465 span<const CHAR_T> badCharReplaceSpan = fInvalidCharacterReplacement_->As<CHAR_T> (&badCharTmpBuf);
466 span<CHAR_T> copied = Memory::CopyBytes (badCharReplaceSpan, span{&_First2[charsDone], _Last2});
467 Assert (copied.size () >= 0);
468 charsDone += copied.size ();
469 Assert (charsDone <= to.size ());
470 goto continueWith;
471 }
472 else {
473 Private_::ThrowErrorConvertingBytes2Characters_ (_Mid1 - _First1);
474 }
475 }
476 else {
477 Require (_Mid1 == _Last1);
478 *from = span<const byte>{}; // used all input
479 }
480 return to.subspan (0, _Mid2 - _First2); // point ACTUAL copied data
481 }
482 virtual span<byte> Characters2Bytes (span<const CHAR_T> from, span<byte> to) const override
483 {
484 Require (to.size () >= ComputeTargetByteBufferSize (from));
485 const CHAR_T* _First1 = from.data ();
486 const CHAR_T* _Last1 = _First1 + from.size ();
487 const CHAR_T* _Mid1 = _First1; // DOUBLE CHECK SPEC - NOT SURE IF THIS IS USED ON INPUT
488 extern_type* _First2 = reinterpret_cast<extern_type*> (to.data ());
489 extern_type* _Last2 = _First2 + to.size ();
490 extern_type* _Mid2 = _First2; // DOUBLE CHECK SPEC - NOT SURE IF THIS IS USED ON INPUT
491 mbstate_t ignoredMBState{};
492 size_t charsDone = 0;
493 size_t bytesDone = 0;
494 continueWith:
495 auto r = fCodeCvt_->out (ignoredMBState, _First1 + charsDone, _Last1, _Mid1, _First2 + bytesDone, _Last2, _Mid2);
496 if (r != STD_CODE_CVT_T::ok) {
497 if (fInvalidCharacterReplacement_) {
498 charsDone = _Mid1 - _First1 + 1; // skip one character and try again
499 bytesDone = _Mid2 - _First2;
500 memcpy (_First2 + bytesDone, fInvalidCharacterReplacementBytes_->data (), fInvalidCharacterReplacementBytes_->size ());
501 bytesDone += fInvalidCharacterReplacementBytes_->size ();
502 goto continueWith;
503 }
504 else {
505 Private_::ThrowErrorConvertingCharacters2Bytes_ (_Mid1 - _First1);
506 }
507 }
508 Require (_Mid1 == _Last1); // used all input
509 return to.subspan (0, _Mid2 - _First2); // point ACTUAL copied data
510 }
511 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override
512 {
513 // at most one character per byte, and std::codecvt doesn't appear to offer API to compute better
514 if (const size_t* i = get_if<size_t> (&src)) {
515 return *i;
516 }
517 else {
518 return get<span<const byte>> (src).size ();
519 }
520 }
521 virtual size_t ComputeTargetByteBufferSize (variant<span<const CHAR_T>, size_t> src) const override
522 {
523 if (const size_t* i = get_if<size_t> (&src)) {
524 return (*i) * fCodeCvt_->max_length ();
525 }
526 else {
527 // std::codecvt doesn't appear to provide an API to compute needed buffer length (just the reverse -
528 // for a buffer length, how many bytes consumed).
529 return get<span<const CHAR_T>> (src).size () * fCodeCvt_->max_length ();
530 }
531 }
532 };
533
534 namespace Private_ {
535 // a lot of old, important character sets can be represented this way (like old PC character sets for non-asian languages)
536 struct BuiltinSingleByteTableCodePageRep_ final : CodeCvt<char16_t>::IRep {
537 BuiltinSingleByteTableCodePageRep_ (CodePage cp, optional<Character> invalidCharacterReplacement);
538 virtual ~BuiltinSingleByteTableCodePageRep_ () = default;
539 virtual CodeCvt<char16_t>::Options GetOptions () const override
540 {
541 optional<char16_t> invalRepChar;
542 if (fInvalidCharacterReplacementByte_ != nullopt) {
543 char16_t x;
544 auto byteSpan = span{&*fInvalidCharacterReplacementByte_, 1};
545 (void)this->Bytes2Characters (&byteSpan, span{&x, 1});
546 invalRepChar = x;
547 }
548 return CodeCvt<char16_t>::Options{.fInvalidCharacterReplacement = invalRepChar};
549 }
550 virtual span<char16_t> Bytes2Characters (span<const byte>* from, span<char16_t> to) const override;
551 virtual span<byte> Characters2Bytes (span<const char16_t> from, span<byte> to) const override;
552 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override;
553 virtual size_t ComputeTargetByteBufferSize (variant<span<const char16_t>, size_t> src) const override;
554 const char16_t* fMap_;
555 optional<byte> fInvalidCharacterReplacementByte_;
556 };
557#if qStroika_Foundation_Common_Platform_Windows
558 struct WindowsNative_ final : CodeCvt<char16_t>::IRep {
559 constexpr WindowsNative_ (CodePage cp)
560 : fCodePage_{cp}
561 {
562 }
563 virtual ~WindowsNative_ () = default;
564 virtual CodeCvt<char16_t>::Options GetOptions () const override
565 {
566 return {};
567 }
568 virtual span<char16_t> Bytes2Characters (span<const byte>* from, span<char16_t> to) const override;
569 virtual span<byte> Characters2Bytes (span<const char16_t> from, span<byte> to) const override;
570 virtual size_t ComputeTargetCharacterBufferSize (variant<span<const byte>, size_t> src) const override;
571 virtual size_t ComputeTargetByteBufferSize (variant<span<const char16_t>, size_t> src) const override;
572 CodePage fCodePage_;
573 };
574#endif
575 }
576
577 /*
578 ********************************************************************************
579 ************************* CodeCvt<CHAR_T>::Options *****************************
580 ********************************************************************************
581 */
582 template <IUNICODECanAlwaysConvertTo CHAR_T>
583 template <qCompilerAndStdLib_ConstraintDiffersInTemplateRedeclaration_BWA (IUNICODECanAlwaysConvertTo) FROM_CHAR_T_OPTIONS>
584 constexpr inline auto CodeCvt<CHAR_T>::Options::New (typename CodeCvt<FROM_CHAR_T_OPTIONS>::Options o) -> Options
585 {
586 return Options{.fInvalidCharacterReplacement = o.fInvalidCharacterReplacement};
587 }
588
589 /*
590 ********************************************************************************
591 ******************************* CodeCvt<CHAR_T> ********************************
592 ********************************************************************************
593 */
594 template <IUNICODECanAlwaysConvertTo CHAR_T>
595 inline CodeCvt<CHAR_T>::CodeCvt (const Options& options)
596 : fRep_{make_shared<UTFConvertRep_<char8_t>> (options)} // default, is to serialize to UTF-8
597 {
598 }
599 template <IUNICODECanAlwaysConvertTo CHAR_T>
600 inline CodeCvt<CHAR_T>::CodeCvt (const locale& l, const Options& options)
601 {
603 if constexpr (same_as<CHAR_T, wchar_t>) {
604 *this = mkFromStdCodeCvt<codecvt_byname<wchar_t, char, mbstate_t>> (options, l.name ());
605 }
606 else if constexpr (same_as<CHAR_T, char16_t> or same_as<CHAR_T, char32_t>) {
607 *this = mkFromStdCodeCvt<codecvt_byname<CHAR_T, char8_t, mbstate_t>> (options, l.name ());
608 }
609 else if constexpr (same_as<CHAR_T, Character>) {
610 fRep_ = make_shared<UTF2UTFRep_<char32_t>> (CodeCvt<char32_t>::mkFromStdCodeCvt<codecvt_byname<char32_t, char8_t, mbstate_t>> (
611 CodeCvt<char32_t>::Options::New<CHAR_T> (options), l.name ()));
612 }
613 else {
614 // CHAR_T COULD be UTF-8, but not clear if/why that would be useful.
616 }
617 DISABLE_COMPILER_MSC_WARNING_END (4996)
618 }
619 template <IUNICODECanAlwaysConvertTo CHAR_T>
620 CodeCvt<CHAR_T>::CodeCvt (const Charset& charset, const Options& options)
621 {
622 if (charset == WellKnownCharsets::kISO_8859_1) {
623 fRep_ = make_shared<Latin1ConvertRep_> (options);
624 }
625 else if (charset == WellKnownCharsets::kUTF8) {
626 *this = CodeCvt<CHAR_T>{UnicodeExternalEncodings::eUTF8};
627 }
628 else if (same_as<CHAR_T, Character>) {
630 // best hope is to treat it as a locale name, and hope its found
631 fRep_ = make_shared<UTF2UTFRep_<char32_t>> (CodeCvt<char32_t>::mkFromStdCodeCvt<codecvt_byname<char32_t, char8_t, mbstate_t>> (
632 CodeCvt<char32_t>::Options::New<CHAR_T> (options), charset.AsNarrowSDKString ()));
633 DISABLE_COMPILER_MSC_WARNING_END (4996)
634 }
635 else {
636 Private_::ThrowCharsetNotSupportedException_ (charset);
637 }
638 }
639 template <IUNICODECanAlwaysConvertTo CHAR_T>
640 CodeCvt<CHAR_T>::CodeCvt (UnicodeExternalEncodings e, const Options& options)
641 : fRep_{}
642 {
643 switch (e) {
644 case UnicodeExternalEncodings::eUTF8:
645 fRep_ = make_shared<UTFConvertRep_<char8_t>> (options);
646 break;
647 case UnicodeExternalEncodings::eUTF16_BE:
648 case UnicodeExternalEncodings::eUTF16_LE:
649 if (e == UnicodeExternalEncodings::eUTF16) {
650 fRep_ = make_shared<UTFConvertRep_<char16_t>> (options);
651 }
652 else {
653 fRep_ = make_shared<UTFConvertSwappedRep_<char16_t>> (options);
654 }
655 break;
656 case UnicodeExternalEncodings::eUTF32_BE:
657 case UnicodeExternalEncodings::eUTF32_LE:
658 if (e == UnicodeExternalEncodings::eUTF32) {
659 fRep_ = make_shared<UTFConvertRep_<char32_t>> (options);
660 }
661 else {
662 fRep_ = make_shared<UTFConvertSwappedRep_<char32_t>> (options);
663 }
664 break;
665 default:
667 }
668 }
669 template <IUNICODECanAlwaysConvertTo CHAR_T>
670 CodeCvt<CHAR_T>::CodeCvt (span<const byte>* guessFormatFrom, const optional<CodeCvt>& useElse, const Options& options)
671 : fRep_{}
672 {
673 RequireNotNull (guessFormatFrom);
674 Require (useElse == nullopt or useElse->GetOptions ().fInvalidCharacterReplacement == options.fInvalidCharacterReplacement);
675 if (optional<tuple<UnicodeExternalEncodings, size_t>> r = ReadByteOrderMark (*guessFormatFrom)) {
676 *guessFormatFrom = guessFormatFrom->subspan (get<size_t> (*r));
677 fRep_ = CodeCvt{get<UnicodeExternalEncodings> (*r), options}.fRep_;
678 }
679 else {
680 fRep_ = useElse ? useElse->fRep_ : CodeCvt{options}.fRep_;
681 }
682 }
683 template <IUNICODECanAlwaysConvertTo CHAR_T>
684 CodeCvt<CHAR_T>::CodeCvt (CodePage cp, const Options& options)
685 : fRep_{}
686 {
687 // A few we have builtin table converters for (BuiltinSingleByteTableCodePageRep_);
688 // a few are just UTF, and we can convert those.
689 // On windows, we can delegate to WindowsNative_
690 // else give up and throw not supported code page.
691 switch (cp) {
692 case WellKnownCodePages::kANSI:
693 case WellKnownCodePages::kMAC:
694 case WellKnownCodePages::kPC:
695 case WellKnownCodePages::kPCA:
696 case WellKnownCodePages::kGreek:
697 case WellKnownCodePages::kTurkish:
698 case WellKnownCodePages::kHebrew:
699 case WellKnownCodePages::kArabic:
700 fRep_ = make_shared<UTF2UTFRep_<char16_t>> (
701 CodeCvt<char16_t> (make_shared<Private_::BuiltinSingleByteTableCodePageRep_> (cp, options.fInvalidCharacterReplacement)));
702 break;
703 case WellKnownCodePages::kUTF8:
704 fRep_ = make_shared<UTFConvertRep_<char8_t>> (options);
705 break;
706 case WellKnownCodePages::kUNICODE_WIDE:
707 fRep_ = make_shared<UTFConvertRep_<char16_t>> (options);
708 break;
709 case WellKnownCodePages::kUNICODE_WIDE_BIGENDIAN:
710 fRep_ = make_shared<UTFConvertSwappedRep_<char16_t>> (options);
711 break;
712 default:
713#if qStroika_Foundation_Common_Platform_Windows
714 if (options.fInvalidCharacterReplacement) {
715 Private_::ThrowCodePageNotSupportedException_ (cp); // WindowsNative_ doesn't support fInvalidCharacterReplacement
716 }
717 fRep_ = make_shared<UTF2UTFRep_<char16_t>> (CodeCvt<char16_t> (make_shared<Private_::WindowsNative_> (cp)));
718 break;
719#else
720 Private_::ThrowCodePageNotSupportedException_ (cp);
721#endif
722 }
723 }
724 template <IUNICODECanAlwaysConvertTo CHAR_T>
725 template <IUNICODECanAlwaysConvertTo INTERMEDIATE_CHAR_T>
726 inline CodeCvt<CHAR_T>::CodeCvt (const CodeCvt<INTERMEDIATE_CHAR_T>& basedOn)
727 : fRep_{make_shared<UTF2UTFRep_<INTERMEDIATE_CHAR_T>> (basedOn)}
728 {
729 }
730 template <IUNICODECanAlwaysConvertTo CHAR_T>
731 inline CodeCvt<CHAR_T>::CodeCvt (const shared_ptr<IRep>& rep)
732 : fRep_{(RequireExpression (rep != nullptr), rep)}
733 {
734 }
735 template <IUNICODECanAlwaysConvertTo CHAR_T>
736 template <IStdCodeCVT STD_CODECVT, typename... ARGS>
737 inline CodeCvt<CHAR_T> CodeCvt<CHAR_T>::mkFromStdCodeCvt (const Options& options, ARGS... args)
738 requires (same_as<CHAR_T, typename STD_CODECVT::intern_type>)
739 {
740 auto u = make_unique<Private_::deletable_facet_<STD_CODECVT>> (forward<ARGS> (args)...);
741 return CodeCvt<CHAR_T>{make_shared<CodeCvt_WrapStdCodeCvt_<Private_::deletable_facet_<STD_CODECVT>>> (options, move (u))};
742 }
743 template <IUNICODECanAlwaysConvertTo CHAR_T>
744 inline auto CodeCvt<CHAR_T>::GetOptions () const -> Options
745 {
746 return fRep_->GetOptions ();
747 }
748 template <IUNICODECanAlwaysConvertTo CHAR_T>
749 inline auto CodeCvt<CHAR_T>::Bytes2Characters (span<const byte> from) const -> size_t
750 {
751 Memory::StackBuffer<CHAR_T> to{ComputeTargetCharacterBufferSize (from)};
752 return fRep_->Bytes2Characters (&from, span{to}).size ();
753 }
754 template <IUNICODECanAlwaysConvertTo CHAR_T>
755 inline auto CodeCvt<CHAR_T>::Bytes2Characters (span<const byte>* from, span<CHAR_T> to) const -> span<CHAR_T>
756 {
757 RequireNotNull (from);
758 AssertNotNull (fRep_);
759 Require (to.size () >= ComputeTargetCharacterBufferSize (*from) or to.size () >= Bytes2Characters (*from)); // ComputeTargetCharacterBufferSize cheaper to compute
760 auto r = fRep_->Bytes2Characters (from, to);
761 Ensure (from->size () < 10); // can only contain bytes for a partial character so must be small, typically one or two or zero
762 WeakAssert (from->size () <= 2);
763 return r;
764 }
765 template <IUNICODECanAlwaysConvertTo CHAR_T>
766 inline auto CodeCvt<CHAR_T>::Bytes2Characters (span<const byte> from, span<CHAR_T> to) const -> span<CHAR_T>
767 {
768 AssertNotNull (fRep_);
769 Require (to.size () >= ComputeTargetCharacterBufferSize (from) or to.size () >= Bytes2Characters (from)); // ComputeTargetCharacterBufferSize cheaper to compute
770 size_t origSize = from.size ();
771 auto result = fRep_->Bytes2Characters (&from, to);
772 if (not from.empty ()) {
773 Private_::ThrowErrorConvertingBytes2Characters_ (origSize - from.size ());
774 }
775 return result;
776 }
777 template <IUNICODECanAlwaysConvertTo CHAR_T>
778 inline auto CodeCvt<CHAR_T>::Characters2Bytes (span<const CHAR_T> from) const -> size_t
779 {
780 Memory::StackBuffer<byte> to{ComputeTargetByteBufferSize (from)};
781 return fRep_->Characters2Bytes (from, span{to}).size ();
782 }
783 template <IUNICODECanAlwaysConvertTo CHAR_T>
784 inline auto CodeCvt<CHAR_T>::Characters2Bytes (span<const CHAR_T> from, span<byte> to) const -> span<byte>
785 {
786 AssertNotNull (fRep_);
787 Require (to.size () >= ComputeTargetByteBufferSize (from) or to.size () >= Characters2Bytes (from)); // ComputeTargetByteBufferSize cheaper to compute
788 return fRep_->Characters2Bytes (from, to);
789 }
790 template <IUNICODECanAlwaysConvertTo CHAR_T>
791 inline size_t CodeCvt<CHAR_T>::ComputeTargetCharacterBufferSize (span<const byte> src) const
792 {
793 return fRep_->ComputeTargetCharacterBufferSize (src);
794 }
795 template <IUNICODECanAlwaysConvertTo CHAR_T>
796 inline size_t CodeCvt<CHAR_T>::ComputeTargetCharacterBufferSize (size_t srcSize) const
797 {
798 return fRep_->ComputeTargetCharacterBufferSize (srcSize);
799 }
800 template <IUNICODECanAlwaysConvertTo CHAR_T>
801 inline size_t CodeCvt<CHAR_T>::ComputeTargetByteBufferSize (span<const CHAR_T> src) const
802 {
803 return fRep_->ComputeTargetByteBufferSize (src);
804 }
805 template <IUNICODECanAlwaysConvertTo CHAR_T>
806 inline size_t CodeCvt<CHAR_T>::ComputeTargetByteBufferSize (size_t srcSize) const
807 {
808 return fRep_->ComputeTargetByteBufferSize (srcSize);
809 }
810 template <IUNICODECanAlwaysConvertTo CHAR_T>
811 template <constructible_from<const CHAR_T*, const CHAR_T*> STRINGISH>
812 STRINGISH CodeCvt<CHAR_T>::Bytes2String (span<const byte> from) const
813 {
814 size_t origSize = from.size ();
815 Memory::StackBuffer<CHAR_T> buf{this->ComputeTargetCharacterBufferSize (from)};
816 span<CHAR_T> r = this->Bytes2Characters (&from, span{buf});
817 if (not from.empty ()) {
818 Private_::ThrowErrorConvertingBytes2Characters_ (origSize - from.size ());
819 }
820 return STRINGISH{r.data (), r.data () + r.size ()};
821 }
822 template <IUNICODECanAlwaysConvertTo CHAR_T>
823 template <constructible_from<const byte*, const byte*> BLOBISH>
824 BLOBISH CodeCvt<CHAR_T>::String2Bytes (span<const CHAR_T> from) const
825 {
826 Memory::StackBuffer<byte> buf{Memory::eUninitialized, this->ComputeTargetByteBufferSize (from)};
827 const span<const byte> r = this->Characters2Bytes (from, span{buf});
828 if constexpr (same_as<BLOBISH, string>) {
829 return string{reinterpret_cast<const char*> (r.data ()), reinterpret_cast<const char*> (r.data ()) + r.size ()};
830 }
831 else {
832 return BLOBISH{r.data (), r.data () + r.size ()};
833 }
834 }
835
836}
#define AssertNotNull(p)
Definition Assertions.h:333
#define AssertNotImplemented()
Definition Assertions.h:401
#define RequireNotReached()
Definition Assertions.h:385
#define RequireNotNull(p)
Definition Assertions.h:347
#define WeakAssert(c)
A WeakAssert() is for things that aren't guaranteed to be true, but are overwhelmingly likely to be t...
Definition Assertions.h:438
#define RequireExpression(c)
Definition Assertions.h:267
#define AssertNotReached()
Definition Assertions.h:355
static CodeCvt mkFromStdCodeCvt(const Options &options={}, ARGS... args)
nonvirtual STRINGISH Bytes2String(span< const byte > from) const
nonvirtual size_t Bytes2Characters(span< const byte > from) const
convert span byte (external serialized format) parameters to characters (like std::codecvt<>::in () -...
Definition CodeCvt.inl:749
CodeCvt(const Options &options=Options{})
Definition CodeCvt.inl:595
nonvirtual BLOBISH String2Bytes(span< const CHAR_T > from) const
Logically halfway between std::array and std::vector; Smart 'direct memory array' - which when needed...
constexpr optional< tuple< UnicodeExternalEncodings, size_t > > ReadByteOrderMark(span< const byte > d) noexcept
UnicodeExternalEncodings
list of external UNICODE character encodings, for file IO (eDEFAULT = eUTF8)
Definition UTFConvert.h:31