Stroika Library 3.0d18
 
Loading...
Searching...
No Matches
Character.inl
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include <bit>
5#include <cwctype>
6#if !qCompilerAndStdLib_stdlib_ranges_pretty_broken_Buggy
7#include <ranges>
8#endif
9#include <type_traits>
10
12#include "Stroika/Foundation/Memory/Common.h"
13
15
16 namespace Private_ {
17 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T, size_t E1, size_t E2>
18 constexpr strong_ordering Compare_CS_ (span<const CHAR_T, E1> lhs, span<const CHAR_T, E2> rhs)
19 {
20 size_t lLen = lhs.size ();
21 size_t rLen = rhs.size ();
22 size_t length = min (lLen, rLen);
23 const CHAR_T* li = lhs.data ();
24 const CHAR_T* ri = rhs.data ();
25
26 // when can we use memcmp() instead of a loop comparing?
27 // for sizeof CHAR_T == 1, sure.
28 // for bigger CHAR_T, we have to worry about endianness
29 // |HI-1|LO-1|HI-2|LO-2... - for this case, we are all set, because
30 // this will have the same 'ordering' when we compare characters as when we compare as bytes
31 constexpr bool kCanUseMemCmpOptimization_ = sizeof (CHAR_T) == 1 or (std::endian::native == std::endian::big);
32
33 // tested on windows, and no obvious difference
34 constexpr bool kUseStdTraitsCompare_ = true;
35
36 if constexpr (kUseStdTraitsCompare_) {
37 using TRAITS_CHAR_T = conditional_t<sizeof (CHAR_T) == 4, char32_t, CHAR_T>;
38 int r = std::char_traits<TRAITS_CHAR_T>::compare (reinterpret_cast<const TRAITS_CHAR_T*> (li),
39 reinterpret_cast<const TRAITS_CHAR_T*> (ri), length);
40 if (r != 0) [[likely]] {
42 }
43 }
44 else if constexpr (kCanUseMemCmpOptimization_) {
45 int r = std::memcmp (li, ri, length);
46 if (r != 0) [[likely]] {
48 }
49 }
50 else {
51 const CHAR_T* lend = li + length; // just end of what we are comparing in this loop
52 for (; li != lend; ++li, ++ri) {
53 if (*li != *ri) [[likely]] {
54 return *li <=> *ri;
55 }
56 }
57 }
58 return Common::CompareResultNormalizer (static_cast<ptrdiff_t> (lLen) - static_cast<ptrdiff_t> (rLen));
59 }
60 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T, size_t E1, size_t E2>
61 constexpr strong_ordering Compare_CI_ (span<const CHAR_T, E1> lhs, span<const CHAR_T, E2> rhs)
62 {
63 size_t lLen = lhs.size ();
64 size_t rLen = rhs.size ();
65 size_t length = min (lLen, rLen);
66 const CHAR_T* li = lhs.data ();
67 const CHAR_T* ri = rhs.data ();
68 const CHAR_T* lend = li + length; // just end of what we are comparing in this loop
69 for (; li != lend; ++li, ++ri) {
70 CHAR_T lc; // intentionally uninitialized
71 CHAR_T rc; // ""
72 if constexpr (same_as<CHAR_T, Character>) {
73 lc = li->ToLowerCase ();
74 rc = ri->ToLowerCase ();
75 }
76 else if constexpr (same_as<CHAR_T, ASCII>) {
77 // @todo NOT SURE if this works for Latin1...
78 // see https://en.cppreference.com/w/cpp/string/byte/tolower for rationale for this crazy casting
79 lc = static_cast<CHAR_T> (std::tolower (static_cast<unsigned char> (*li)));
80 rc = static_cast<CHAR_T> (std::tolower (static_cast<unsigned char> (*ri)));
81 }
82 else {
83 // see https://en.cppreference.com/w/cpp/string/byte/tolower for rationale for this crazy casting
84 lc = static_cast<CHAR_T> (std::towlower (static_cast<wchar_t> (*li)));
85 rc = static_cast<CHAR_T> (std::towlower (static_cast<wchar_t> (*ri)));
86 }
87 if (lc != rc) [[likely]] {
88 return lc <=> rc;
89 }
90 }
91 return Common::CompareResultNormalizer (static_cast<ptrdiff_t> (lLen) - static_cast<ptrdiff_t> (rLen));
92 }
93 void ThrowNotIsASCII_ ();
94 void ThrowNotIsLatin1_ ();
95 void ThrowSurrogatesOutOfRange_ ();
96 }
97
98 /*
99 ********************************************************************************
100 ************************************* Latin1 ***********************************
101 ********************************************************************************
102 */
103 constexpr inline Latin1::operator uint8_t () const
104 {
105 return data;
106 }
107
108 /*
109 ********************************************************************************
110 *********************************** Character **********************************
111 ********************************************************************************
112 */
113 template <IPossibleCharacterRepresentation CHAR_T>
114 constexpr bool Character::IsASCII (CHAR_T c) noexcept
115 {
116 // clang++-14 likes to see this defined before where its used...
117 if constexpr (same_as<remove_cv_t<CHAR_T>, Character>) {
118 return c.IsASCII ();
119 }
120 else if constexpr (same_as<remove_cv_t<CHAR_T>, Latin1>) {
121 return static_cast<uint8_t> (c) <= 0x7f;
122 }
123 else {
124 return static_cast<make_unsigned_t<CHAR_T>> (c) <= 0x7f;
125 }
126 }
127 constexpr inline Character::Character () noexcept
128 : fCharacterCode_{'\0'}
129 {
130 }
131 constexpr inline Character::Character (ASCII c)
132 : fCharacterCode_{static_cast<char32_t> (c)}
133 {
134 if (is_constant_evaluated ()) {
135 //static_assert (not IsASCII (c)); // not sure what/how todo this
136 }
137 if (not is_constant_evaluated () and not IsASCII (c)) [[unlikely]] {
138 Private_::ThrowNotIsASCII_ ();
139 }
140 }
141 constexpr inline Character::Character (Latin1 c) noexcept
142 : fCharacterCode_{static_cast<char32_t> (c)}
143 {
144 }
145 constexpr inline Character::Character (char16_t c)
146 : fCharacterCode_{static_cast<char32_t> (c)}
147 {
148 if (IsSurrogatePair_Hi (c)) [[unlikely]] {
149 Private_::ThrowSurrogatesOutOfRange_ ();
150 }
151 }
152 constexpr Character::Character (char16_t hiSurrogate, char16_t lowSurrogate)
153 {
154 /*
155 * See https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
156 *
157 * A surrogate pair denotes the code point
158 * 0x10000 + (H - 0xD800) x 0x400 + (L - 0xDC00)
159 */
160 constexpr int halfShift = 10; /* used for shifting by 10 bits */
161 constexpr char32_t halfBase = 0x0010000UL;
162 if (not IsSurrogatePair_Hi (hiSurrogate)) [[unlikely]] {
163 Private_::ThrowSurrogatesOutOfRange_ ();
164 }
165 if (not IsSurrogatePair_Lo (lowSurrogate)) [[unlikely]] {
166 Private_::ThrowSurrogatesOutOfRange_ ();
167 }
168 fCharacterCode_ = ((hiSurrogate - kUNICODESurrogate_High_Start) << halfShift) + (lowSurrogate - kUNICODESurrogate_Low_Start) + halfBase;
169 }
170 constexpr inline Character::Character (char32_t c) noexcept
171 : fCharacterCode_{c}
172 {
173 }
174 constexpr inline Character::Character (wchar_t c) noexcept (sizeof (wchar_t) == 4)
175 : fCharacterCode_{static_cast<char32_t> (c)}
176 {
177 if constexpr (sizeof (wchar_t) != 4) {
178 if (IsSurrogatePair_Hi (c)) [[unlikely]] {
179 Private_::ThrowSurrogatesOutOfRange_ ();
180 }
181 }
182 }
183 inline ASCII Character::GetAsciiCode () const noexcept
184 {
185 Require (IsASCII ());
186 return static_cast<char> (fCharacterCode_);
187 }
188 constexpr char32_t Character::GetCharacterCode () const noexcept
189 {
190 return fCharacterCode_;
191 }
192 constexpr Character::operator char32_t () const noexcept
193 {
194 return fCharacterCode_;
195 }
196 template <typename T>
197 constexpr T Character::As () const noexcept
198 requires (same_as<T, char32_t> or (sizeof (wchar_t) == sizeof (char32_t) and same_as<T, wchar_t>))
199 {
200 return GetCharacterCode ();
201 }
202 template <>
203 void Character::AsHelper_ (Memory::StackBuffer<char8_t>* buf) const;
204 template <>
205 void Character::AsHelper_ (Memory::StackBuffer<char16_t>* buf) const;
206 template <IUNICODECodePoint T>
207 inline span<const T> Character::As (Memory::StackBuffer<T>* buf) const
208 {
209 RequireNotNull (buf);
210 if constexpr (sizeof (T) == sizeof (char32_t)) {
211 buf->clear ();
212 buf->push_back (this->GetCharacterCode ());
213 return span{*buf};
214 }
215 else if constexpr (same_as<T, wchar_t>) {
216 Assert (sizeof (wchar_t) == sizeof (char16_t));
217 this->AsHelper_ (reinterpret_cast<Memory::StackBuffer<char16_t>*> (buf));
218 Ensure (1 <= buf->size () and buf->size () <= 3);
219 return span{*buf};
220 }
221 else if constexpr (same_as<T, char8_t> or same_as<T, char16_t>) {
222 this->AsHelper_ (buf);
223 Ensure (1 <= buf->size () and buf->size () <= 3);
224 return span{*buf};
225 }
226 }
227 constexpr bool Character::IsASCII () const noexcept
228 {
229 return 0x0 <= fCharacterCode_ and fCharacterCode_ <= 0x7f;
230 }
231 template <IPossibleCharacterRepresentation CHAR_T>
232 constexpr bool Character::IsASCII (span<const CHAR_T> fromS) noexcept
233 {
234 constexpr auto charComparer = [] () noexcept {
235 if constexpr (same_as<remove_cv_t<CHAR_T>, Character>) {
236 return [] (Character c) noexcept { return c.IsASCII (); };
237 }
238 else if constexpr (same_as<remove_cv_t<CHAR_T>, Latin1>) {
239 return [] (Latin1 c) noexcept { return static_cast<uint8_t> (c) <= 0x7f; };
240 }
241 else {
242 return [] (CHAR_T c) noexcept { return static_cast<make_unsigned_t<CHAR_T>> (c) <= 0x7f; };
243 }
244 }();
245#if qCompilerAndStdLib_stdlib_ranges_pretty_broken_Buggy
246 return std::all_of (fromS.begin (), fromS.end (), charComparer);
247#else
248 return ranges::all_of (fromS, charComparer);
249#endif
250 }
251 template <IPossibleCharacterRepresentation CHAR_T>
252 constexpr void Character::CheckASCII (span<const CHAR_T> s)
253 {
254 if (not IsASCII (s)) [[unlikely]] {
255 if (is_constant_evaluated ()) {
256 throw "Argument not valid ASCII";
257 }
258 else {
259 Private_::ThrowNotIsASCII_ (); // not constexpr so can go in CPP file
260 }
261 }
262 }
263 template <IPossibleCharacterRepresentation CHAR_T>
264 constexpr void Character::CheckASCII (span<CHAR_T> s)
265 {
266 CheckASCII (Memory::ConstSpan (s));
267 }
268 constexpr bool Character::IsLatin1 () const noexcept
269 {
270 return 0x0 <= fCharacterCode_ and fCharacterCode_ <= 0xff;
271 }
272 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
273 constexpr bool Character::IsLatin1 (span<const CHAR_T> fromS) noexcept
274 {
275 if constexpr (same_as<CHAR_T, ASCII> or same_as<CHAR_T, Latin1>) {
276 // then data must be ascii or latin1, since any byte is latin1
277 return true;
278 }
279 else if constexpr (same_as<CHAR_T, char8_t>) {
280 // For the special case of UTF-8, tricky to tell if its latin1 or not. Must iterate through the remaining
281 // two-byte pairs, and make sure they are 110xxxxx followed by 10xxxxxx where the xxx's get unpacked in to <= 0xff
282 if (fromS.size () % 2 == 0) {
283 for (auto i = fromS.begin (); i < fromS.end (); ++i) {
284 char8_t c1 = *i++;
285 char8_t c2 = *i;
286 // Check legit high order bits for first and second byte
287 // and to check RANGE of values being latin1, means bit pattern only
288 // 8 bits. Low order bits in second byte, so dont need to examine those.
289 // Just make sure at most two bits set in upper (first) byte
290 if ((c1 & 0b11100000) == 0b11000000 and ((c2 & 0b11000000) == 0b10000000 and (c1 & 0b00111111) <= 0b011)) [[likely]] {
291 // OK
292 }
293 else {
294 return false;
295 }
296 }
297 return true;
298 }
299 return false;
300 }
301 else {
302 static_assert (2 <= sizeof (CHAR_T) and sizeof (CHAR_T) <= 4);
303 constexpr auto charComparer = [] () noexcept {
304 if constexpr (same_as<remove_cv_t<CHAR_T>, Character>) {
305 return [] (Character c) noexcept { return c.IsLatin1 (); };
306 }
307 else {
308 return [] (CHAR_T c) noexcept { return static_cast<make_unsigned_t<CHAR_T>> (c) <= 0xff; };
309 }
310 }();
311#if qCompilerAndStdLib_stdlib_ranges_pretty_broken_Buggy
312 return std::all_of (fromS.begin (), fromS.end (), charComparer);
313#else
314 return ranges::all_of (fromS, charComparer);
315#endif
316 }
317 }
318 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
319 inline void Character::CheckLatin1 (span<const CHAR_T> s)
320 {
321 if (not IsLatin1 (s)) [[unlikely]] {
322 Private_::ThrowNotIsLatin1_ ();
323 }
324 }
325 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
326 inline void Character::CheckLatin1 (span<CHAR_T> s)
327 {
328 CheckLatin1 (Memory::ConstSpan (s));
329 }
330 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
331 constexpr auto Character::IsASCIIOrLatin1 (span<const CHAR_T> s) noexcept -> ASCIIOrLatin1Result
332 {
333 constexpr auto eNone = ASCIIOrLatin1Result::eNone;
334 constexpr auto eLatin1 = ASCIIOrLatin1Result::eLatin1;
335 constexpr auto eASCII = ASCIIOrLatin1Result::eASCII;
336 if constexpr (same_as<CHAR_T, ASCII> or same_as<CHAR_T, Latin1>) {
337 // then data must be ascii or latin1, since any byte is latin1
338 return IsASCII (s) ? eASCII : eLatin1;
339 }
340 else {
341 constexpr auto isASCII = [] () noexcept {
342 if constexpr (same_as<remove_cv_t<CHAR_T>, Character>) {
343 return [] (Character c) noexcept { return c.IsASCII (); };
344 }
345 else {
346 return [] (CHAR_T c) noexcept { return static_cast<make_unsigned_t<CHAR_T>> (c) <= 0x7f; };
347 }
348 }();
349#if qCompilerAndStdLib_stdlib_ranges_pretty_broken_Buggy
350 auto i = s.begin ();
351 for (; i != s.end () and isASCII (*i); ++i)
352 ;
353 size_t leadingAsciiCharCnt = static_cast<size_t> (i - s.begin ());
354 if (leadingAsciiCharCnt == s.size ()) [[likely]] {
355 return eASCII;
356 }
357#else
358 auto leadingASCIISpan = ranges::take_while_view (s, isASCII);
359 size_t leadingAsciiCharCnt = static_cast<size_t> (ranges::distance (leadingASCIISpan));
360 if (leadingAsciiCharCnt == s.size ()) [[likely]] {
361 return eASCII;
362 }
363#endif
364 span remainingInputSpan = s.subspan (leadingAsciiCharCnt);
365 if constexpr (same_as<CHAR_T, char8_t>) {
366 // special case - we need different algorithm looking at pairs of entries, to see if IsLatin1 within utf8
367 return IsLatin1 (remainingInputSpan) ? eLatin1 : eNone;
368 }
369 constexpr auto isLatin1 = [] () noexcept {
370 if constexpr (same_as<remove_cv_t<CHAR_T>, Character>) {
371 return [] (Character c) noexcept { return c.IsLatin1 (); };
372 }
373 else {
374 return [] (CHAR_T c) noexcept { return static_cast<make_unsigned_t<CHAR_T>> (c) <= 0xff; };
375 }
376 }();
377#if qCompilerAndStdLib_stdlib_ranges_pretty_broken_Buggy
378 auto ii = remainingInputSpan.begin ();
379 for (; ii != remainingInputSpan.end () and isLatin1 (*ii); ++ii)
380 ;
381 size_t remainingLatin1 = static_cast<size_t> (ii - remainingInputSpan.begin ());
382 if (remainingLatin1 == remainingInputSpan.size ()) [[likely]] {
383 return eLatin1;
384 }
385#else
386 auto remainingLatin1 = ranges::take_while_view (remainingInputSpan, isLatin1);
387 if (static_cast<size_t> (ranges::distance (remainingLatin1)) == remainingInputSpan.size ()) [[likely]] {
388 return eLatin1;
389 }
390#endif
391 return eNone;
392 }
393 }
394 constexpr bool Character::IsWhitespace () const noexcept
395 {
396 bool result = false;
397 if (0x09 <= fCharacterCode_ and fCharacterCode_ <= 0x0D) [[unlikely]] {
398 result = true;
399 }
400 else if (fCharacterCode_ == 0x20) [[unlikely]] {
401 result = true;
402 }
403 else if (fCharacterCode_ >= 0x1680) [[unlikely]] {
404 // rarely get chars this big, so shortcut all the detailed tests
405 if (fCharacterCode_ == 0x1680 or fCharacterCode_ == 0x180E) [[unlikely]] {
406 result = true;
407 }
408 else if (0x2000 <= fCharacterCode_ and fCharacterCode_ <= 0x2006) [[unlikely]] {
409 result = true;
410 }
411 else if (0x2008 <= fCharacterCode_ and fCharacterCode_ <= 0x200A) [[unlikely]] {
412 result = true;
413 }
414 else if (fCharacterCode_ == 0x2028 or fCharacterCode_ == 0x2029 or fCharacterCode_ == 0x205F or fCharacterCode_ == 0x3000) [[unlikely]] {
415 result = true;
416 }
417 }
419 DISABLE_COMPILER_CLANG_WARNING_START ("clang diagnostic ignored \"-Wconstant-evaluated\"");
420 DISABLE_COMPILER_GCC_WARNING_START ("GCC diagnostic ignored \"-Wtautological-compare\"");
421 if constexpr (not std::is_constant_evaluated ()) {
422 Ensure (result == !!iswspace (static_cast<wchar_t> (fCharacterCode_)));
423 }
424 DISABLE_COMPILER_GCC_WARNING_END ("GCC diagnostic ignored \"-Wtautological-compare\"");
425 DISABLE_COMPILER_CLANG_WARNING_END ("clang diagnostic ignored \"-Wconstant-evaluated\"");
426 DISABLE_COMPILER_MSC_WARNING_END (5063)
427 return result;
428 }
429 constexpr bool Character::IsWhitespace (Character c) noexcept
430 {
431 return c.IsWhitespace ();
432 }
433 inline bool Character::IsDigit () const noexcept
434 {
435 // https://stackoverflow.com/questions/60353945/isthing-equivalents-for-char32-t
436 // @todo RECONSIDER IF THIS IS RIGHT FOR char32_t?
437 return !!iswdigit (static_cast<wchar_t> (fCharacterCode_));
438 }
439 inline bool Character::IsHexDigit () const noexcept
440 {
441 // https://stackoverflow.com/questions/60353945/isthing-equivalents-for-char32-t
442 return !!iswxdigit (static_cast<wchar_t> (fCharacterCode_));
443 }
444 inline bool Character::IsAlphabetic () const noexcept
445 {
446 // https://stackoverflow.com/questions/60353945/isthing-equivalents-for-char32-t
447 return !!iswalpha (static_cast<wchar_t> (fCharacterCode_));
448 }
449 inline bool Character::IsUpperCase () const noexcept
450 {
451 // https://stackoverflow.com/questions/60353945/isthing-equivalents-for-char32-t
452 return !!iswupper (static_cast<wchar_t> (fCharacterCode_));
453 }
454 inline bool Character::IsLowerCase () const noexcept
455 {
456 // https://stackoverflow.com/questions/60353945/isthing-equivalents-for-char32-t
457 return !!iswlower (static_cast<wchar_t> (fCharacterCode_));
458 }
459 inline bool Character::IsAlphaNumeric () const noexcept
460 {
461 // https://stackoverflow.com/questions/60353945/isthing-equivalents-for-char32-t
462 return !!iswalnum (static_cast<wchar_t> (fCharacterCode_));
463 }
464 inline bool Character::IsPunctuation () const noexcept
465 {
466 // https://stackoverflow.com/questions/60353945/isthing-equivalents-for-char32-t
467 return !!iswpunct (static_cast<wchar_t> (fCharacterCode_));
468 }
469 constexpr bool Character::IsControl () const noexcept
470 {
471 /*
472 * According to https://en.cppreference.com/w/cpp/string/wide/iswcntrl
473 *
474 * ISO 30112 defines POSIX control characters as Unicode characters U+0000..U+001F,
475 * U+007F..U+009F, U+2028, and U+2029 (Unicode classes Cc, Zl, and Zp)
476 *
477 * Be explicit here so can use constexpr
478 *
479 * WAS: return !!iswcntrl (static_cast<wchar_t> (fCharacterCode_));
480 */
481 if (0 <= fCharacterCode_ and fCharacterCode_ <= 0x1f) [[unlikely]] {
482 return true;
483 }
484 if (0x7f <= fCharacterCode_ and fCharacterCode_ <= 0x9f) [[unlikely]] {
485 return true;
486 }
487 if (0x2028 == fCharacterCode_ or 0x2029 == fCharacterCode_) [[unlikely]] {
488 return true;
489 }
490 return false;
491 }
492 inline Character Character::ToLowerCase () const noexcept
493 {
494 // https://stackoverflow.com/questions/60353945/isthing-equivalents-for-char32-t
495 // Cannot find good spec on towlower/towupper, so not sure that this check is necessary
496 //
497 // before Stroika v3.0d1 - we used to check iswupper first, but according to https://en.cppreference.com/w/cpp/string/wide/towlower
498 // that appears unnecessary
499 return static_cast<wchar_t> (::towlower (static_cast<wchar_t> (fCharacterCode_)));
500 }
501 inline Character Character::ToUpperCase () const noexcept
502 {
503 // See ToLowerCase() for implementation comments
504 return static_cast<wchar_t> (::towupper (static_cast<wchar_t> (fCharacterCode_)));
505 }
506 template <typename RESULT_T, IPossibleCharacterRepresentation CHAR_T>
507 inline bool Character::AsASCIIQuietly (span<const CHAR_T> fromS, RESULT_T* into)
508 requires requires (RESULT_T* into) {
509 { into->empty () } -> same_as<bool>;
510 { into->push_back (ASCII{0}) };
511 }
512 {
513 RequireNotNull (into);
514 Require (into->empty ());
515 // note - tried to simplify with conditional_t but both sides evaluated
516 if constexpr (same_as<remove_cv_t<CHAR_T>, Character>) {
517 for (Character c : fromS) {
518 if (c.IsASCII ()) [[likely]] {
519 into->push_back (c.GetAsciiCode ());
520 }
521 else {
522 return false;
523 }
524 }
525 }
526 else {
527 for (CHAR_T c : fromS) {
528 if (static_cast<make_unsigned_t<CHAR_T>> (c) <= 0x7f) [[likely]] {
529 into->push_back (static_cast<char> (c));
530 }
531 else {
532 return false;
533 }
534 }
535 }
536 return true;
537 }
538 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T, size_t E1, size_t E2>
539 constexpr strong_ordering Character::Compare (span<const CHAR_T, E1> lhs, span<const CHAR_T, E2> rhs, CompareOptions co) noexcept
540 {
541 Require (co == eWithCase or co == eCaseInsensitive);
542 switch (co) {
543 case eWithCase:
544 return Private_::Compare_CS_ (lhs, rhs);
545 case eCaseInsensitive:
546 return Private_::Compare_CI_ (lhs, rhs);
547 default:
549 return strong_ordering::equal;
550 }
551 }
552 constexpr bool Character::IsSurrogatePair () const
553 {
554 /*
555 * See https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
556 *
557 * A surrogate pair denotes the code point
558 * 0x10000 + (H - 0xD800) x 0x400 + (L - 0xDC00)
559 */
560 constexpr char32_t kMinCode_ = 0x10000;
561 constexpr char32_t kMaxCode_ = kMinCode_ + (kUNICODESurrogate_High_End - kUNICODESurrogate_High_Start) * 0x400 +
562 (kUNICODESurrogate_Low_End - kUNICODESurrogate_Low_Start);
563 return kMinCode_ <= fCharacterCode_ and fCharacterCode_ <= kMaxCode_;
564 }
565 constexpr bool Character::IsSurrogatePair (char16_t hiSurrogate, char16_t lowSurrogate)
566 {
567 return IsSurrogatePair_Hi (hiSurrogate) and IsSurrogatePair_Lo (lowSurrogate);
568 }
569 constexpr bool Character::IsSurrogatePair_Hi (char16_t hiSurrogate)
570 {
571 return kUNICODESurrogate_High_Start <= hiSurrogate and hiSurrogate <= kUNICODESurrogate_High_End;
572 }
573 constexpr bool Character::IsSurrogatePair_Lo (char16_t lowSurrogate)
574 {
575 return kUNICODESurrogate_Low_Start <= lowSurrogate and lowSurrogate <= kUNICODESurrogate_Low_End;
576 }
577 constexpr pair<char16_t, char16_t> Character::GetSurrogatePair () const
578 {
579 Require (IsSurrogatePair ());
580 /*
581 * Run fCharacterCode_ = ((hiSurrogate - kUNICODESurrogate_High_Start) << halfShift) + (lowSurrogate - kUNICODESurrogate_Low_Start) + halfBase; BACKWARDS
582 */
583 constexpr int halfShift = 10; /* used for shifting by 10 bits */
584 constexpr char32_t halfBase = 0x0010000UL;
585 constexpr char32_t halfMask = 0x3FFUL;
586 char32_t ch = fCharacterCode_ - halfBase;
587 return pair<char16_t, char16_t>{static_cast<char16_t> ((ch >> halfShift) + kUNICODESurrogate_High_Start),
588 static_cast<char16_t> ((ch & halfMask) + kUNICODESurrogate_Low_Start)};
589 }
590
591 /*
592 ********************************************************************************
593 *************************** Character::EqualsComparer **************************
594 ********************************************************************************
595 */
596 constexpr Character::EqualsComparer::EqualsComparer (CompareOptions co) noexcept
597 : fCompareOptions{co}
598 {
599 }
600 constexpr bool Character::EqualsComparer::operator() (Character lhs, Character rhs) const noexcept
601 {
602 using namespace Stroika::Foundation::Characters;
603 return Character::Compare (Memory::ConstSpan (span{&lhs, 1}), Memory::ConstSpan (span{&rhs, 1}), fCompareOptions) == 0;
604 }
605
606 /*
607 ********************************************************************************
608 ************************* Character::ThreeWayComparer **************************
609 ********************************************************************************
610 */
611 constexpr Character::ThreeWayComparer::ThreeWayComparer (Stroika::Foundation::Characters::CompareOptions co) noexcept
612 : fCompareOptions{co}
613 {
614 }
615 inline auto Character::ThreeWayComparer::operator() (Stroika::Foundation::Characters::Character lhs,
617 {
618 using namespace Stroika::Foundation::Characters;
619 return Character::Compare (Memory::ConstSpan (span{&lhs, 1}), Memory::ConstSpan (span{&rhs, 1}), fCompareOptions);
620 }
621
622}
623
// Forward declaration only; the EnumNames class template is defined elsewhere.
625 template <typename ENUM_TYPE>
626 class EnumNames;
// Explicit specialization of DefaultNames<>::k for Characters::CompareOptions:
// pairs each enumerator with its wide-string display name.
627 template <>
628 constexpr EnumNames<Characters::CompareOptions> DefaultNames<Characters::CompareOptions>::k{{{
629 {Characters::CompareOptions::eCaseInsensitive, L"Case-Insensitive"},
630 {Characters::CompareOptions::eWithCase, L"With-Case"},
631 }}};
632}
#define RequireNotNull(p)
Definition Assertions.h:347
#define AssertNotReached()
Definition Assertions.h:355
constexpr bool IsASCII() const noexcept
Return true iff the given character (or all in span) is (are) in the ascii range [0....
static void CheckLatin1(span< const CHAR_T > s)
if not IsLatin1 (arg) throw RuntimeException...
static constexpr void CheckASCII(span< const CHAR_T > s)
if not IsASCII (arg) throw RuntimeException...
static constexpr char16_t kUNICODESurrogate_High_Start
Definition Character.h:473
static constexpr ASCIIOrLatin1Result IsASCIIOrLatin1(span< const CHAR_T > s) noexcept
nonvirtual Character ToLowerCase() const noexcept
nonvirtual ASCII GetAsciiCode() const noexcept
static constexpr strong_ordering Compare(span< const CHAR_T, E1 > lhs, span< const CHAR_T, E2 > rhs, CompareOptions co) noexcept
constexpr bool IsControl() const noexcept
nonvirtual bool IsLowerCase() const noexcept
constexpr char32_t GetCharacterCode() const noexcept
Return the char32_t UNICODE code-point associated with this character.
constexpr pair< char16_t, char16_t > GetSurrogatePair() const
nonvirtual Character ToUpperCase() const noexcept
static bool AsASCIIQuietly(span< const CHAR_T > fromS, RESULT_T *into)
constexpr bool IsLatin1() const noexcept
Return true iff the given character (or all in span) is (are) in the ascii/iso-latin range [0....
constexpr bool IsWhitespace() const noexcept
nonvirtual bool IsUpperCase() const noexcept
Logically halfway between std::array and std::vector; Smart 'direct memory array' - which when needed...
nonvirtual void push_back(Common::ArgByValueType< T > e)
nonvirtual size_t size() const noexcept
char ASCII
Stroika's string/character classes treat 'char' as being an ASCII character.
Definition Character.h:59
constexpr strong_ordering CompareResultNormalizer(FROM_INT_TYPE f)
Definition Compare.inl:226
constexpr EqualsComparer(Stroika::Foundation::Characters::CompareOptions co=Stroika::Foundation::Characters::CompareOptions::eWithCase) noexcept
constexpr ThreeWayComparer(Stroika::Foundation::Characters::CompareOptions co=Stroika::Foundation::Characters::CompareOptions::eWithCase) noexcept