Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
Character.h
Go to the documentation of this file.
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#ifndef _Stroika_Foundation_Characters_Character_h_
5#define _Stroika_Foundation_Characters_Character_h_ 1
6
7#include "Stroika/Foundation/StroikaPreComp.h"
8
9#include <compare>
10#include <span>
11#include <type_traits>
12
16
17/**
18 * \file
19 *
20 * \note Code-Status: <a href="Code-Status.md#Beta">Beta</a>
21 *
22 * TODO:
23 *
24 * @todo ToLower ('GERMAN ES-ZETT' or 'SHARP S') returns two esses ('ss') - and we return a single character.
25 * We COULD change the return value, or simply document that issue here and define ToLower() of STRING to do
26 * the right thing for unusual cases like this, and use this API for the most common cases.
27 *
28 * I HOPE - though am not sure - that this is now addressed in Stroika v3 - by encoding Character as utf32_t.
29 */
30
32
33 /** \brief Selects whether a comparison distinguishes case (eWithCase) or ignores it (eCaseInsensitive).
34 */
35 enum class CompareOptions : uint8_t {
36 eWithCase, // case-sensitive comparison
37 eCaseInsensitive, // comparison that ignores case
38
39 Stroika_Define_Enum_Bounds (eWithCase, eCaseInsensitive) // project macro; presumably records the first/last enumerators - confirm against its definition
40 };
41 using CompareOptions::eCaseInsensitive; // make the enumerator usable unqualified in the enclosing scope
42 using CompareOptions::eWithCase; // ditto
43
44 /**
45 * \brief Stroika's string/character classes treat 'char' as being an ASCII character
46 *
47 * This alias declaration just documents that fact, without really enforcing anything.
48 * Prior to Stroika v3, the Stroika String classes basically prohibited the use of char
49 * because it was always UNCLEAR what character set to interpret it as.
50 *
51 * But a safe (and quite useful) assumption, is just that it is ASCII. If you assume it's
52 * always ASCII, you can simplify a lot of pragmatic usage. So Stroika v3 does that,
53 * with checks to enforce.
54 *
55 * So generally - Stroika String (and Character) APIs - if given a 'char' REQUIRE that it be
56 * ASCII (unless otherwise documented in that API). Use u8string, or something else if you don't want
57 * to assume ASCII.
58 */
59 using ASCII = char; // plain alias: ASCII and char are the same type, so the type system itself enforces nothing
60
61 /**
62 * Internally, several algorithms and data structures operate on this one-byte subset of UNICODE.
63 * However, most Stroika public APIs don't expose this, because this is not any kind of standard for APIs.
64 * APIs use char8_t, char16_t, char32_t, ASCII (aka char).
65 *
66 * This refers to ASCII OR https://en.wikipedia.org/wiki/Latin-1_Supplement, so any UNICODE character code point
67 * less than or equal to U+00FF.
68 *
69 * \note Considered using Latin1 = uint8_t; But this is better since less likely accidentally used.
70 */
71 struct Latin1 {
72 uint8_t data; // the single stored byte (one-byte code point value)
73 constexpr operator uint8_t () const; // implicit (non-explicit) conversion to uint8_t; defined out of line
74 constexpr bool operator== (const Latin1&) const = default; // member-wise equality
75 constexpr auto operator<=> (const Latin1&) const = default; // member-wise ordering
76 };
77 static_assert (is_trivially_constructible_v<Latin1>);
78 static_assert (is_trivially_destructible_v<Latin1>);
79 static_assert (sizeof (Latin1) == 1); // so can reinterpret_cast<> between Latin1 and unsigned char/uint8_t;
80
81 /**
82 * \brief check if T is char8_t, char16_t, char32_t - one of the three possible UNICODE UTF code-point classes.
83 *
84 * IBasicUNICODECodePoint:
85 * o char8_t
86 * o char16_t
87 * o char32_t
88 */
89 template <typename T>
90 concept IBasicUNICODECodePoint = Common::IAnyOf<remove_cv_t<T>, char8_t, char16_t, char32_t>; // remove_cv_t: const/volatile-qualified variants of these types match too
91
92 /**
93 * \brief check if T is IBasicUNICODECodePoint or wchar_t (any basic code-point class)
94 *
95 * IUNICODECodePoint:
96 * o char8_t IBasicUNICODECodePoint
97 * o char16_t ""
98 * o char32_t ""
99 * o wchar_t added here
100 */
101 template <typename T>
102 concept IUNICODECodePoint = IBasicUNICODECodePoint<T> or same_as<remove_cv_t<T>, wchar_t>; // cv-qualifiers on T are ignored for the wchar_t test as well
103
104 /**
105 * \brief concept IStdBasicStringCompatibleCharacter tests if the 'T' argument is a legit CHARACTER argument to std::basic_string, and basic_string_view (char,char8_t,char16_t,char32_t,wchar_t).
106 *
107 * \note ALL of these character types are ALSO legitimate arguments to Stroika's String class (array or span of such). In the case
108 * of char, the text must be ASCII.
109 * \note IStdBasicStringCompatibleCharacter<T> => IUNICODECanUnambiguouslyConvertFrom<T>
110 */
111 template <typename T>
113
114 class Character;
115
116 /**
117 * \brief UNICODE string can always be converted into array of this type
118 *
119 * IUNICODECanAlwaysConvertTo:
120 * o char8_t IBasicUNICODECodePoint
121 * o char16_t ""
122 * o char32_t ""
123 * o wchar_t IUNICODECodePoint
124 * o Character added in IUNICODECanAlwaysConvertTo
125 * \note all these types are <= 4 bytes (size of char32_t)
126 *
127 * \note - ASCII and Latin1 are NOT included here because - though these strings
128 * can be unambiguously converted to UNICODE, the REVERSE is not true (since for example
129 * not all UNICODE strings are ASCII).
130 */
131 template <typename T>
132 concept IUNICODECanAlwaysConvertTo = IUNICODECodePoint<T> or same_as<remove_cv_t<T>, Character>;
137 //static_assert (IUNICODECanAlwaysConvertTo<Character>); true but not defined yet, so cannot assert here
138 static_assert (not IUNICODECanAlwaysConvertTo<ASCII>); // char is none of the listed types
139 static_assert (not IUNICODECanAlwaysConvertTo<Latin1>); // nor is the Latin1 wrapper struct
140
141 /*
142 * IPossibleCharacterRepresentation concept corresponds to any type which MIGHT be convertible back and forth into a Character (possibly with extra information).
143 * For example, ASCII, Latin1, wchar_t, and obviously char32_t, etc...
144 */
145 template <typename T>
146 concept IPossibleCharacterRepresentation = convertible_to<T, char32_t> or same_as<remove_cv_t<T>, Character>; // anything implicitly convertible to char32_t, plus Character itself
147 static_assert (IPossibleCharacterRepresentation<char8_t>);
148 static_assert (IPossibleCharacterRepresentation<char16_t>);
149 static_assert (IPossibleCharacterRepresentation<char32_t>);
150 static_assert (IPossibleCharacterRepresentation<wchar_t>);
151 //static_assert (IPossibleCharacterRepresentation<Character>); true but not defined yet, so cannot assert here
152 static_assert (IPossibleCharacterRepresentation<ASCII>); // char converts implicitly to char32_t
153 static_assert (IPossibleCharacterRepresentation<Latin1>); // satisfied through Latin1's implicit operator uint8_t
154 static_assert (not IPossibleCharacterRepresentation<std::byte>); // std::byte is a scoped enumeration: no implicit conversion to char32_t
155
156 /**
157 * \brief IUNICODECanUnambiguouslyConvertFrom is any 'character representation type' where array of them unambiguously convertible to UNICODE string
158 *
159 * IUNICODECanUnambiguouslyConvertFrom:
160 * o char8_t IUNICODECodePoint
161 * o char16_t ""
162 * o char32_t ""
163 * o wchar_t ""
164 * o Character added
165 * o ASCII added (NOTE - this is a typedef for char - so CHAR* interpreted as ASCII ONLY)
166 * o Latin1 added
167 *
168 * \note IUNICODECanUnambiguouslyConvertFrom means any 'basic character type' - size <= 4 bytes, which
169 * could reasonably, in context (so with extra info), be safely converted into
170 * a Character object.
171 *
172 * @aliases Character_CanConditionallyConvertUNICODEStringToArrayOfThese
173 * for example, ASCII is one of these - and depending on what is in the UNICODE string
174 * you may be able to (unambiguously) convert to a string of this type.
175 *
176 * \see also IUNICODECanUnambiguouslyConvertTo
177 */
178 template <typename T>
184 //static_assert (IUNICODECanUnambiguouslyConvertFrom<Character>); true but not defined yet, so cannot assert here
187
188 /**
189 * \brief IUNICODECanUnambiguouslyConvertTo is any 'character representation type' you can unambiguously convert a UNICODE string into.
190 *
191 * IUNICODECanUnambiguouslyConvertTo:
192 * o char8_t IUNICODECodePoint
193 * o char16_t ""
194 * o char32_t ""
195 * o wchar_t ""
196 * o Character added
197 *
198 * \see also IUNICODECanUnambiguouslyConvertFrom
199 */
200 template <typename T>
206 //static_assert (IUNICODECanUnambiguouslyConvertTo<Character>); true but not defined yet, so cannot assert here
209
210 /**
211 * \note Satisfies Concepts:
212 * o static_assert (regular<Character>);
213 *
214 * \note <a href="Design-Overview.md#Comparisons">Comparisons</a>:
215 * o static_assert (totally_ordered<Character>)
216 * o Character::EqualsComparer and Character::ThreeWayComparer provided with construction parameters to allow case insensitive compares
217 */
218 class [[nodiscard]] Character {
219 public:
220 /**
221 * Default constructor produces a zero (null) character.
222 * Constructor with char32_t always produces a valid character.
223 *
224 * The overloads check for a valid character code-point and throw if given invalid data.
225 * - The overload taking a single char (ASCII) will throw if arg is not ASCII
226 * - The overload taking a single char16_t will throw if given a code-point not valid on its own (surrogate without pair)
227 * - The overload taking two char16_t surrogate pairs, may throw if given invalid code-points
228 * - The overload taking wchar_t will treat it as char16_t or char32_t constructor, depending on sizeof (wchar_t)
229 *
230 * To avoid checking, cast 'c' to char32_t, as any code-point will be considered valid (so no need to check).
231 */
232 constexpr Character () noexcept;
233 constexpr Character (const Character&) noexcept = default;
234 constexpr Character (Character&&) noexcept = default;
235 constexpr Character (ASCII c);
236 constexpr Character (Latin1 c) noexcept;
237 constexpr Character (char16_t c);
238 constexpr Character (char16_t hiSurrogate, char16_t lowSurrogate);
239 constexpr Character (char32_t c) noexcept;
240 constexpr Character (wchar_t c) noexcept (sizeof (wchar_t) == 4);
241
242 public:
243 constexpr Character& operator= (const Character&) noexcept = default;
244 constexpr Character& operator= (Character&&) noexcept = default;
245
246 public:
247 /**
248 * \pre IsASCII()
249 */
250 nonvirtual ASCII GetAsciiCode () const noexcept;
251
252 public:
253 /**
254 * \brief Return the char32_t UNICODE code-point associated with this character.
255 */
256 constexpr char32_t GetCharacterCode () const noexcept;
257
258 public:
259 /**
260 * Explicit because an implicit conversion creates too many ambiguities with things like c == '\0' where conversions can go both ways.
261 */
262 explicit constexpr operator char32_t () const noexcept;
263
264 public:
265 /*
266 * \brief /0 overload returns the character as a char32_t (or on systems where wchar_t is large enough, as wchar_t)
267 * /1 overload returns the character as a span<T> - for any IUNICODECodePoint - storing actual data in provided stack buffer.
268 *
269 * \note Before Stroika v3, As<> () always supported wchar_t - now only if it is the same size as char32_t.
270 */
271 template <typename T>
272 constexpr T As () const noexcept
273 requires (same_as<T, char32_t> or (sizeof (wchar_t) == sizeof (char32_t) and same_as<T, wchar_t>));
274 template <IUNICODECodePoint T>
275 nonvirtual span<const T> As (Memory::StackBuffer<T>* buf) const;
276
277 private:
278 template <IUNICODECodePoint T>
279 nonvirtual void AsHelper_ (Memory::StackBuffer<T>* buf) const;
280
281 public:
282 /**
283 * \brief Return true iff the given character (or all in span) is (are) in the ascii range [0..0x7f]
284 *
285 * \note unlike other uses of CHAR_T in other methods in this class, even if CHAR_T=ASCII
286 * the code still loops and checks the range of characters. This is because ASCII == char
287 * and you need some way to check a bunch of 'char' elements and see if they are ascii.
288 */
289 constexpr bool IsASCII () const noexcept;
290 template <IPossibleCharacterRepresentation CHAR_T>
291 static constexpr bool IsASCII (CHAR_T c) noexcept;
292 template <IPossibleCharacterRepresentation CHAR_T>
293 static constexpr bool IsASCII (span<const CHAR_T> s) noexcept;
294
295 public:
296 /**
297 * \brief if not IsASCII (arg) throw RuntimeException...
298 */
299 template <IPossibleCharacterRepresentation CHAR_T>
300 static constexpr void CheckASCII (span<const CHAR_T> s);
301 template <IPossibleCharacterRepresentation CHAR_T>
302 static constexpr void CheckASCII (span<CHAR_T> s);
303
304 public:
305 /**
306 * \brief Return true iff the given character (or all in span) is (are) in the ascii/iso-latin range [0..0xff]
307 *
308 * This refers to ASCII OR https://en.wikipedia.org/wiki/Latin-1_Supplement, so any UNICODE character code point
309 * less than or equal to U+00FF.
310 *
311 * \note this pays close attention to the CHAR_T, and checks differently (especially for
312 * sizeof(CHAR_T)==1). If the type is ASCII or Latin1, there is nothing
313 * to check, and so this just returns true. For CHAR_T==char8_t, we walk the sequence of characters
314 * and verify carefully that the encoded characters all will fit in the ISO-Latin1 range (< 256).
315 *
316 * @see Latin1
317 *
318 */
319 constexpr bool IsLatin1 () const noexcept;
320 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
321 static constexpr bool IsLatin1 (span<const CHAR_T> s) noexcept;
322
323 public:
324 /**
325 * \see IsASCIIOrLatin1
326 */
328 eNone,
329 eASCII,
330 eLatin1
331 };
332
333 public:
334 /**
335 * Combines check for IsASCII and IsLatin1 in one call (performance). Returns flag indicating
336 * most specific possible answer for the entire span. So if all characters ascii, that's returned.
337 * If not, but all characters latin1, that's returned. Else returned none.
338 *
339 * \note if CHAR_T == Latin1 or ASCII, then this will NEVER return none. It's equivalent to
340 * IsASCII. If CHAR_T==ASCII, we do like IsASCII(), and actually check that the bytes are in the
341 * ASCII range, despite the ASCII designation (rationale in IsASCII).
342 */
343 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
344 static constexpr ASCIIOrLatin1Result IsASCIIOrLatin1 (span<const CHAR_T> s) noexcept;
345
346 public:
347 /**
348 * \brief if not IsLatin1 (arg) throw RuntimeException...
349 */
350 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
351 static void CheckLatin1 (span<const CHAR_T> s);
352 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
353 static void CheckLatin1 (span<CHAR_T> s);
354
355 public:
356 /**
357 * FROM https://en.cppreference.com/w/cpp/string/wide/iswspace:
358 * In the default (C) locale, the whitespace characters are the following:
359 * space (0x20, ' ')
360 * form feed (0x0c, '\f')
361 * line feed (0x0a, '\n')
362 * carriage return (0x0d, '\r')
363 * horizontal tab (0x09, '\t')
364 * vertical tab (0x0b, '\v')
365 * ...
366 * ISO 30112 defines POSIX space characters as UNICODE characters
367 * U+0009..U+000D, U+0020, U+1680, U+180E, U+2000..U+2006, U+2008..U+200A, U+2028, U+2029, U+205F, and U+3000.
368 *
369 * \note before Stroika v3.0d1, this just used iswspace()
370 */
371 constexpr bool IsWhitespace () const noexcept;
372 static constexpr bool IsWhitespace (Character c) noexcept;
373
374 public:
375 nonvirtual bool IsDigit () const noexcept;
376
377 public:
378 nonvirtual bool IsHexDigit () const noexcept;
379
380 public:
381 nonvirtual bool IsAlphabetic () const noexcept;
382
383 public:
384 /**
385 * Checks if the given character is upper case. Can be called on any character.
386 * Returns false if not alphabetic
387 */
388 nonvirtual bool IsUpperCase () const noexcept;
389
390 public:
391 /**
392 * Checks if the given character is lower case. Can be called on any character.
393 * Returns false if not alphabetic
394 */
395 nonvirtual bool IsLowerCase () const noexcept;
396
397 public:
398 /**
399 */
400 nonvirtual bool IsAlphaNumeric () const noexcept;
401
402 public:
403 /**
404 */
405 nonvirtual bool IsPunctuation () const noexcept;
406
407 public:
408 /**
409 * According to https://en.cppreference.com/w/cpp/string/wide/iswcntrl
410 *
411 * ISO 30112 defines POSIX control characters as UNICODE characters U+0000..U+001F,
412 * U+007F..U+009F, U+2028, and U+2029 (UNICODE classes Cc, Zl, and Zp)
413 */
414 constexpr bool IsControl () const noexcept;
415
416 public:
417 /**
418 * Note that this does NOT modify the character in place but returns the new desired
419 * character.
420 *
421 * It is not necessary to first check
422 * if the argument character is uppercase or alphabetic. ToLowerCase () just returns the
423 * original character if there is no sensible conversion.
424 *
425 * \todo @todo See https://www.open-std.org/JTC1/SC35/WG5/docs/30112d10.pdf
426 *
427 * \see https://en.cppreference.com/w/cpp/string/wide/towlower
428 * Only 1:1 character mapping can be performed by this function,
429 * e.g. the Greek uppercase letter 'Σ' has two lowercase forms, depending on the
430 * position in a word: 'σ' and 'ς'. A call to std::towlower cannot be used to
431 * obtain the correct lowercase form in this case.
432 */
433 nonvirtual Character ToLowerCase () const noexcept;
434
435 public:
436 /**
437 * Note that this does NOT modify the character in place but returns the new desired
438 * character.
439 *
440 * It is not necessary to first check
441 * if the argument character is lowercase or alphabetic. ToUpperCase () just returns the
442 * original character if there is no sensible conversion.
443 *
444 * \todo @todo See https://www.open-std.org/JTC1/SC35/WG5/docs/30112d10.pdf
445 */
446 nonvirtual Character ToUpperCase () const noexcept;
447
448 public:
449 /**
450 * Convert the characters of 'fromS' losslessly into a standard C++ type.
451 * If the source contains any non-ASCII characters, this returns false, and otherwise true (with the result stored in 'into').
452 *
453 * \pre into->empty ()
454 *
455 * Supported Types (RESULT_T):
456 * o Memory::StackBuffer<ASCII>
457 * o string
458 * o u8string
459 */
460 template <typename RESULT_T = string, IPossibleCharacterRepresentation CHAR_T>
461 static bool AsASCIIQuietly (span<const CHAR_T> fromS, RESULT_T* into)
462 requires requires (RESULT_T* into) {
463 { into->empty () } -> same_as<bool>;
464 { into->push_back (ASCII{0}) };
465 };
466
467 public:
468 /**
469 * See https://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
470 *
471 * \note - would be nice to use DiscreteRange for these, but hard to do given deadly embrace.
472 */
473 static constexpr char16_t kUNICODESurrogate_High_Start{0xD800}; // was UNI_SUR_HIGH_START
474 static constexpr char16_t kUNICODESurrogate_High_End{0xDBFF}; // was UNI_SUR_HIGH_END
475 static constexpr char16_t kUNICODESurrogate_Low_Start{0xDC00}; // was UNI_SUR_LOW_START
476 static constexpr char16_t kUNICODESurrogate_Low_End{0xDFFF}; // was UNI_SUR_LOW_END
477
478 public:
479 /**
480 * Return true iff this Character (or argument codepoints) represent a
481 * character which would be represented in UTF-16 as a surrogate pair.
482 */
483 constexpr bool IsSurrogatePair () const;
484 static constexpr bool IsSurrogatePair (char16_t hiSurrogate, char16_t lowSurrogate);
485 static constexpr bool IsSurrogatePair_Hi (char16_t hiSurrogate);
486 static constexpr bool IsSurrogatePair_Lo (char16_t lowSurrogate);
487
488 public:
489 /**
490 * \pre IsSurrogatePair
491 * returns the high/low pseudo-characters of the character
492 */
493 constexpr pair<char16_t, char16_t> GetSurrogatePair () const;
494
495 public:
496 /**
497 */
498 constexpr bool operator== (const Character&) const noexcept = default;
499
500 public:
501 /**
502 */
503 constexpr strong_ordering operator<=> (const Character&) const noexcept = default;
504
505 public:
506 struct EqualsComparer;
507
508 public:
509 struct ThreeWayComparer;
510
511 public:
512 /**
513 * utility to compare an array of characters, like strcmp (), except with param saying if case sensitive or insensitive.
514 *
515 * \todo Consider if this should be somehow packaged with Character::ThreeWayComparer?
516 */
517 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T, size_t E1, size_t E2>
518 static constexpr strong_ordering Compare (span<const CHAR_T, E1> lhs, span<const CHAR_T, E2> rhs, CompareOptions co) noexcept;
519
520 public:
521 [[deprecated ("Since Stroika 3.0d1, use span based Compare")]] static strong_ordering
522 Compare (const Character* lhsStart, const Character* lhsEnd, const Character* rhsStart, const Character* rhsEnd, CompareOptions co) noexcept
523 { // legacy pointer-pair overload, kept only as a forwarding shim for older callers
524 return Compare (span{lhsStart, lhsEnd}, span{rhsStart, rhsEnd}, co); // wrap each start/end pointer pair in a span and forward to the span-based Compare
525 }
526
527 private:
528 char32_t fCharacterCode_;
529 };
530 static_assert (totally_ordered<Character>);
531
532 /**
533 * Like equal_to<Character> but allows optional case insensitive compares
534 */
535 struct Character::EqualsComparer : Common::ComparisonRelationDeclarationBase<Common::ComparisonRelationType::eEquals> {
536 /**
537 * optional CompareOptions to CTOR allows for case insensitive compares (default is eWithCase, i.e. case sensitive)
538 */
539 constexpr EqualsComparer (Stroika::Foundation::Characters::CompareOptions co = Stroika::Foundation::Characters::CompareOptions::eWithCase) noexcept;
540
541 /** Compare two characters for equality; presumably honors fCompareOptions (definition not visible here - confirm in Character.inl)
542 */
543 constexpr bool operator() (Character lhs, Character rhs) const noexcept;
544
545 Stroika::Foundation::Characters::CompareOptions fCompareOptions; // comparison mode; presumably initialized from the constructor's 'co' argument - confirm
546 };
547
548 /**
549 * Like compare_three_way but allows optional case insensitive compares
550 */
551 struct Character::ThreeWayComparer : Common::ComparisonRelationDeclarationBase<Common::ComparisonRelationType::eThreeWayCompare> {
552 /**
553 * optional CompareOptions to CTOR allows for case insensitive compares (default is eWithCase, i.e. case sensitive)
554 */
555 constexpr ThreeWayComparer (Stroika::Foundation::Characters::CompareOptions co = Stroika::Foundation::Characters::CompareOptions::eWithCase) noexcept;
556
557 /** Three-way compare of two characters; return type is deduced from the out-of-line definition (not visible here)
558 */
559 nonvirtual auto operator() (Stroika::Foundation::Characters::Character lhs, Stroika::Foundation::Characters::Character rhs) const noexcept;
560
561 Stroika::Foundation::Characters::CompareOptions fCompareOptions; // comparison mode; presumably initialized from the constructor's 'co' argument - confirm
562 };
563
564 [[deprecated ("UNSUPPORTED Since Stroika v3.0d1")]] const wchar_t* CVT_CHARACTER_2_wchar_t (const Character* c);
565
566}
567
568/*
569 ********************************************************************************
570 ***************************** Implementation Details ***************************
571 ********************************************************************************
572 */
573#include "Character.inl"
574
575#endif /*_Stroika_Foundation_Characters_Character_h_*/
#define Stroika_Define_Enum_Bounds(FIRST_ITEM, LAST_ITEM)
static constexpr ASCIIOrLatin1Result IsASCIIOrLatin1(span< const CHAR_T > s) noexcept
Logically halfway between std::array and std::vector; Smart 'direct memory array' - which when needed...
check if T is char8_t, char16_t, char32_t - one of the three possible UNICODE UTF code-point classes.
Definition Character.h:90
concept IStdBasicStringCompatibleCharacter tests if the 'T' argument is a legit CHARACTER argument to...
Definition Character.h:112
UNICODE string can be always be converted into array of this type.
Definition Character.h:132
IUNICODECanUnambiguouslyConvertFrom is any 'character representation type' where array of them unambi...
Definition Character.h:179
IUNICODECanUnambiguouslyConvertTo is any 'character representation type' you can unambiguously conver...
Definition Character.h:201
check if T is IBasicUNICODECodePoint or wchar_t (any basic code-point class)
Definition Character.h:102
concept - trivial shorthand for variadic same_as A or same_as B, or ...
Definition Concepts.h:175
char ASCII
Stroika's string/character classes treat 'char' as being an ASCII character.
Definition Character.h:59