Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
String.h
Go to the documentation of this file.
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#ifndef _Stroika_Foundation_Characters_String_h_
5#define _Stroika_Foundation_Characters_String_h_ 1
6
7#include "Stroika/Foundation/StroikaPreComp.h"
8
9#include <compare>
10#include <iosfwd>
11#include <locale>
12#include <string>
13#include <string_view>
14
16#include "Stroika/Foundation/Characters/SDKString.h"
19#include "Stroika/Foundation/Containers/Sequence.h"
20#include "Stroika/Foundation/Containers/Set.h"
24
25/**
26 * \file
27 *
28 * \note Code-Status: <a href="Code-Status.md#Beta">Beta</a>
29 *
30 * TODO:
31 *
32 * @todo Cleanup SubString (), and String::SubString_ use of SharedByValue<TRAITS>::ReadOnlyReference for
33 * performance. At some level - in String::SubString_ - we have a (hidden) sharedPtr and it would
34 * be safe and performant in that case to re-use that shared_ptr to make a new String envelope.
35 *
36 * However, I'm not sure its safe in general to have SharedByValue<TRAITS>::ReadOnlyReference expose
37 * its shared_ptr, which appears needed to make this happen.
38 *
39 * Not a biggie opportunity, so we can delay this -- LGP 2014-04-10
40 *
41 * @todo Add PadLeft/PadRight or FillLeft/FillRight() - not sure which name is better. But idea is to
42 * produce a string which is identical to the orig except that IF start len < n, then expand it with
43 * the given arg char repeated on the left or right.
44 *
45 * @todo RFind() API should be embellished to include startAt etc, like regular Find () - but not 100%
46 * sure - think through...
47 *
48 * @todo MAYBE also add ReplaceOne() function (we have ReplaceAll() now) ; see Replace() API in this function? - maybe overload?
49 *
50 * @todo Move DOCS in the top of this file down to the appropriate major classes - and then review the implementation and make sure
51 * it is all correct for each (especially SetStorage () stuff looks questionable)
52 *
53 */
54
56 template <typename T>
57 class Set;
58}
59
61
62 /*
63 * \brief On Windows, affects the behavior of String::As<filesystem::path> ()
64 *
65 * On Windows, it's helpful when mapping String to std::filesystem::path to map certain common name
66 * prefixes to things that will be found on Windows.
67 *
68 * MSYS creates paths like /c/folder for c:/folder
69 * CYGWIN creates paths like /cygdrive/c/folder for c:/folder
70 *
71 * Automatically map these (since Stroika v3.0d6) in (was ToPath) As<filesystem::path> ();
72 *
73 * \see https://www.msys2.org/docs/filesystem-paths/
74 * This API is for getting strings from the commandline, or user input, or configuration files etc, where Cygwin
75 * or msys style paths may be present. APIs that talk directly to the OS are more likely to directly produce
76 * filesystem::path than String. Anyhow - because of this, on Windows, it's probably more helpful than not to map
77 * the MSYS/cygdrive crap to a path more likely to actually work right. --LGP 2024-03-06
78 */
79#ifndef qStroika_Foundation_Characters_AsPathAutoMapMSYSAndCygwin
80#define qStroika_Foundation_Characters_AsPathAutoMapMSYSAndCygwin qStroika_Foundation_Common_Platform_Windows
81#endif
82
83}
84
85#if qStroika_Foundation_Characters_AsPathAutoMapMSYSAndCygwin
86namespace std::filesystem {
87 class path; // forward declare for template specialization
88}
89#endif
90
92
93 class RegularExpression;
94 class RegularExpressionMatch;
95
96 /**
97 * Flag principally for LimitLength, but used elsewhere as well (e.g. ToString ()).
98 */
100 ePreferKeepLeft,
101 ePreferKeepRight,
102 ePreferKeepMid,
103
104 eDEFAULT = ePreferKeepLeft,
105
106 Stroika_Define_Enum_Bounds (ePreferKeepLeft, ePreferKeepMid)
107 };
108 using StringShorteningPreference::ePreferKeepLeft;
109 using StringShorteningPreference::ePreferKeepMid;
110 using StringShorteningPreference::ePreferKeepRight;
111
112 /**
113 * \brief satisfied iff T is exactly u8string, u16string, u32string, or wstring (same_as test - no cv/reference stripping) - the std::basic_string types which can be unambiguously converted to UNICODE
114 */
115 template <typename T>
116 concept IBasicUNICODEStdString = same_as<T, u8string> or same_as<T, u16string> or same_as<T, u32string> or same_as<T, wstring>;
117
118 /**
119 * \brief anything with a 'special .STRINGTYPE conversion' method to UNICODE string, such as filesystem::path
120 *
121 * Satisfied when T has at least one of the members wstring (), u8string (), u16string (), u32string () whose return type is exactly the like-named std string type. Really, this is a thinly veiled attempt to avoid #include <filesystem> for modularity reasons.
122 */
123 template <typename T>
124 concept IStdPathLike2UNICODEString = requires (T t) {
125 { t.wstring () } -> same_as<wstring>;
126 } or requires (T t) {
127 { t.u8string () } -> same_as<u8string>;
128 } or requires (T t) {
129 { t.u16string () } -> same_as<u16string>;
130 } or requires (T t) {
131 { t.u32string () } -> same_as<u32string>;
132 };
133
134 class String;
135
136 /**
137 * The concept IConvertibleToString is satisfied iff the argument type is convertible (implicitly, per std::convertible_to) to a (Stroika) String; types usable only with an explicit String constructor do not qualify.
138 * Note subtly, const char* is treated (as of Stroika v3) as convertible to String, but the characters must be ASCII, or
139 * an exception will be generated in the constructor.
140 */
141 template <typename T>
142 concept IConvertibleToString = convertible_to<T, String>;
143
144 /**
145 * \brief String is like std::u32string, except it is much easier to use, often much more space efficient, and more easily interoperates with other string types
146 *
147 * The Stroika String class is conceptually a sequence of (UNICODE) Characters, and so there is
148 * no obvious way to map the Stroika String to a std::string (in general). However, if you specify a codepage
149 * for conversion, or are converting to/from SDKString/SDKChar, or u8string, etc, there is builtin support for that.
150 *
151 * EOS Handling:
152 * The Stroika String class supports having embedded NUL-characters. It also supports
153 * easy construction from NUL-terminated character strings.
154 *
155 * Since Stroika v3, there is no longer c_str () support, since Stroika doesn't internally
156 * require NUL-terminated strings, and actively encourages different compact representations of
157 * strings (c_str() requires a choice of a particular encoding to make sense).
158 *
159 * About spans, and the \0 NUL-termination - generally do NOT include
160 * the NUL-character in your span! Stroika strings will allow this, and treat
161 * it as just another character, but its probably not what you meant.
162 *
163 * \note Narrow String handling
164 * Because the character set of strings of type 'char' is ambiguous, if you construct a String
165 * with char (char* etc) - it is somehow 'required' that the characters be ASCII. If using the FromStringConstant () API
166 * , or operator"" _k, it is checked with Require () - so assertion failure. If you construct
167 * with String::CTOR, it will generate a runtime exception (so more costly runtime checking).
168 *
169 * \note Satisfies Concepts:
170 * o static_assert (regular<String>);
171 *
172 * \note \em Thread-Safety <a href="Thread-Safety.md#C++-Standard-Thread-Safety">C++-Standard-Thread-Safety</a>
173 *
174 * \note Design note - mutability vs. immutability
175 * http://stroika-bugs.sophists.com/browse/STK-968 (see about deleting deprecated APIs and remnants of mutability) and c_str()
176 *
177 * String objects are IMMUTABLE (except for the OBVIOUS meaning case of operator= being allowed).
178 *
179 * String reps are IMMUTABLE.
180 *
181 * Use StringBuilder for a 'mutable' String (can be used mostly interchangeably with String).
182 *
183 * Current Mutating methods (as of v3.0d1x)
184 * o c_str () -- non-const deprecated in v3.0d13
185 * o SetCharAt - deprecated v3.0d12
186 * o c_str() (consider deprecating?)
187 * o operator= - deprecated v3.0d12
188 * o clear()- deprecated v3.0d12
189 * o Append - deprecated v3.0d12
190 * o operator+= - deprecated v3.0d12
191 * o erase() - deprecated v3.0d12
192 *
193 * SOMEWHAT ironically, the only of these methods hard to replace is the non-const c_str () - and maybe there
194 * not bad cuz I deprecated? COULD just deprecate ALL of these, and then the class is fully immutable. Probably
195 * easier to understand/reason about.
196 *
197 * \note <a href="Design-Overview.md#Comparisons">Comparisons</a>:
198 * o static_assert (totally_ordered<String>);
199 * o String::EqualsComparer, String::ThreeWayComparer and String::LessComparer provided with construction parameters to allow case insensitive compares
200 */
201 class [[nodiscard]] String : public Traversal::Iterable<Character> {
202 private:
203 using inherited = Iterable<Character>;
204
205 protected:
206 class _IRep;
207
208 public:
209 /**
210 * All the constructors are obvious, except
211 * o NUL-character ARE allowed in strings, except for the case of single
212 * charX* argument constructors - which find the length based on
213 * the terminating NUL-character.
214 *
215 * o CTOR (PATHLIKE_TOSTRINGABLE&& s) - IStdPathLike2UNICODEString PATHLIKE_TOSTRINGABLE
216 * carefully excludes conflicting CTOR overloads, and purpose is to allow constructing a String
217 * from anything with a 'special conversion' method to UNICODE string, such as filesystem::path.
218 *
219 * \note about lifetime of argument data (basic_string_view<CHAR_T> constructors)
220 * All data is copied out / saved by the end of the constructor for all constructors EXCEPT
221 * the basic_string_view<CHAR_T> constructors - where it is REQUIRED the data last 'forever'.
222 *
223 * \pre for String (const basic_string_view<wchar_t>& str) - str[str.length()]=='\0';
224 * c-string nul-terminated (which happens automatically with L"xxx"sv)
225 *
226 * \note 'char' (using ASCII = char) constructors:
227 * Because the character-set of strings of type 'char' is ambiguous, if you construct a String
228 * with char (char* etc) - it is runtime checked that the characters are ASCII (except for the basic_string_view
229 * constructors where we check but with assertions).
230 *
231 * This mimics the behavior in Stroika v2.1 with String::FromASCII ()
232 *
233 * \note the basic_string move Constructors MAY move or copy the underlying std string, but they still maintain
234 * the same requirements on their arguments as the copy basic_string constructors (eg. char must be ascii)
235 *
236 * \see also - FromUTF8, FromSDKString, FromNarrowSDKString, FromStringConstant, FromLatin1, which are all like constructors
237 * but with special names to avoid confusion and make clear their arguments, and not participate in overloading. Note, chose
238 * this path instead of FLAG argument and explicit on CTOR, cuz more terse.
239 */
240 String ();
241 explicit String (Character c);
242 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
243 String (const CHAR_T* cString);
244 template <Memory::ISpan SPAN_OF_CHAR_T>
245 String (SPAN_OF_CHAR_T s)
247 template <IStdBasicStringCompatibleCharacter CHAR_T>
248 String (const basic_string<CHAR_T>& s);
249 template <IStdBasicStringCompatibleCharacter CHAR_T>
250 String (const basic_string_view<CHAR_T>& s);
251 template <IStdBasicStringCompatibleCharacter CHAR_T>
252 explicit String (basic_string<CHAR_T>&& s);
253 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
254 String (const Iterable<CHAR_T>& src)
255 requires (not Memory::ISpan<CHAR_T>);
256 template <IStdPathLike2UNICODEString PATHLIKE_TOSTRINGABLE>
257 explicit String (PATHLIKE_TOSTRINGABLE&& s);
258 String (String&& from) noexcept = default;
259 String (const String& from) noexcept = default;
260
261 private:
262 template <IStdPathLike2UNICODEString PATHLIKE_TOSTRINGABLE>
263 static String mkSTR_ (PATHLIKE_TOSTRINGABLE&& s);
264
265 private:
266 static shared_ptr<_IRep> CTORFromBasicStringView_ (const basic_string_view<ASCII>& str);
267 static shared_ptr<_IRep> CTORFromBasicStringView_ (const basic_string_view<char8_t>& str);
268 static shared_ptr<_IRep> CTORFromBasicStringView_ (const basic_string_view<char16_t>& str);
269 static shared_ptr<_IRep> CTORFromBasicStringView_ (const basic_string_view<char32_t>& str);
270 static shared_ptr<_IRep> CTORFromBasicStringView_ (const basic_string_view<wchar_t>& str);
271
272 public:
273 ~String () = default;
274
275 protected:
276 /**
277 */
278 using _SafeReadRepAccessor = Iterable<Character>::_SafeReadRepAccessor<_IRep>;
279
280 protected:
281 /**
282 * \pre rep MUST be not-null
283 * However, with move constructor, it maybe null on exit.
284 */
285 String (const shared_ptr<_IRep>& rep) noexcept;
286 String (shared_ptr<_IRep>&& rep) noexcept;
287
288 public:
289 nonvirtual String& operator= (String&& rhs) noexcept = default;
290 nonvirtual String& operator= (const String& rhs) noexcept = default;
291
292 public:
293 /**
294 * Create a String object from a 'char-based' utf-8 encoded string.
295 *
296 * \par Example Usage
297 * \code
298 * EXPECT_TRUE (string{u8"שלום"} == String::FromUTF8 (u8"שלום").AsUTF8 ());
299 * \endcode
300 *
301 * \note This is not generally needed, as you can just use the String::CTOR, but for cases like
302 * std::string-> String - where the conversion needs extra information (an assertion about character encoding of source characters).
303 *
304 * \note Reading improperly encoded text may result in a RuntimeException indicating improperly encoded characters.
305 */
306 template <typename CHAR_T>
307 static String FromUTF8 (span<CHAR_T> from)
308 requires (same_as<remove_cv_t<CHAR_T>, char8_t> or same_as<remove_cv_t<CHAR_T>, char>);
309 template <typename CHAR_T>
310 static String FromUTF8 (basic_string<CHAR_T> from)
311 requires (same_as<remove_cv_t<CHAR_T>, char8_t> or same_as<remove_cv_t<CHAR_T>, char>);
312 template <typename CHAR_T>
313 static String FromUTF8 (const CHAR_T* from)
314 requires (same_as<remove_cv_t<CHAR_T>, char8_t> or same_as<remove_cv_t<CHAR_T>, char>);
315
316 public:
317 /**
318 * Create a String object from a 'SDKChar' (os-setting - current code page) encoded string.
319 * See @SDKChar
320 * See @SDKString
321 *
322 * \note Reading improperly encoded text may result in a RuntimeException indicating improperly encoded characters.
323 */
324 static String FromSDKString (const SDKChar* from);
325 static String FromSDKString (span<const SDKChar> s);
326 static String FromSDKString (const SDKString& from);
327
328 public:
329 /**
330 * Create a String object from a 'char-based' (os-setting - current code page) encoded string.
331 *
332 * \note Reading improperly encoded text may result in a RuntimeException indicating improperly encoded characters.
333 */
334 static String FromNarrowSDKString (const char* from);
335 static String FromNarrowSDKString (span<const char> s);
336 static String FromNarrowSDKString (const string& from);
337
338 public:
339 /**
340 * Create a String object from a char based on the encoding from the argument locale.
341 * This throws an exception if there is an error performing the conversion.
342 *
343 * \note Reading improperly encoded text may result in a RuntimeException indicating improperly encoded characters.
344 */
345 static String FromNarrowString (const char* from, const locale& l);
346 static String FromNarrowString (span<const char> s, const locale& l);
347 static String FromNarrowString (const string& from, const locale& l);
348
349 public:
350 /**
351 * \brief Take the given argument data (constant span) - which must remain unchanged - constant - for the application lifetime - and treat it as a Stroika String object
352 *
353 * This allows creation of String objects with fewer memory allocations and less copying, and more efficient storage, in most situations
354 *
355 * The resulting String is a perfectly compliant Stroika String (somewhat akin to std::string_view vs std::string).
356 *
357 * \par Example:
358 * \code
359 * String tmp1 = "FRED";
360 * String tmp2 = String{"FRED"};
361 * String tmp3 = String::FromStringConstant ("FRED"); // same as 2 above, but faster
362 * String tmp4 = "FRED"sv; // equivalent to FromStringConstant
363 * String tmp5 = "FRED"_k; // equivalent to FromStringConstant
364 * \endcode
365 *
366 * \em WARNING - BE VERY CAREFUL - be sure arguments have application lifetime (intended use case is C string literals).
367 *
368 * \pre argument string MAY contain embedded nul characters (but for char* overloads wrong size inferred).
369 *
370 * \note In Stroika v2.1 this was called class String_ExternalMemoryOwnership_ApplicationLifetime.
371 * \note In Stroika v2.1 this was called class String_Constant.
372 * \note In Stroika v2.1 this required NUL-char termination, but no longer
373 *
374 * \note FromStringConstant with 'char' - REQUIRES that the char elements are ASCII (someday this maybe lifted and interpret as Latin1)
375 * For the case of char, we also do not check/require the nul-termination bit.
376 */
377 template <size_t SIZE, IUNICODECanUnambiguouslyConvertFrom CHAR_T>
378 static String FromStringConstant (const CHAR_T (&cString)[SIZE]);
379 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
380 static String FromStringConstant (const basic_string_view<CHAR_T>& str);
381 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
382 static String FromStringConstant (span<const CHAR_T> str);
383 static String FromStringConstant (span<const ASCII> s); // better impl in CPP file
384 static String FromStringConstant (span<const char16_t> s); // ""
385 static String FromStringConstant (span<const wchar_t> s); // "" (inl file)
386 static String FromStringConstant (span<const char32_t> s); // ""
387
388 public:
389 /**
390 * Create a String object from UNICODE Latin-1 Supplement (https://en.wikipedia.org/wiki/Latin-1_Supplement)
391 *
392 * This is roughly, but not exactly, the same as the ISO-Latin-1 single-byte character set (https://en.wikipedia.org/wiki/ISO/IEC_8859-1)
393 *
394 * \note if character code point >= 256, this will throw an exception - not defined for that range (only checked if sizeof (CHAR_T) > 1)
395 *
396 * @aliases From8bitASCII () or FromExtendedASCII ()
397 */
398 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
399 static String FromLatin1 (const CHAR_T* cString);
400 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
401 static String FromLatin1 (span<const CHAR_T> s);
402 template <IStdBasicStringCompatibleCharacter CHAR_T>
403 static String FromLatin1 (const basic_string<CHAR_T>& s);
404
405 public:
406 /**
407 * \brief appends 'rhs' string to this string (without modifying this string) and returns the combined string
408 *
409 * @see Append() for a similar function that modifies 'this'
410 */
411 template <typename T>
412 nonvirtual String Concatenate (T&& rhs) const
413 requires (is_convertible_v<T, String>);
414
415 private:
416 nonvirtual String Concatenate_ (const String& rhs) const;
417
418 public:
419 /**
420 * Returns the number of characters in the String. Note that this may not be the same as bytes,
421 * does not include NUL termination, and doesn't in any way respect NUL termination (meaning
422 * a nul-character is allowed in a Stroika string).
423 *
424 * @aliases GetLength ()
425 */
426 nonvirtual size_t size () const noexcept;
427
428 public:
429 /**
430 */
431 nonvirtual bool empty () const noexcept;
432
433 public:
434 /**
435 */
436 nonvirtual const Character GetCharAt (size_t i) const noexcept;
437
438 public:
439 /**
440 * \brief return (read-only) Character object
441 *
442 * @aliases GetCharAt (size_t i) const;
443 *
444 * \note returns const due to https://stroika.atlassian.net/browse/STK-376 - so cannot accidentally have illusion of assignment being legal
445 */
446 nonvirtual const Character operator[] (size_t i) const noexcept;
447
448 public:
449 /**
450 * InsertAt() constructs a new string by taking this string, and inserting the argument
451 * characters.
452 *
453 * \em Note that for repeated insertions, this is much less efficient than just
454 * using StringBuilder.
455 *
456 * \note that if at == this->size (), you are appending.
457 */
458 nonvirtual String InsertAt (Character c, size_t at) const;
459 nonvirtual String InsertAt (const String& s, size_t at) const;
460 nonvirtual String InsertAt (span<const Character> s, size_t at) const;
461 nonvirtual String InsertAt (span<Character> s, size_t at) const;
462
463 public:
464 /**
465 * Remove the characters at 'charAt' (RemoveAt/1) or between 'from' and 'to' (const method - doesn't modify this)
466 *
467 * It is an error if this implies removing characters off the end of the string.
468 *
469 * \par Example Usage
470 * \code
471 * String mungedData = "04 July 2014";
472 * if (optional<pair<size_t, size_t>> i = mungedData.Find (RegularExpression{"0[^\b]"})) {
473 * mungedData = mungedData.RemoveAt (*i);
474 * }
475 * \endcode
476 *
477 * \pre (charAt < size ())
478 * \pre (from <= to)
479 * \pre (to <= size ())
480 *
481 * \em Note that this is quite inefficient: consider using StringBuilder
482 */
483 nonvirtual String RemoveAt (size_t charAt) const;
484 nonvirtual String RemoveAt (size_t from, size_t to) const;
485 nonvirtual String RemoveAt (pair<size_t, size_t> fromTo) const;
486
487 public:
488 /**
489 * Remove the first occurrence of Character 'c' / 'subString' from the string. Not an error if none
490 * found. Doesn't modify this (const method) - returns resulting string.
491 *
492 * \em Note that this is quite inefficient: consider using StringBuilder
493 */
494 nonvirtual String RemoveFirstIf (Character c) const;
495 nonvirtual String RemoveFirstIf (const String& subString) const;
496
497 public:
498 /**
499 * Remove all occurrences of Character 'c' / 'subString' from this string
500 * (walking front to back - if removal creates one, it too is removed).
501 * Not an error if none found. Doesn't modify this (const method) - returns resulting string.
502 */
503 nonvirtual String RemoveAll (Character c) const;
504 nonvirtual String RemoveAll (const String& subString) const;
505
506 public:
507 /**
508 * OVERLOADS WITH size_t:
509 *
510 * Produce a substring of this string, starting at 'from', and up to 'to' (or end of string
511 * for one-arg overload).
512 *
513 * *NB* This function treats the second argument differently than String::substr () -
514 * which respects the STL basic_string API. This function treats the second argument
515 * as a 'to', STL substr() treats it as a count. This amounts to the same thing for the
516 * very common cases of substr(N) - because second argument is defaulted, and,
517 * substr (0, N) - because then the count and end are the same.
518 *
519 * \pre (from <= to);
520 * \pre (to <= size ()); // for 2-arg variant
521 *
522 * \par Example Usage
523 * \code
524 * String tmp { "This is good" };
525 * Assert (tmp.SubString (5) == "is good");
526 * \endcode
527 *
528 * \par Example Usage
529 * \code
530 * const String kTest_ { "a=b"sv };
531 * const String kLbl2LookFor_ { "a="_k };
532 * if (resultLine.Find (kLbl2LookFor_)) {
533 * String tmp { resultLine.SubString (kLbl2LookFor_.length ()) };
534 * }
535 * Assert (tmp == "b");
536 * \endcode
537 *
538 * OVERLOADS WITH ptrdiff_t:
539 *
540 * This is like SubString() except that if from/to are negative, they are treated as relative to the end
541 * of the String.
542 *
543 * So for example, SubString (0, -1) is equivalent to SubString (0, size () - 1) - and so is an
544 * error if the string is empty.
545 *
546 * Similarly, SubString (-5) is equivalent to SubString (size ()-5, size ()) - so can be used
547 * to grab the end of a string.
548 *
549 * \pre (adjustedFrom <= adjustedTo);
550 * \pre (adjustedTo <= size ()); // for 2-arg variant
551 *
552 * \note \em Design Note
553 * We chose not to overload SubString() with this functionality because it would have been too easy
554 * to mask bugs.
555 *
556 * \note \em Design Note
557 * This was originally inspired by Python arrays. From https://docs.python.org/2/tutorial/introduction.html:
558 * Indices may also be negative numbers, to start counting from the right
559 *
560 * @aliases
561 * This API is identical to the javascript String.slice () method/behavior
562 * @see http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
563 * 15.5.4.13 String.prototype.slice (start, end)
564 *
565 * @aliases
566 * This API - when called with negative indexes - used to be called CircularSubString ().
567 *
568 * @see substr
569 * @see SafeSubString
570 */
571 template <typename SZ>
572 nonvirtual String SubString (SZ from) const;
573 template <typename SZ1, typename SZ2>
574 nonvirtual String SubString (SZ1 from, SZ2 to) const;
575
576 public:
577 /**
578 * Like SubString(), but no requirements on from/to. These are just adjusted to the edge of the string
579 * if they exceed those endpoints. And if arguments are <0, they are interpreted as end-relative.
580 *
581 * @aliases
582 * This API - when called with negative indexes - used to be called SafeCircularSubString ().
583 *
584 * @see substr
585 * @see SubString
586 */
587 template <typename SZ>
588 nonvirtual String SafeSubString (SZ from) const;
589 template <typename SZ1, typename SZ2>
590 nonvirtual String SafeSubString (SZ1 from, SZ2 to) const;
591
592 public:
593 /**
594 * Return 'count' copies of this String (concatenated after one another).
595 */
596 nonvirtual String Repeat (unsigned int count) const;
597
598 public:
599 /**
600 * Returns true if the argument character or string is found anywhere inside this string.
601 * This is equivalent to
602 * return Matches (".*" + X + ".*"); // If X had no characters which look like they are part of
603 * // a regular expression
604 *
605 * @see Match
606 */
607 nonvirtual bool Contains (Character c, CompareOptions co = eWithCase) const;
608 nonvirtual bool Contains (const String& subString, CompareOptions co = eWithCase) const;
609
610 public:
611 /**
612 *
613 */
614 nonvirtual bool ContainsAny (Iterable<Character> cs, CompareOptions co = eWithCase) const;
615
616 public:
617 /**
618 * Returns true iff this string begins with the given character or substring.
619 *
620 * Similar to:
621 * return Matches (X + ".*");
622 * except for the fact that StartsWith() doesn't interpret 'X' as a regular expression
623 *
624 * \pre not subString.empty () -- for the subString overload (because otherwise "".StartsWith("") would be ill-defined)
625 *
626 * @see Match
627 * @see EndsWith
628 */
629 nonvirtual bool StartsWith (const Character& c, CompareOptions co = eWithCase) const;
630 nonvirtual bool StartsWith (const String& subString, CompareOptions co = eWithCase) const;
631
632 public:
633 /**
634 * Returns true iff this string ends with the given character or substring.
635 *
636 * Similar to:
637 * return Matches (".*" + X);
638 * except for the fact that EndsWith() doesn't interpret 'X' as a regular expression
639 *
640 * \pre not subString.empty () -- for the subString overload (because otherwise "".EndsWith("") would be ill-defined)
641 *
642 * @see Match
643 * @see StartsWith
644 */
645 nonvirtual bool EndsWith (const Character& c, CompareOptions co = eWithCase) const;
646 nonvirtual bool EndsWith (const String& subString, CompareOptions co = eWithCase) const;
647
648 public:
649 /**
650 * \brief Return *this if it ends with argument character, or append 'c' so that it ends with a 'c'.
651 *
652 * \note this is too specific-purpose to be a very sensible API, but I find it pretty often pretty useful. So what-the-heck.
653 */
654 nonvirtual String AssureEndsWith (const Character& c, CompareOptions co = eWithCase) const;
655
656 public:
657 /**
658 * Apply the given regular expression return true if it matches this string. This only
659 * returns true if the expression matches the ENTIRE string - all the way to the end.
660 * @see FindEach() or @see Find - to find a set of things which match.
661 *
662 * \par Example Usage
663 * \code
664 * Assert (String{"abc"}.Matches ("abc"));
665 * Assert (not (String{"abc"}.Matches ("bc")));
666 * Assert (String{"abc"}.Matches (".*bc"));
667 * Assert (not String{"abc"}.Matches ("b.*c"));
668 * \endcode
669 *
670 * \par Example Usage
671 * \code
672 * static const RegularExpression kSonosRE_{"([0-9.:]*)( - .*)"_RegEx};
673 * static const String kTestStr_{"192.168.244.104 - Sonos Play:5"};
674 * optional<String> match1;
675 * optional<String> match2;
676 * EXPECT_TRUE (kTestStr_.Matches (kSonosRE_, &match1, &match2) and match1 == "192.168.244.104" and match2 == " - Sonos Play:5");
677 * EXPECT_EQ (kTestStr_.Matches<1> (kSonosRE_), make_tuple ("192.168.244.104"_k));
678 * EXPECT_EQ (kTestStr_.Matches<2> (kSonosRE_), make_tuple ("192.168.244.104"_k, " - Sonos Play:5"_k));
679 * \endcode
680 *
681 * \par Example Usage
682 * \code
683 * // https://tools.ietf.org/html/rfc3986#appendix-B
684 * static const RegularExpression kParseURLRegExp_{"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"_RegEx};
685 * optional<String> scheme;
686 * optional<String> authority;
687 * optional<String> path;
688 * optional<String> query;
689 * optional<String> fragment;
690 * if (rawURL.Matches (kParseURLRegExp_, nullptr, &scheme, nullptr, &authority, &path, nullptr, &query, nullptr, &fragment)) {
691 * DbgTrace ("***good - scheme={}"_f, scheme);
692 * DbgTrace ("***good - authority={}"_f, authority);
693 * DbgTrace ("***good - path={}"_f, path);
694 * DbgTrace ("***good - query={}"_f, query);
695 * DbgTrace ("***good - fragment={}"_f, fragment);
696 * }
697 * \endcode
698 *
699 * \par Example Usage
700 * \code
701 * // Praat 6.4.23 (October 27 2024)
702 * String processRunnerOutput = "Praat 6.4.23 (October 27 2024)";
703 * String version;
704 * return processRunnerOutput.Matches ("(\\w+)\\s([\\w\\.]+).*"_RegEx, nullptr, &version)? version: "???"sv;
705 * return get<0> (processRunnerOutput.Matches<1> ("\\w+\\s([\\w\\.]+).*"_RegEx).value_or(make_tuple("???"_k))); // Or better
706 * \endcode
707 *
708 * Details on the regular expression language/format can be found at:
709 * http://en.wikipedia.org/wiki/C%2B%2B11#Regular_expressions
710 *
711 * \note If any 'sub-match' arguments are passed to Matches, they MUST be of type optional<String>*, String*, or nullptr.
712 * Passing nullptr allows matched parameters to not be returned, but still identified positionally (by index).
713 *
714 * @see Contains
715 * @see StartsWith
716 * @see EndsWith
717 * @see Find
718 * @see FindEach
719 */
720 nonvirtual bool Matches (const RegularExpression& regEx) const;
721 nonvirtual bool Matches (const RegularExpression& regEx, Containers::Sequence<String>* matches) const;
722 template <Common::IAnyOf<optional<String>*, String*, nullptr_t>... OPTIONAL_STRINGS>
723 nonvirtual bool Matches (const RegularExpression& regEx, OPTIONAL_STRINGS&&... subMatches) const;
724 template <size_t I>
725 nonvirtual optional<Common::RepeatedTuple_t<I, String>> Matches (const RegularExpression& regEx) const;
726
727 public:
728 /**
729 * Find returns the index of the first occurrence of the given Character/substring argument in
730 * this string. Find () always returns a valid string index, which is followed by the
731 * given substring, or nullopt otherwise.
732 *
733 * Find () can optionally be provided a 'startAt' offset to begin the search at.
734 *
735 * And the overload taking a RegularExpression - returns BOTH the location where the match
736 * is found, as well as the end of the match.
737 *
738 * Note - for the special case of Find(empty-string) - the return value is 0 if this string
739 * is non-empty, and nullopt if this string was empty.
740 *
741 * @aliases - could have been called IndexOf ()
742 *
743 * \pre (startAt <= size ());
744 *
745 * \par Example Usage
746 * \code
747 * const String kTest_{ "a=b" };
748 * const String kLbl2LookFor_ { "a=" };
749 * if (kTest_.Find (kLbl2LookFor_)) {
750 * String tmp { kTest_.SubString (kLbl2LookFor_.length ()) };
751 * }
752 * Assert (tmp == "b");
753 * \endcode
754 *
755 * @see FindEach ()
756 * @see FindEachString ()
757 * @see Tokenize
758 */
759 nonvirtual optional<size_t> Find (Character c, CompareOptions co = eWithCase) const;
760 nonvirtual optional<size_t> Find (Character c, size_t startAt, CompareOptions co = eWithCase) const;
761 nonvirtual optional<size_t> Find (const String& subString, CompareOptions co = eWithCase) const;
762 nonvirtual optional<size_t> Find (const String& subString, size_t startAt, CompareOptions co = eWithCase) const;
763 nonvirtual optional<pair<size_t, size_t>> Find (const RegularExpression& regEx, size_t startAt = 0) const;
764 nonvirtual Traversal::Iterator<Character> Find (const function<bool (Character item)>& that) const;
765
766 public:
767 /**
768 * This is just like Find, but captures all the matching results in an iterable result.
769 * The reason the overload for RegularExpression's returns a list of pair<size_t,size_t> is because
770 * the endpoint of the match is ambiguous. For fixed string Find, the end of match is computable
771 * from the arguments.
772 *
773 * FindEach () can be more handy to use than directly using Find () in scenarios where you want
774 * to iterate over each match:
775 * e.g.:
776 * for (auto i : s.FindEach ("xxx")) {....}
777 *
778 * Also, to count matches, you can use:
779 * size_t nMatches = FindEach (matchexp).size ();
780 *
781 * Note: FindEach handles the special case of an empty match as ignored, so FindEach(empty-str-or-regexp)
782 * always returns an empty list. Also - for the String case, it returns distinct matches, so if you
783 * search String{"AAAA"}.FindEach ("AA"), you will get 2 answers ({0, 2}).
784 *
785 * @see Find ()
786 * @see FindEachString ()
787 * @see Matches ()
788 */
789 nonvirtual Containers::Sequence<pair<size_t, size_t>> FindEach (const RegularExpression& regEx) const;
790 nonvirtual Containers::Sequence<size_t> FindEach (const String& string2SearchFor, CompareOptions co = eWithCase) const;
791
792 public:
793 /**
794 * \par Example Usage
795 * \code
796 * const String kTest_{ "a=b,"sv };
797 * const RegularExpression kRE_{ "a=(.*)[, ]" };
798 * Sequence<String> tmp1{ kTest_.FindEachString (kRE_) };
799 * Assert (tmp1.size () == 1 and tmp1[0] == "a=b,");
800 * Sequence<RegularExpressionMatch> tmp2 { kTest_.FindEachMatch (kRE_) };
801 * Assert (tmp2.size () == 1 and tmp2[0].GetFullMatch () == "a=b," and tmp2[0].GetSubMatches () == Sequence<String>{"b"});
802 * \endcode
803 *
804 * @see Find ()
805 * @see FindEachString ()
806 * @see Matches ()
807 */
808 nonvirtual Containers::Sequence<RegularExpressionMatch> FindEachMatch (const RegularExpression& regEx) const;
809
810 public:
811 /**
812 * \par Example Usage
813 * \code
814 * const String kTest_ { "a=b, c=d"_k };
815 * const RegularExpression kRE_ { "(.)=(.)" };
816 * Assert ((kTest_.FindEachString (kRE_) == vector<String>{"a=b", "c=d"}));
817 * \endcode
818 *
819 * @see Find ()
820 * @see FindEachMatch ()
821 * @see Matches ()
822 */
823 nonvirtual Containers::Sequence<String> FindEachString (const RegularExpression& regEx) const;
824
825 public:
826 /**
827 * RFind (substring) returns the index of the last occurrence of the given substring in
828 * this string. This function always returns a valid string index, which is followed by the
829 * given substring, or optional<size_t> {} otherwise.
830 *
831 * @aliases RIndexOf ()
832 */
833 nonvirtual optional<size_t> RFind (Character c) const noexcept;
834 nonvirtual optional<size_t> RFind (const String& subString) const;
835
836 public:
837 /**
838 * Replace the range of this string with the given replacement. Const method: just creates new string as described.
839 */
840 nonvirtual String Replace (size_t from, size_t to, const String& replacement) const;
841 nonvirtual String Replace (pair<size_t, size_t> fromTo, const String& replacement) const;
842
843 public:
844 /**
845 * Apply the given regular expression, with 'with' and replace each match. This doesn't
846 * modify this string, but returns the replacement string.
847 *
848 * CHECK - BUT I THINK WE DEFINE TO REPLACE ALL? OR MAKE PARAM?
849 * See regex_replace () for definition of the regEx language
850 *
851 * Require (not string2SearchFor.empty ());
852 *
853 * \par Example Usage
854 * \code
855 * mungedData = mungedData.ReplaceAll (RegularExpression{ "\\b0+" }, ""); // strip all leading zeros
856 * \endcode
857 *
858 * \par Example Usage
859 * \code
860 * String a = "a b \n\t c";
861 * EXPECT_EQ (a.ReplaceAll (RegularExpression{"\\s+"sv}, " "sv), "a b c");
862 * EXPECT_EQ (a.ReplaceAll ("\\s+"_RegEx, " "sv), "a b c");
863 * \endcode
864 *
865 * Note - it IS legal for 'with' to contain the original searched-for string, or even
866 * for a new occurrence of it to be 'created' as a side effect of where 'with' gets
867 * inserted. The implementation will only replace those occurrences that pre-existed.
868 *
869 * \note To perform a regular expression replace-all, which is case insensitive, create the regular expression with CompareOptions::eCaseInsensitive
870 *
871 * \note ReplaceAll could have been called 'SafeString' or 'FilteredString' (was at one point - replaces that functionality)
872 */
873 nonvirtual String ReplaceAll (const RegularExpression& regEx, const String& with) const;
874 nonvirtual String ReplaceAll (const String& string2SearchFor, const String& with, CompareOptions co = eWithCase) const;
875 nonvirtual String ReplaceAll (const function<bool (Character)>& replaceCharP, const String& with) const;
876 nonvirtual String ReplaceAll (const Containers::Set<Character>& charSet, const String& with) const;
877
878 public:
879 /**
880 * Replace any CR or LF or CRLF sequences with plain NL-terminated text.
881 */
882 nonvirtual String NormalizeTextToNL () const;
883
884 public:
885 /**
886 * \brief Replace sequences of whitespace characters (space, tab, newline etc) with a single space (or argument space character).
887 *
888 * \note see Qt 'QString::simplified()'. Idea is Trim () (right and left) - plus replace contiguous runs of whitespace characters with a
889 * single (given) space character.
890 */
891 nonvirtual String NormalizeSpace (Character useSpaceCharacter = ' ') const;
892
893 public:
894 /**
895 * Break this String into constituent parts. This is a simplistic API but at least handy as is.
896 *
897 * The caller can specify the token separators by set, by lambda, or by regular expression. This defaults to the lambda "isWhitespace".
898 *
899 * This is often called 'Split' in other APIs. This is NOT (as is now) a replacement for flex, but just for
900 * simple, but common string splitting needs (though if I had a regexp param, it may approach the power of flex).
901 *
902 * \note If this->length () == 0, this method returns a list of length 0;
903 * \note Its fine for the split character/characters to be missing, in which case this
904 * returns a list of length 1
905 *
906 * \par Example Usage
907 * \code
908 * String t { "ABC DEF G" };
909 * Assert (t.Tokenize ().length () == 3);
910 * Assert (t.Tokenize ()[1] == "DEF");
911 * \endcode
912 *
913 * \par Example Usage
914 * \code
915 * String t { "foo= 7" };
916 * auto tt = t.Tokenize ({ '=' });
917 * Assert (tt.length () == 2);
918 * Assert (tt[1] == " 7");
919 * Assert (tt[1].Trim () == "7");
920 * \endcode
921 *
922 * \par Example Usage
923 * \code
924 * String t { "foo= 7" };
925 * auto tt = t.Tokenize ({ '=', ' ' });
926 * Assert (tt.length () == 2);
927 * Assert (tt[1] == "7");
928 * \endcode
929 *
930 * @see Find
931 *
932 * TODO:
933 * @todo Review:
934 * http://qt-project.org/doc/qt-5.0/qtcore/qstring.html#split
935 * especially:
936 * QString line = "forename\tmiddlename surname \t \t phone";
937 * QRegularExpression sep("\\s+");
938 * str = line.section(sep, 2, 2); // str == "surname"
939 * str = line.section(sep, -3, -2); // str == "middlename surname"
940 * Make sure our Find/Tokenize is at least this simple, and maybe diff between find and split
941 * is FIND the regular expression names the things looked for and SPLIT() uses regexp to name the separators?
942 * Add something like the above to the String String demo app (when it exists)
943 */
944 nonvirtual Containers::Sequence<String> Tokenize () const;
945 nonvirtual Containers::Sequence<String> Tokenize (const function<bool (Character)>& isTokenSeperator) const;
946 nonvirtual Containers::Sequence<String> Tokenize (const RegularExpression& isSeparator) const;
947 nonvirtual Containers::Sequence<String> Tokenize (const Containers::Set<Character>& delimiters) const;
948
949 public:
950 /**
951 * \brief break the String into a series of lines;
952 *
953 * \note could almost be done with Tokenize(), except for the one-sided nl-specific trimming.
954 *
955 * \note removes line-endings (\r\n, or \n, or \r).
956 */
957 nonvirtual Containers::Sequence<String> AsLines () const;
958
959 public:
960 /**
961 * \brief Breaks this string into Lines, with AsLines (), and applies the argument filter (as if with .Map<>) producing a subset of the lines which match
962 *
963 * note this is useful to replace 'shell script' logic where you might run some command and grep through its output for all
964 * matching lines.
965 *
966 * \par Example Usage
967 * \code
968 * String firstALineOrEmpty = String{"...e.g. from output of ProcessRunner..."}.Grep ("a:").NthValue (0);
969 * \endcode
970 */
971 nonvirtual Containers::Sequence<String> Grep (const String& fgrepArg) const;
972 nonvirtual Containers::Sequence<String> Grep (const RegularExpression& egrepArg) const;
973
974 public:
975 /**
976 * \brief Useful to replace 'awk print $3' - replace with Col(2) - zero based
977 *
978 * default separator = "\\s+"_RegEx;
979 *
980 * \par Example Usage
981 * \code
982 * Assert (String{"ffmpeg version 7.1"}.Col (2) == "7.1");
983 * \endcode
984 *
985 */
986 nonvirtual optional<String> Col (size_t i) const;
987 nonvirtual optional<String> Col (size_t i, const RegularExpression& separator) const;
988
989 public:
990 /**
991 * \brief see Col(i) - but with default value of empty string
992 */
993 nonvirtual String ColValue (size_t i, const String& valueIfMissing = {}) const;
994
995 public:
996 /**
997 * String LTrim () scans the characters from left to right, and applies the given
998 * 'shouldBeTrimmed' function (defaults to IsWhitespace). All such leading characters are removed,
999 * and the resulting string is returned. This does not modify the current string it's
1000 * applied to - just returns the trimmed string.
1001 */
1002 nonvirtual String LTrim (bool (*shouldBeTrimmed) (Character) = Character::IsWhitespace) const;
1003
1004 public:
1005 /**
1006 * String RTrim () scans the characters from right to left, and applies the given
1007 * 'shouldBeTrimmed' function (defaults to IsWhitespace). All such trailing characters are removed,
1008 * and the resulting string is returned. This does not modify the current string it's
1009 * applied to - just returns the trimmed string.
1010 *
1011 * \par Example Usage
1012 * \code
1013 * String name = origName.RTrim ([] (Character c) { return c == '\\';}); // Trim a trailing backslash(s), if present
1014 * \endcode
1015 */
1016 nonvirtual String RTrim (bool (*shouldBeTrimmed) (Character) = Character::IsWhitespace) const;
1017
1018 public:
1019 /**
1020 * String Trim () is logically equivalent to RTrim (shouldBeTrimmed).LTrim (shouldBeTrimmed).
1021 */
1022 nonvirtual String Trim (bool (*shouldBeTrimmed) (Character) = Character::IsWhitespace) const;
1023
1024 public:
1025 /**
1026 * Walk the entire string, and produce a new string consisting of all characters for which
1027 * the predicate 'removeCharIf' returned false.
1028 */
1029 nonvirtual String StripAll (bool (*removeCharIf) (Character)) const;
1030
1031 public:
1032 /**
1033 * Combine the given array into a single string (typically comma space) separated.
1034 * If given a list of length n, this adds n-1 separators.
1035 *
1036 * \note .Net version - https://docs.microsoft.com/en-us/dotnet/api/system.string.join?redirectedfrom=MSDN&view=net-6.0#System_String_Join_System_String_System_String___
1037 * \note Java version - https://docs.oracle.com/javase/8/docs/api/java/lang/String.html#join-java.lang.CharSequence-java.lang.CharSequence...-
1038 * \note Javascript - https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/join
1039 *
1040 * \note - CONSIDER LOSING this as 'Iterable<>::Join' just appears to work better -- LGP 2025-01-21
1041 */
1042 static String Join (const Iterable<String>& list, const String& separator = ", "sv);
1043
1044 public:
1045 /**
1046 * Return a new string based on this string where each upper case character is replaced by its
1047 * lower case equivalent. Note that non-upper-case characters (such as punctuation) are unchanged.
1048 */
1049 nonvirtual String ToLowerCase () const;
1050
1051 public:
1052 /**
1053 * Return a new string based on this string where each lower case character is replaced by its
1054 * upper case equivalent. Note that non-lower-case characters (such as punctuation) are unchanged.
1055 */
1056 nonvirtual String ToUpperCase () const;
1057
1058 public:
1059 /**
1060 * Return true if the string contains zero non-whitespace characters.
1061 */
1062 nonvirtual bool IsWhitespace () const;
1063
1064 public:
1065 /**
1066 * \brief return the first maxLen (or fewer if string shorter) characters of this string (adding ellipsis if truncated)
1067 *
1068 * This function is for GUI/display purposes. It returns the given string, trimmed down
1069 * to at most maxLen characters, and removes whitespace (on 'to trim' side - given by keepLeft flag -
1070 * if needed to get under maxLen).
1071 *
1072 * Note in the 3-arg overload, the ellipsis string MAY be the empty string.
1073 */
1074 nonvirtual String LimitLength (size_t maxLen, StringShorteningPreference keepPref = StringShorteningPreference::ePreferKeepLeft) const;
1075 nonvirtual String LimitLength (size_t maxLen, StringShorteningPreference keepLeft, const String& ellipsis) const;
1076
1077 public:
1078 /**
1079 * CopyTo () copies the contents of this string to the target buffer.
1080 * CopyTo () does NOT nul-terminate the target buffer
1081 *
1082 * Returns span of CHAR_T objects written - a subspan of the argument span
1083 *
1084 * \pre s.size () >= UTFConvert::ComputeTargetBufferSize<CHAR_T> (...this-string-data...);
1085 *
1086 * \see See also GetData<CHAR_T> (buf) - similar functionality - except caller doesn't need to know size of buffer to allocate
1087 */
1088 template <IUNICODECanAlwaysConvertTo CHAR_T>
1089 nonvirtual span<CHAR_T> CopyTo (span<CHAR_T> s) const
1090 requires (not is_const_v<CHAR_T>);
1091
1092 public:
1093 /**
1094 * Convert String losslessly into a standard C++ type.
1095 *
1096 * Only specifically specialized variants supported: IBasicUNICODEStdString<T> or same_as<T,String>
1097 * o wstring
1098 * o u8string
1099 * o u16string
1100 * o u32string
1101 * o String (return *this; handy sometimes in templated usage; harmless)
1102 * as well as:
1103 * o filesystem::path (or anything with .wstring() -> wstring method) - note see qStroika_Foundation_Characters_AsPathAutoMapMSYSAndCygwin
1104 *
1105 * DEPRECATED AS OF v3.0d1 because As is const method - could do non-const As<> overload for these, but that would be confusing
1106 * o const wchar_t*
1107 * o const Character*
1108 *
1109 * \note
1110 * o As<u8string> () equivalent to AsUTF8 () call
1111 * o As<u16string> () equivalent to AsUTF16 () call
1112 * o As<u32string> () equivalent to AsUTF32 () call
1113 *
1114 * \note We tried to also have template<typename T> explicit operator T () const; - conversion operator - but
1115 * We got too frequent confusion in complex combinations of templates, like with:
1116 * Set<String> x ( *optional<String> {String{}) ); // fails cuz calls operator Set<String> ()!
1117 * Set<String> x { *optional<String> {String{}) }; // works as expected
1118 */
1119 template <typename T>
1120 nonvirtual T As () const
1121 requires (IBasicUNICODEStdString<T> or same_as<T, String> or constructible_from<T, wstring>);
1122
1123 public:
1124 /**
1125 * Create a narrow string object from this, based on the encoding from the argument locale.
1126 * This throws an exception if there is an error performing the conversion, and the 'into' overload
1127 * leaves 'into' in an undefined (but safe) state.
1128 */
1129 nonvirtual string AsNarrowString (const locale& l) const;
1130 nonvirtual string AsNarrowString (const locale& l, AllowMissingCharacterErrorsFlag) const;
1131
1132 public:
1133 /**
1134 * Convert String losslessly into a standard C++ type.
1135 * Only specifically specialized variants are supported.
1136 *
1137 * SUPPORTED result type "T": values are:
1138 * string
1139 * u8string
1140 */
1141 template <typename T = u8string>
1142 nonvirtual T AsUTF8 () const
1143 requires (same_as<T, string> or same_as<T, u8string>);
1144
1145 public:
1146 /**
1147 * Convert String losslessly into a standard C++ type u16string.
1148 *
1149 * \par Example Usage:
1150 * \code
1151 * String s = u"hi mom";
1152 * u16string su = s.AsUTF16 ();
1153 * \endcode
1154 *
1155 * \note - the resulting string may have a different length than this->size() due to surrogates
1156 *
1157 * \note wstring is also accepted for T when sizeof (wchar_t) == sizeof (char16_t) (see the requires clause)
1158 */
1159 template <typename T = u16string>
1160 nonvirtual T AsUTF16 () const
1161 requires (same_as<T, u16string> or (sizeof (wchar_t) == sizeof (char16_t) and same_as<T, wstring>));
1162
1163 public:
1164 /**
1165 * Convert String losslessly into a standard C++ type u32string.
1166 *
1167 * \par Example Usage:
1168 * \code
1169 * String s = u"hi mom";
1170 * u32string su = s.AsUTF32 ();
1171 * \endcode
1172 *
1173 * \note - As of Stroika 2.1d23 - the resulting string may have a different length than this->size() due to surrogates,
1174 * but eventually the intent is to fix Stroika's string class so this is not true, and it returns the length of the string
1175 * in size () with surrogates removed (in other words uses ucs32 representation). But not there yet.
1176 */
1177 template <typename T = u32string>
1178 nonvirtual T AsUTF32 () const
1179 requires (same_as<T, u32string> or (sizeof (wchar_t) == sizeof (char32_t) and same_as<T, wstring>));
1180
1181 public:
1182 /**
1183 * See docs on SDKChar for meaning (character set).
1184 *
1185 * Note - many UNICODE Strings cannot be represented in the SDKString character set (especially if narrow - depends a lot).
1186 * But in that case, AsSDKString () will throw, unless AllowMissingCharacterErrorsFlag is specified.
1187 */
1188 nonvirtual SDKString AsSDKString () const;
1189 nonvirtual SDKString AsSDKString (AllowMissingCharacterErrorsFlag) const;
1190
1191 public:
1192 /**
1193 * See docs on SDKChar for meaning (character set). If SDKChar is a wide character, there is probably still a
1194 * default 'code page' to interpret narrow characters (Windows CP_ACP). This is a string in that character set.
1195 *
1196 * Note - many UNICODE Strings cannot be represented in the SDKString character set (especially if narrow - depends a lot).
1197 * But in that case, AsNarrowSDKString () will throw, unless AllowMissingCharacterErrorsFlag is specified.
1198 */
1199 nonvirtual string AsNarrowSDKString () const;
1200 nonvirtual string AsNarrowSDKString (AllowMissingCharacterErrorsFlag) const;
1201
1202 public:
1203 /**
1204 * Convert String losslessly into a standard C++ type.
1205 * Only specifically specialized variants are supported (see Supported Types below).
1206 * The source string MUST consist only of valid ASCII characters - otherwise this throws RuntimeErrorException<>
1207 *
1208 * \par Example Usage:
1209 * \code
1210 * string a1 = String{"hi mom"}.AsASCII (); // OK
1211 * string a2 = String{u"שלום"}.AsASCII (); // throws
1212 * \endcode
1213 *
1214 * \note - this is a (compatible) change of behavior: before Stroika v2.1d23, this would assert out on invalid ASCII.
1215 *
1216 * Supported Types:
1217 * o Memory::StackBuffer<char>
1218 * o string
1219 * o u8string (note any ASCII string is also legit utf-8)
1220 */
1221 template <typename T = string>
1222 nonvirtual T AsASCII () const
1223 requires requires (T* into) {
1224 { into->empty () } -> same_as<bool>;
1225 { into->push_back (ASCII{0}) };
1226 };
1227
1228 public:
1229 /**
1230 * Convert String losslessly into a standard C++ type.
1231 * Only specifically specialized variants are supported (see Supported Types below).
1232 * If this source contains any invalid ASCII characters, this returns nullopt, and else a valid engaged string.
1233 *
1234 * Supported Types(T):
1235 * o Memory::StackBuffer<char>
1236 * o string
1237 * o u8string (note any valid ASCII string is also valid utf-8)
1238 */
1239 template <typename T = string>
1240 nonvirtual optional<T> AsASCIIQuietly () const
1241 requires requires (T* into) {
1242 { into->empty () } -> same_as<bool>;
1243 { into->push_back (ASCII{0}) };
1244 };
1245
1246 public:
1247 /**
1248 * \brief Summary data for raw contents of rep - each rep will support at least one of these span forms
1249 *
1250 * Each rep will support a span of at least one code-point type (ascii, utf8, utf16, or utf32)
1251 *
1252 * This API is guaranteed to support a span of at least one of these types (maybe more). The caller may
1253 * specify the code-point type preferred.
1254 *
1255 * \note eAscii is a subset of eSingleByteLatin1, so when the type eAscii is returned, EITHER fSingleByteLatin1 or fAscii
1256 * may be used.
1257 *
1258 * This API is public, but best to avoid depending on internals of String API - like PeekSpanData - since
1259 * this reasonably likely to change in future versions.
1260 */
1263 /**
1264 * ASCII is useful to track in storage (though same size as eSingleByteLatin1) - because requests
1265 * to convert to UTF-8 are free - ASCII is legit UTF8 (not true for eSingleByteLatin1)
1266 */
1268 /**
1269 * Latin1 - 8 bit representation of characters. But 256 of them - more than plain ascii.
1270 * And cheap/easy to convert to UNICODE (since code points of wider characters exactly the same values).
1271 */
1273 eChar16,
1274 eChar32
1275 };
1276 StorageCodePointType fInCP;
1277 union {
1278 span<const ASCII> fAscii;
1279 span<const Latin1> fSingleByteLatin1;
1280 span<const char16_t> fChar16;
1281 span<const char32_t> fChar32;
1282 };
1283 };
1284
1285 public:
1286 /**
1287 * \brief return the constant character data inside the string in the form of a case variant union of different span types (at least one will be there)
1288 * templated type arg just used to pick a preferred type.
1289 *
1290 * \note CHAR_TYPE == char implies eAscii
1291 *
1292 * \note Reason for the two step API - getting the PeekSpanData, and then using - is because getting
1293 * the data is most expensive part (virtual function), and the packaged PeekSpanData gives enuf
1294 * info to do the next steps (quickly inline usually)
1295 *
1296 * This API is public, but best to avoid depending on internals of String API - like PeekSpanData - since
1297 * this reasonably likely to change in future versions.
1298 */
1299 template <IUNICODECanUnambiguouslyConvertFrom CHAR_TYPE = ASCII>
1300 nonvirtual PeekSpanData GetPeekSpanData () const;
1301
1302 public:
1303 /**
1304 * \brief return the constant character data inside the string in the form of a span or nullopt if not available for that CHAR_TYPE
1305 *
1306 * This API is public, but best to avoid depending on internals of String API - like PeekSpanData - since
1307 * this reasonably likely to change in future versions.
1308 *
1309 * \note It is generally true that the data IsASCII (span) IFF PeekData<ASCII> returns non-nullopt. But this is
1310 * not ACTUALLY always true. Generally, Stroika constructs strings like this. But callers may manually construct
1311 * a String with backend rep u32string, for example (e.g. because of move construct) - and that might just happen
1312 * to be all ascii. You can count on that IF you get back a value from PeekData<ASCII> - it must be all ASCII. But
1313 * the converse is not always true.
1314 */
1315 template <IUNICODECanUnambiguouslyConvertFrom CHAR_TYPE>
1316 static optional<span<const CHAR_TYPE>> PeekData (const PeekSpanData& pds);
1317 template <IUNICODECanUnambiguouslyConvertFrom CHAR_TYPE>
1318 nonvirtual optional<span<const CHAR_TYPE>> PeekData () const;
1319
1320 public:
1321 /**
1322 * \brief return the constant character data inside the string (rep) in the form of a span, possibly quickly and directly, and possibly copied into possiblyUsedBuffer
1323 *
1324 * This API will typically return a span of data which is internal pointers into the data of the rep (and so its invalidated on the
1325 * next change to the string).
1326 *
1327 * BUT - it may be a span of data stored into the argument possiblyUsedBuffer (which is why it must be provided - cannot be nullptr).
1328 * If you want the freedom to not pass in this buffer, see the PeekData API.
1329 *
1330 * \note - CHAR_T must satisfy the concept IUNICODECanAlwaysConvertTo - SAFELY - because the string MIGHT contain characters not in any
1331 * unsafe char class (like ASCII or Latin1), and so there might not be a way to do the conversion. Use
1332 * PeekData () to do that - where it can return nullopt if no conversion possible.
1333 *
1334 * \par Example Usage
1335 * \code
1336 * Memory::StackBuffer<char8_t> maybeIgnoreBuf1;
1337 * span<const char8_t> thisData = GetData (&maybeIgnoreBuf1);
1338 * \endcode
1339 *
1340 * \note Prior to Stroika v3.0d1, GetData() took no arguments, and returned pair<const CHAR_TYPE*, const CHAR_TYPE*>
1341 * which is pretty similar, but not quite the same. To adapt any existing code calling that older version of the API
1342 * just add a Memory::StackBuffer<T> b; and pass &b to GetData(); And the return span is not the same as pair<> but
1343 * easily convertible.
1344 */
1345 template <IUNICODECanAlwaysConvertTo CHAR_TYPE, size_t STACK_BUFFER_SZ>
1346 static span<const CHAR_TYPE> GetData (const PeekSpanData& pds, Memory::StackBuffer<CHAR_TYPE, STACK_BUFFER_SZ>* possiblyUsedBuffer);
1347 template <IUNICODECanAlwaysConvertTo CHAR_TYPE, size_t STACK_BUFFER_SZ>
1348 nonvirtual span<const CHAR_TYPE> GetData (Memory::StackBuffer<CHAR_TYPE, STACK_BUFFER_SZ>* possiblyUsedBuffer) const;
1349
1350 public:
1351 struct EqualsComparer;
1352
1353 public:
1354 struct LessComparer;
1355
1356 public:
1357 struct ThreeWayComparer;
1358
1359 public:
1360 /**
1361 * Return true if case sensitive compare of the two IConvertibleToString objects have the same characters.
1362 * Indirects to EqualsComparer{eWithCase} (...)
1363 *
1364 * \note For reasons I don't fully understand, both the plain (non-template) declaration of operator== and the 'requires (not ...)' clause on the
1365 * template overload appear to be required on all major compilers, due to quirks of the operator== rewrite rules --LGP 2024-11-18.
1366 */
1367 nonvirtual bool operator== (const String& rhs) const;
1368 template <IConvertibleToString T>
1369 nonvirtual bool operator== (T&& rhs) const
1370 requires (not same_as<remove_cvref_t<T>, String>);
1371
1372 public:
1373 /**
1374 * Return strong_ordering of case sensitive (three-way) compare of the two IConvertibleToString objects.
1375 * Indirects to ThreeWayComparer{eWithCase} (...)
1376 *
1377 * \see operator== ()
1378 */
1379 nonvirtual strong_ordering operator<=> (const String& rhs) const;
1380 template <IConvertibleToString T>
1381 nonvirtual strong_ordering operator<=> (T&& rhs) const
1382 requires (not same_as<remove_cvref_t<T>, String>);
1383
1384 public:
1385 /**
1386 * @aliases basic_string<char>::npos
1387 *
1388 * This is only used for 'STL-compatibility' APIs, like substr (), find (), rfind (), etc.
1389 */
1390 static constexpr size_t npos = static_cast<size_t> (-1);
1391
1392 public:
1393 /**
1394 * @aliases size
1395 */
1396 nonvirtual size_t length () const noexcept;
1397
1398 public:
1399 /**
1400 * \note BREAKING change between Stroika 2.1 and v3 - const c_str/0 no longer guaranteed to return non-null
1401 *
1402 * Mitigating this, the non-const c_str() still will return non-null, and the const overload taking
1403 * StackBuffer<wchar_t> will also guarantee returning non-null.
1404 *
1405 * In the case of the overloads taking no arguments, the lifetime of the returned pointer is until the
1406 * next change to this string. In the case of the StackBuffer overload, the guarantee extends for the lifetime
1407 * of the argument buffer (typically just the next few lines of code).
1408 *
1409 * This will always return a value which is NUL-terminated.
1410 *
1411 * Note also - the c_str () function CAN now be somewhat EXPENSIVE, causing a mutation of the String object, so use
1412 * one of the const overloads where possible (or where performance matters).
1413 *
1414 * \note Why does c_str (StackBuffer) return a tuple?
1415 * Sometimes you just want a plain const wchar_t* you can use with an old C pointer based API. But that
1416 * fails/asserts out if you happen to have an empty string and try to get the pointer. Sometimes - you just need
1417 * the pointer!
1418 *
1419 * And why the string-view part? Because sometimes you want the LENGTH. Sure - you can just compute it again. But
1420 * that is costly. Sure you can just use the original string length. BUT THAT WOULD BE A BUG once I support
1421 * surrogates properly (at least on windows where wchar_t isn't char32_t).
1422 */
1423 nonvirtual tuple<const wchar_t*, wstring_view> c_str (Memory::StackBuffer<wchar_t>* possibleBackingStore) const;
1424
1425 public:
1426 /**
1427 * Follow the basic_string<>::find () API
1428 *
1429 * need more overloads.
1430 *
1431 * Returns String::npos if not found, else the zero based index.
1432 */
1433 nonvirtual size_t find (Character c, size_t startAt = 0) const;
1434 nonvirtual size_t find (const String& s, size_t startAt = 0) const;
1435
1436 public:
1437 /**
1438 * Follow the basic_string<>::rfind () API
1439 *
1440 * need more overloads.
1441 *
1442 * Returns String::npos if not found, else the zero based index.
1443 */
1444 nonvirtual size_t rfind (Character c) const;
1445
1446 public:
1447 /**
1448 * mimic https://en.cppreference.com/w/cpp/string/basic_string/front
1449 *
1450 * \pre not empty ()
1451 */
1452 nonvirtual Character front () const;
1453
1454 public:
1455 /**
1456 * mimic https://en.cppreference.com/w/cpp/string/basic_string/back
1457 *
1458 * \pre not empty ()
1459 */
1460 nonvirtual Character back () const;
1461
1462 public:
1463 /**
1464 * Compatible with STL::basic_string::substr() - which interprets second argument as count. Not the same
1465 * as Stroika::String::SubString (where the second argument is a 'to')
1466 *
1467 * @see SubString
1468 *
1469 * From http://en.cppreference.com/w/cpp/string/basic_string/substr
1470 * Returns a substring [pos, pos+count). If the requested substring extends
1471 * past the end of the string, or if count == npos, the returned substring is [pos, size()).
1472 * std::out_of_range if pos > size()
1473 */
1474 nonvirtual String substr (size_t from, size_t count = npos) const;
1475
1476 public:
1477 ///////////////// DEPRECATED FUNCTIONS /////////////////
1478 [[deprecated ("Since Stroika v3.0d13 - if you must use c_str() - use the overload taking StackBuffer arg), or use As<wstring> "
1479 "().c_str ()")]] const wchar_t*
1480 c_str ();
1481 [[deprecated ("Since Stroika v3.0d12 use StringBuilder::SetAt")]] void SetCharAt (Character c, size_t i);
1482 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] void erase (size_t from = 0);
1483 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] void erase (size_t from, size_t count);
1484 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] void push_back (wchar_t c);
1485 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] void push_back (Character c);
1486 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] void Append (Character c);
1487 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] void Append (const String& s);
1488 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] void Append (const wchar_t* s);
1489 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] void Append (const wchar_t* from, const wchar_t* to);
1490 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] void Append (const Character* from, const Character* to);
1491 template <typename CHAR_T>
1492 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] void Append (span<const CHAR_T> s)
1493 requires (same_as<CHAR_T, Character> or same_as<CHAR_T, char32_t>);
1494 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] String& operator+= (Character appendage);
1495 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] String& operator+= (const String& appendage);
1496 [[deprecated ("Since Stroika v3.0d12 use StringBuilder")]] String& operator+= (const wchar_t* appendageCStr);
1497 [[deprecated ("Since Stroika v3.0d12 - just use a b String{}")]] void clear ()
1498 {
1499 *this = String{};
1500 }
1501 template <typename T>
1502 [[deprecated ("Since Stroika v3.0d2, just use 0 arg version)")]] void As (T* into) const
1503 requires (IBasicUNICODEStdString<T> or same_as<T, String>)
1504 {
1505 *into = this->As<T> ();
1506 }
1507 [[deprecated ("Since Stroika v3.0d2, just use 1 arg version)")]] void AsNarrowString (const locale& l, string* into) const
1508 {
1509 *into = this->AsNarrowString (l);
1510 }
1511 template <typename T = u8string>
1512 [[deprecated ("Since Stroika v3.0d2 - use AsUTF8/0")]] void AsUTF8 (T* into) const
1513 requires (same_as<T, string> or same_as<T, u8string>)
1514 {
1515 *into = this->AsUTF8 ();
1516 }
1517 template <typename T = u16string>
1518 [[deprecated ("Since Stroika v3.0d2 - use AsUTF16/0")]] void AsUTF16 (T* into) const
1519 requires (same_as<T, u16string> or (sizeof (wchar_t) == sizeof (char16_t) and same_as<T, wstring>))
1520 {
1521 *into = AsUTF16 ();
1522 }
1523 template <typename T = u32string>
1524 [[deprecated ("Since Stroika v3.0d2 - use AsUTF32/0")]] void AsUTF32 (T* into) const
1525 requires (same_as<T, u32string> or (sizeof (wchar_t) == sizeof (char32_t) and same_as<T, wstring>))
1526 {
1527 *into = AsUTF32 ();
1528 }
1529 [[deprecated ("Since Stroika v3.0d2 - just use /0")]] void AsSDKString (SDKString* into) const
1530 {
1531 *into = AsSDKString ();
1532 }
1533 [[deprecated ("Since Stroika v3.0d2 - just use /0")]] void AsNarrowSDKString (string* into) const
1534 {
1535 *into = SDK2Narrow (AsSDKString ());
1536 }
1537 template <typename T = string>
1538 [[deprecated ("Since v3.0d2 use /0")]] void AsASCII (T* into) const
1539 requires (same_as<T, string> or same_as<T, Memory::StackBuffer<char>>)
1540 {
1541 if (not AsASCIIQuietly (into)) {
1542 ThrowInvalidAsciiException_ ();
1543 }
1544 }
1545 template <typename T = string>
1546 [[deprecated ("Since v3.0d2 use /0 overload")]] bool AsASCIIQuietly (T* into) const
1547 requires (same_as<T, string> or same_as<T, Memory::StackBuffer<char>>)
1548 {
1549 auto r = this->AsASCIIQuietly ();
1550 if (r) {
1551 *into = *r;
1552 return true;
1553 }
1554 else {
1555 return false;
1556 }
1557 }
1558 [[deprecated ("Since Stroika v3.0d5 use StringShorteningPreference argument")]] String LimitLength (size_t maxLen, bool keepLeft) const
1559 {
1560 return LimitLength (maxLen, keepLeft ? StringShorteningPreference::ePreferKeepLeft : StringShorteningPreference::ePreferKeepRight);
1561 }
1562 [[deprecated ("Since Stroika v3.0d5 use StringShorteningPreference argument")]] String LimitLength (size_t maxLen, bool keepLeft,
1563 const String& ellipsis) const
1564 {
1565 return LimitLength (maxLen, keepLeft ? StringShorteningPreference::ePreferKeepLeft : StringShorteningPreference::ePreferKeepRight, ellipsis);
1566 }
1567 template <typename CHAR_T>
1568 [[deprecated ("Since Stroika v3.0d1, String{}")]] static String FromASCII (span<const CHAR_T> s)
1569 {
1570 return String{s};
1571 }
1572 template <typename CHAR_T>
1573 [[deprecated ("Since Stroika v3.0d1, String{}")]] static String FromASCII (const CHAR_T* cString)
1574 {
1575 return String{cString};
1576 }
1577 template <IStdBasicStringCompatibleCharacter CHAR_T>
1578 [[deprecated ("Since Stroika v3.0d1, String{}")]] static String FromASCII (const basic_string<CHAR_T>& str)
1579 {
1580 return String{str};
1581 }
1582 [[deprecated ("Since Stroika v3.0d1, use span{} overload for this")]] static String FromASCII (const char* from, const char* to)
1583 {
1584 return String{span{from, to}};
1585 }
1586 [[deprecated ("Since Stroika v3.0d1, use span{} overload for this")]] static String FromASCII (const wchar_t* from, const wchar_t* to)
1587 {
1588 return String{span{from, to}};
1589 }
1590 [[deprecated ("Since Stroika v3.0d1, use span overloads")]] String InsertAt (const wchar_t* from, const wchar_t* to, size_t at) const
1591 {
1592 Memory::StackBuffer<Character> buf{Memory::eUninitialized, UTFConvert::ComputeTargetBufferSize<Character> (span{from, to})};
1593 return InsertAt (UTFConvert::kThe.ConvertSpan (span{from, to}, span{buf}), at);
1594 }
1595 [[deprecated ("Since Stroika v3.0d1, use span overloads")]] String InsertAt (const Character* from, const Character* to, size_t at) const
1596 {
1597 return InsertAt (span{from, to}, at);
1598 }
1599 [[deprecated ("Since Stroika v3.0d1, use span{} overload for this")]] static String FromLatin1 (const char* start, const char* end)
1600 {
1601 return FromLatin1 (span{start, end});
1602 }
1603 [[deprecated ("Since Stroika v3.0d1, use span{} constructor for this")]] static String FromNarrowString (const char* from,
1604 const char* to, const locale& l)
1605 {
1606 return FromNarrowString (span{from, to}, l);
1607 }
1608 [[deprecated ("Since Stroika v3.0d1, use span{} constructor for this")]] static String FromNarrowSDKString (const char* from, const char* to)
1609 {
1610 return FromNarrowSDKString (span{from, to});
1611 }
1612 template <IUNICODECanAlwaysConvertTo CHAR_T>
1613 [[deprecated ("Since Stroika v3.0d1, use span{} constructor for this")]] String (const CHAR_T* from, const CHAR_T* to)
1614 : String{span<const CHAR_T>{from, to}}
1615 {
1616 }
1617 [[deprecated (
1618 "Since Stroika v3.0d1 - use As<wstring> ().c_str () or other c_str() overload (*UNSAFE TO USE*)")]] nonvirtual const wchar_t*
1619 c_str () const noexcept;
1620 [[deprecated ("Since Stroika v3.0 - use span{} overloads")]] inline static String FromSDKString (const SDKChar* from, const SDKChar* to)
1621 {
1622 return FromSDKString (span{from, to});
1623 }
1624 [[deprecated ("Since Stroika v3.0 - use span{} overloads")]] static String FromUTF8 (const char* from, const char* to)
1625 {
1626 return FromUTF8 (span{from, to});
1627 }
1628 [[deprecated ("Since Stroika v3.0 - use span{} overloads")]] static String FromUTF8 (const char8_t* from, const char8_t* to)
1629 {
1630 return FromUTF8 (span{from, to});
1631 }
1632 template <typename T = string>
1633 [[deprecated ("Since Stroika v3.0d1 - use Character::AsAsciiQuietly")]] static bool AsASCIIQuietly (const wchar_t* fromStart,
1634 const wchar_t* fromEnd, T* into)
1635 {
1636 return Character::AsASCIIQuietly (span<const wchar_t>{fromStart, fromEnd}, into);
1637 }
1638 [[deprecated (
1639 "Since Stroika v3.0d1 due to http://stroika-bugs.sophists.com/browse/STK-965 - NOT IMPLEMENTED")]] nonvirtual const wchar_t*
1640 data () const;
1641 [[deprecated ("Since Stroika v3.0d8 - use RemoveFirstIf")]] String Remove (Character c) const
1642 {
1643 return RemoveFirstIf (c);
1644 }
1645 [[deprecated ("Since Stroika v3.0d8 - use RemoveFirstIf")]] String Remove (const String& subString) const
1646 {
1647 return RemoveFirstIf (subString);
1648 }
1649
1650 private:
1651 static shared_ptr<_IRep> mkEmpty_ ();
1652
1653 private:
1654 /**
1655 * If the argument CHAR_T is restrictive (such as ASCII/char) - this CHECKS and THROWS (Character::CheckASCII).
1656 * This function also reads the data, and sees if it can downshift 'CHAR_T' to something more restrictive, and produces
1657 * a possibly smaller rep.
1658 *
1659 * For some overloads (e.g. the && move overload) - the data is 'stolen/moved'.
1660 *
1661 * See mk_nocheck_ for a simpler - DO WHAT I SAID - operation.
1662 */
1663 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
1664 static shared_ptr<_IRep> mk_ (span<const CHAR_T> s);
1665 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
1666 static shared_ptr<_IRep> mk_ (Iterable<CHAR_T> it);
1667 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
1668 static shared_ptr<_IRep> mk_ (span<CHAR_T> s);
1669 template <IStdBasicStringCompatibleCharacter CHAR_T>
1670 static shared_ptr<_IRep> mk_ (basic_string<CHAR_T>&& s);
1671
1672 private:
1673 /*
1674 * Note the mk_nocheck_ - just does the mk of the buffer, but assuming the arguments are legit and will fit (though it may
1675 * assert in DEBUG builds this is true).
1676 *
1677 * This just blindly allocates the buffer of the given size/type for the given arguments.
1678 */
1679 template <typename CHAR_T>
1680 static shared_ptr<_IRep> mk_nocheck_ (span<const CHAR_T> s)
1681 requires (same_as<CHAR_T, ASCII> or same_as<CHAR_T, Latin1> or same_as<CHAR_T, char16_t> or same_as<CHAR_T, char32_t>);
1682
1683 private:
1684 template <unsigned_integral T>
1685 nonvirtual size_t SubString_adjust_ (T fromOrTo, size_t myLength) const;
1686 template <signed_integral T>
1687 nonvirtual size_t SubString_adjust_ (T fromOrTo, size_t myLength) const;
1688
1689 private:
1690 nonvirtual String SubString_ (const _SafeReadRepAccessor& thisAccessor, size_t from, size_t to) const;
1691
1692 protected:
1693 nonvirtual void _AssertRepValidType () const;
1694
1695 private:
1696 [[noreturn]] static void ThrowInvalidAsciiException_ (); // avoid include
1697 };
1698 static_assert (totally_ordered<String>);
1699
1700#if qStroika_Foundation_Characters_AsPathAutoMapMSYSAndCygwin
1701 template <>
1702 std::filesystem::path String::As<std::filesystem::path> () const;
1703#endif
1704
1705 /**
1706 * operator<< ostream adapters work as you would expect and allow writing Stroika strings easily to ostreams such as cout.
1707 *
1708 * The only catch - is that Stroika strings are UNICODE based, and so may not fit perfectly with 'char' based basic_ostream<>.
1709 * To address this, Stroika strings are mapped to 'narrow sdk strings' - ignoring any errors. As this is generally not a very
1710 * good practice to do (lossy) - and generally just done for debugging/diagnostic output, this was deemed acceptable (as of Stroika v3.0d6).
1711 */
1712 wostream& operator<< (wostream& out, const String& s);
1713 ostream& operator<< (ostream& out, const String& s);
1714
1715#if qStroika_HasComponent_googletest
1716 // For googletest compatibility
1717 void PrintTo (const String& s, std::ostream* os);
1718#endif
1719
1720 /**
1721 * Protected helper Rep class.
1722 *
1723 * \note Important design note - String reps are IMMUTABLE. Changes to string like +=, create new string reps (so costly).
1724 * Use StringBuilder for that purpose in performance sensitive code.
1725 */
1726 class String::_IRep : public Iterable<Character>::_IRep {
1727 public:
1728 /**
1729 * Return the ith character in the string.
1730 */
1731 virtual Character GetAt (size_t index) const noexcept = 0;
1732
1733 public:
1734 /**
1735 * Each rep will support a span of at least one code-point type (ascii, utf8, utf16, or utf32)
1736 *
1737 * This API is guaranteed to support a span of at least one of these types (maybe more). The caller may
1738 * specify the code-point type preferred.
1739 */
1740 virtual PeekSpanData PeekData ([[maybe_unused]] optional<PeekSpanData::StorageCodePointType> preferred) const noexcept = 0;
1741
1742 public:
1743 /*
1744 * Return a pointer to mostly standard (wide, nul-terminated) C string,
1745 * whose lifetime extends to the next non-const call on this rep, or nullptr.
1746 *
1747 * It is only 'mostly' standard because it is allowed to have nul-chars embedded in it. But it will
1748 * always have str[len] == 0;
1749 *
1750 * \note Since Stroika v3.0d1, this can return nullptr (in which case the String library will allocate a new backend)
1751 *
1752 * \post returnResult == nullptr or returnResult[len] == '\0';
1753 */
1754 virtual const wchar_t* c_str_peek () const noexcept = 0;
1755
1756 private:
1757 friend class String;
1758 };
1759
1760 // Some docs/testing...
1761 static_assert (not IConvertibleToString<int>);
1762 static_assert (not IConvertibleToString<char>); // would have been sensible to allow, but easily generates confusing results: cuz that means String x = 3 would work (confusing with overloads)
1763 static_assert (IConvertibleToString<string>);
1764 static_assert (IConvertibleToString<wstring>);
1765 static_assert (IConvertibleToString<u8string>);
1766 static_assert (IConvertibleToString<u16string>);
1767 static_assert (IConvertibleToString<u32string>);
1768 static_assert (not IConvertibleToString<optional<String>>);
1769
1770 namespace Private_ {
1771 // This is just anything that can be treated as a 'span<const Character>'
1772 // clang-format off
1773 template <typename T>
1774 concept ICanBeTreatedAsSpanOfCharacter_ =
1775 derived_from<remove_cvref_t<T>, String>
1776 or same_as<remove_cvref_t<T>, u8string>
1777 or same_as<remove_cvref_t<T>, u8string_view>
1778 or same_as<remove_cvref_t<T>, u16string>
1779 or same_as<remove_cvref_t<T>, u16string_view>
1780 or same_as<remove_cvref_t<T>, u32string>
1781 or same_as<remove_cvref_t<T>, u32string_view>
1782 or same_as<remove_cvref_t<T>, wstring>
1783 or same_as<remove_cvref_t<T>, wstring_view>
1784 or same_as<remove_cvref_t<T>, const Character*>
1785 or same_as<remove_cvref_t<T>, const char8_t*>
1786 or same_as<remove_cvref_t<T>, const char16_t*>
1787 or same_as<remove_cvref_t<T>, const char32_t*>
1788 or same_as<remove_cvref_t<T>, const wchar_t*>
1789 ;
1790 // clang-format on
1791
1792 template <ICanBeTreatedAsSpanOfCharacter_ USTRING, size_t STACK_BUFFER_SZ>
1793 span<const Character> AsSpanOfCharacters_ (USTRING&& s, Memory::StackBuffer<Character, STACK_BUFFER_SZ>* mostlyIgnoredBuf);
1794 }
1795
1796 /**
1797 *
1798 * \par Example Usage
1799 * \code
1800 * constexpr String::EqualsComparer kStringCIComparer_ {Characters::CompareOptions::eCaseInsensitive};
1801 * if (kStringCIComparer_ (filename.extension (), ".HFCC"sv)) {
1802 * compiledName = filename;
1803 * }
1804 * \endcode
1805 *
1806 * \note There is no String::Equals() method, because it would look queer if it took one string argument, and if it was static
1807 * it would essentially look like the above comparer, so little point.
1808 */
1809 struct String::EqualsComparer : Common::ComparisonRelationDeclarationBase<Common::ComparisonRelationType::eEquals> {
1810 /**
1811 * optional CompareOptions to CTOR allows for case insensitive compares
1812 */
1813 constexpr EqualsComparer (CompareOptions co = eWithCase);
1814
1815 /**
1816 * Extra overloads are a slight performance improvement
1817 */
1818 template <IConvertibleToString LT, IConvertibleToString RT>
1819 nonvirtual bool operator() (LT&& lhs, RT&& rhs) const;
1820
1821 CompareOptions fCompareOptions;
1822
1823 private:
1824 template <Private_::ICanBeTreatedAsSpanOfCharacter_ LT, Private_::ICanBeTreatedAsSpanOfCharacter_ RT>
1825 bool Cmp_ (LT&& lhs, RT&& rhs) const;
1826 template <Private_::ICanBeTreatedAsSpanOfCharacter_ LT, Private_::ICanBeTreatedAsSpanOfCharacter_ RT>
1827 bool Cmp_Generic_ (LT&& lhs, RT&& rhs) const;
1828 };
1829
1830 /**
1831 */
1832 struct String::ThreeWayComparer : Common::ComparisonRelationDeclarationBase<Common::ComparisonRelationType::eThreeWayCompare> {
1833 /**
1834 * optional CompareOptions to CTOR allows for case insensitive compares
1835 */
1836 constexpr ThreeWayComparer (CompareOptions co = eWithCase);
1837
1838 /**
1839 * Extra overloads are a slight performance improvement
1840 */
1841 template <IConvertibleToString LT, IConvertibleToString RT>
1842 nonvirtual strong_ordering operator() (LT&& lhs, RT&& rhs) const;
1843
1844 CompareOptions fCompareOptions;
1845
1846 private:
1847 template <Private_::ICanBeTreatedAsSpanOfCharacter_ LT, Private_::ICanBeTreatedAsSpanOfCharacter_ RT>
1848 strong_ordering Cmp_ (LT&& lhs, RT&& rhs) const;
1849 template <Private_::ICanBeTreatedAsSpanOfCharacter_ LT, Private_::ICanBeTreatedAsSpanOfCharacter_ RT>
1850 strong_ordering Cmp_Generic_ (LT&& lhs, RT&& rhs) const;
1851 };
1852
1853 /**
1854 * \brief very similar to ThreeWayComparer but returns true if less
1855 */
1856 struct String::LessComparer : Common::ComparisonRelationDeclarationBase<Common::ComparisonRelationType::eStrictInOrder> {
1857 constexpr LessComparer (CompareOptions co = eWithCase);
1858
1859 template <typename T1, typename T2>
1860 nonvirtual bool operator() (T1 lhs, T2 rhs) const;
1861
1862 private:
1863 ThreeWayComparer fComparer_;
1864 };
1866
1867 inline namespace Literals {
1868 /**
1869 * \brief shorthand for String::FromStringConstant { ARGUMENT }
1870 *
1871 * \par Example:
1872 * \code
1873 * String s1 = "some-string"_k;
1874 * String s2 = String::FromStringConstant ("some-string");
1875 * String s3 = "some-string"sv; // in most cases this will also work fine, and is preferable (since sv is part of C++ standard)
1876 * \endcode
1877 *
1878 * \note _k is STILL sometimes useful and better than sv, since the TYPE returned by _k is a String_Constant which IS a String
1879 * so it will work in some overload contexts where sv would fail.
1880 *
1881 * \note operator"" _k with char*, requires that the argument string MUST BE ASCII (someday maybe lifted to allow Latin1)
1882 */
1883 String operator"" _k (const ASCII* s, size_t len);
1884 String operator"" _k (const wchar_t* s, size_t len);
1885 String operator"" _k (const char8_t* s, size_t len);
1886 String operator"" _k (const char16_t* s, size_t len);
1887 String operator"" _k (const char32_t* s, size_t len);
1888 }
1889
1890 /**
1891 * Basic operator overload with the obvious meaning, and simply indirect to @String::Concatenate (const String& rhs)
1892 *
1893 * \note Design Note
1894 * Don't use member function so "x" + String{u"x"} works.
1895 * Insist that EITHER LHS or RHS is a string (else operator applies too widely).
1896 *
1896 * Both arguments must be convertible to a String, and at least one must be String or derived from String
1898 */
1899 template <IConvertibleToString LHS_T, IConvertibleToString RHS_T>
1900 String operator+ (LHS_T&& lhs, RHS_T&& rhs)
1901 requires (derived_from<remove_cvref_t<LHS_T>, String> or derived_from<remove_cvref_t<RHS_T>, String>);
1902
1903 /**
1904 * \brief StringCombiner is a simple function object used to combine two strings visually - used in Iterable<>::Join ()
1905 *
1906 * This can combine strings in the obvious way (concatenation) - but defaults to separating them with a comma (', ').
1907 *
1908 * \note the functional api - is to be given two strings, and a flag saying if the combination is the last one in the list,
1909 * since in English, this is frequently rendered somewhat differently than the rest.
1910 */
1911 template <typename STRING = String>
1913 STRING fSeparator{", "sv};
1914 optional<STRING> fSpecialSeparatorForLastPair;
1915 STRING operator() (const STRING& lhs, const STRING& rhs, bool isLast) const;
1916 };
1917
1918 /**
1919 * kDefaultStringCombiner is just StringCombiner{}, rendered as a function object, so that it can be externed/imported
1920 * in the Iterable code without imposing a dependency on the String code.
1921 */
1922 extern const function<String (String, String, bool)> kDefaultStringCombiner;
1923
1924}
1925
1926namespace Stroika::Foundation::Traversal {
1927 // specialized as performance optimization
1928 template <>
1929 Characters::String Iterable<Characters::String>::Join (const Characters::String& separator, const optional<Characters::String>& finalSeparator) const;
1930}
1931
1932namespace std {
1933 template <>
1934 struct hash<Stroika::Foundation::Characters::String> {
1935 size_t operator() (const Stroika::Foundation::Characters::String& arg) const;
1936 };
1937}
1938
1939namespace Stroika::Foundation::Memory {
1940 class BLOB; // Forward declare to avoid mutual include issues
1941}
1942
1944 template <typename T>
1945 struct DefaultSerializer; // Forward declare to avoid mutual include issues
1946 template <>
1948 Memory::BLOB operator() (const Stroika::Foundation::Characters::String& arg) const;
1949 };
1950}
1951
1952/**
1953 * Allow std::format to work with String class
1954 *
1955 * \note SUPER PRIMITIVE ROUGH FIRST DRAFT
1956 */
template <>
struct qStroika_Foundation_Characters_FMT_PREFIX_::formatter<Stroika::Foundation::Characters::String, wchar_t> {
    // The wstring formatter that does the actual parsing and formatting work.
    qStroika_Foundation_Characters_FMT_PREFIX_::formatter<std::wstring, wchar_t> fDelegate2_;

    /**
     *  Format-spec parsing is handed straight to the wstring formatter, so String accepts
     *  whatever format specification that formatter accepts.
     */
    template <typename ParseContext>
    constexpr typename ParseContext::iterator parse (ParseContext& ctx)
    {
        return fDelegate2_.parse (ctx);
    }

    /**
     *  Converts the String to a std::wstring and lets the wstring formatter write it to ctx.
     */
    template <typename FmtContext>
    typename FmtContext::iterator format (Stroika::Foundation::Characters::String s, FmtContext& ctx) const
    {
        const std::wstring asWide = s.As<std::wstring> ();
        return fDelegate2_.format (asWide, ctx);
    }
};
/**
 *  Allow std::format to work with String when the format string is narrow (char) based.
 *
 *  The String is converted with AsNarrowSDKString (); with ignoreerrors set (the default) the
 *  conversion is asked to ignore errors (eIgnoreErrors), otherwise the plain overload is used.
 *
 *  \note Any format specification text is skipped, not interpreted.
 */
template <>
struct qStroika_Foundation_Characters_FMT_PREFIX_::formatter<Stroika::Foundation::Characters::String, char> {
    bool ignoreerrors{true}; // maybe set from thread-local variable, or parse() settings, or both

    /**
     *  Skip over the (ignored) format specification and return an iterator referring to the closing '}',
     *  or ctx.end () if there is none.
     *
     *  The character at ctx.begin () is examined BEFORE advancing, so an empty specification ("{}" or "{:}")
     *  - where ctx.begin () already refers to '}' - is handled, and the iterator is never dereferenced
     *  once it has reached ctx.end ().
     */
    template <typename ParseContext>
    constexpr typename ParseContext::iterator parse (ParseContext& ctx)
    {
        auto it = ctx.begin ();
        while (it != ctx.end () and *it != '}') {
            ++it;
        }
        return it;
    }

    /**
     *  Write the narrow-SDK-string rendering of s to the output iterator of ctx and return the advanced iterator.
     */
    template <typename FmtContext>
    typename FmtContext::iterator format (Stroika::Foundation::Characters::String s, FmtContext& ctx) const
    {
        using namespace Stroika::Foundation::Characters;
        // really want to delegate to wchar_t version (with vformat) but no documented easy way to extract format_args from ctx (though its in there)
        if (ignoreerrors) {
#if __cpp_lib_ranges >= 202207L
            return std::ranges::copy (s.AsNarrowSDKString (eIgnoreErrors), ctx.out ()).out;
#else
            return format_to (ctx.out (), "{}", s.AsNarrowSDKString (eIgnoreErrors));
#endif
        }
        else {
#if __cpp_lib_ranges >= 202207L
            return std::ranges::copy (s.AsNarrowSDKString (), ctx.out ()).out;
#else
            return format_to (ctx.out (), "{}", s.AsNarrowSDKString ());
#endif
        }
    }
};
2017
2018/*
2019 ********************************************************************************
2020 ***************************** Implementation Details ***************************
2021 ********************************************************************************
2022 */
2023#include "String.inl"
2024
2025#endif /*_Stroika_Foundation_Characters_String_h_*/
#define Stroika_Define_Enum_Bounds(FIRST_ITEM, LAST_ITEM)
RegularExpression is a compiled regular expression which can be used to match on a String class.
virtual Character GetAt(size_t index) const noexcept=0
virtual PeekSpanData PeekData(optional< PeekSpanData::StorageCodePointType > preferred) const noexcept=0
Similar to String, but intended to more efficiently construct a String. Mutable type (String is large...
String is like std::u32string, except it is much easier to use, often much more space efficient,...
Definition String.h:201
nonvirtual tuple< const wchar_t *, wstring_view > c_str(Memory::StackBuffer< wchar_t > *possibleBackingStore) const
Definition String.inl:1049
nonvirtual String Concatenate(T &&rhs) const
appends 'rhs' string to this string (without modifying this string) and returns the combined string
nonvirtual span< CHAR_T > CopyTo(span< CHAR_T > s) const
nonvirtual PeekSpanData GetPeekSpanData() const
return the constant character data inside the string in the form of a case variant union of different...
nonvirtual optional< T > AsASCIIQuietly() const
Set<T> is a container of T, where once an item is added, additionally adds () do nothing.
Definition Set.h:105
Logically halfway between std::array and std::vector; Smart 'direct memory array' - which when needed...
Iterable<T> is a base class for containers which easily produce an Iterator<T> to traverse them.
Definition Iterable.h:237
returns true iff T == u8string, u16string, u32string, or wstring - which std::string types can be una...
Definition String.h:116
anything with a 'special .STRINGTYPE conversion' method to UNICODE string, such as filesystem::path
Definition String.h:124
IUNICODECanUnambiguouslyConvertFrom is any 'character representation type' where array of them unambi...
Definition Character.h:179
char ASCII
Stroika's string/character classes treat 'char' as being an ASCII character.
Definition Character.h:59
conditional_t< qTargetPlatformSDKUseswchar_t, wchar_t, char > SDKChar
Definition SDKChar.h:71
basic_string< SDKChar > SDKString
Definition SDKString.h:38
String operator+(LHS_T &&lhs, RHS_T &&rhs)
Definition String.inl:1288
const function< String(String, String, bool)> kDefaultStringCombiner
Definition String.inl:1313
wostream & operator<<(wostream &out, const String &s)
Definition String.cpp:2035
STL namespace.
very similar to ThreeWayComparer but returns true if less
Definition String.h:1856
Summary data for raw contents of rep - each rep will support at least one of these span forms.
Definition String.h:1261
StringCombiner is a simple function object used to combine two strings visually - used in Iterable<>:...
Definition String.h:1912
function object which serializes type T to a BLOB (or BLOB like) object