Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
String.cpp
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include "Stroika/Foundation/StroikaPreComp.h"
5
6#include <algorithm>
7#include <climits>
8#include <cstdarg>
9#include <istream>
10#include <regex>
11#include <string>
12
15#include "Stroika/Foundation/Characters/SDKString.h"
19#include "Stroika/Foundation/Containers/Set.h"
20#include "Stroika/Foundation/Containers/Support/ReserveTweaks.h"
23#include "Stroika/Foundation/Execution/Exceptions.h"
24#include "Stroika/Foundation/Execution/Throw.h"
25#include "Stroika/Foundation/Math/Common.h"
27#include "Stroika/Foundation/Memory/Common.h"
29
30#include "String.h"
31
32using namespace Stroika::Foundation;
35using namespace Stroika::Foundation::Common;
36
39
40// see Satisfies Concepts:
41static_assert (regular<String>);
42
43#if qStroika_Foundation_Characters_AsPathAutoMapMSYSAndCygwin
44#include <filesystem>
45#endif
46
47namespace {
48
49 /**
50 * Helper for sharing implementation code on string reps
51 * This REP is templated on CHAR_T. The key is that ALL characters for that string fit inside
52 * CHAR_T, so that the implementation can store them as an array, and index.
53 * So mixed 1,2,3 byte characters all get stored in a char32_t array, and a string with all ascii
54 * characters get stored in a char (1byte stride) array.
55 *
56 * \note - the KEY design choice in StringRepHelperAllFitInSize_::Rep<CHAR_T> is that it contains no
57 * multi-code-point characters. This is what allows the simple calculation of array index
58 * to character offset. So use
59 * StringRepHelperAllFitInSize_::Rep<ASCII> for ascii text
60 * StringRepHelperAllFitInSize_::Rep<LATIN1> for ISOLatin1 text
61 * StringRepHelperAllFitInSize_::Rep<char16_t> for ISOLatin1/anything which is a 2-byte unicode char (not surrogates)
62 * StringRepHelperAllFitInSize_::Rep<char32_t> for anything else - this always works
63 */
64 struct StringRepHelperAllFitInSize_ : String {
65 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
66 struct Rep : public _IRep {
67 private:
68 using inherited = _IRep;
69
70 protected:
71 span<const CHAR_T> _fData;
72
73#if qStroika_Foundation_Debug_AssertionsChecked
74 private:
75 mutable unsigned int fOutstandingIterators_{};
76#endif
77
78 protected:
79 Rep () = default;
80 Rep (span<const CHAR_T> s)
81 requires (not same_as<CHAR_T, char8_t>) // char8 ironically involves 2-byte characters, cuz only ascii encoded as 1 byte
82 : _fData{s}
83 {
84 if constexpr (same_as<CHAR_T, char> or same_as<CHAR_T, char8_t>) {
85 Require (Character::IsASCII (s));
86 }
87 // Any 8-bit sequence valid for Latin1
88 if constexpr (same_as<CHAR_T, char16_t>) {
90 }
91 }
92 Rep& operator= (span<const CHAR_T> s)
93 {
94#if qStroika_Foundation_Debug_AssertionsChecked
95 Require (fOutstandingIterators_ == 0);
96#endif
97 if constexpr (same_as<CHAR_T, char> or same_as<CHAR_T, char8_t>) {
98 Require (Character::IsASCII (s));
99 }
100 if constexpr (same_as<CHAR_T, char16_t>) {
102 }
103 _fData = s;
104 return *this;
105 }
106
107 public:
108 // String::_IRep OVERRIDES
109 virtual Character GetAt (size_t index) const noexcept override
110 {
111 Require (index < _fData.size ());
112 // NOTE - this is safe because we never construct this type with surrogates
113 return Character{static_cast<char32_t> (_fData[index])};
114 }
115 virtual PeekSpanData PeekData (optional<PeekSpanData::StorageCodePointType> /*preferred*/) const noexcept override
116 {
117 // IGNORE preferred, cuz we return what is in our REP - since returning a direct pointer to that data - no conversion possible
118 if constexpr (same_as<CHAR_T, ASCII>) {
119 return PeekSpanData{PeekSpanData::StorageCodePointType::eAscii, {.fAscii = _fData}};
120 }
121 if constexpr (same_as<CHAR_T, Latin1>) {
122 return PeekSpanData{PeekSpanData::StorageCodePointType::eSingleByteLatin1, {.fSingleByteLatin1 = _fData}};
123 }
124 else if constexpr (sizeof (CHAR_T) == 2) {
125 // reinterpret_cast needed cuz of wchar_t case
126 return PeekSpanData{PeekSpanData::StorageCodePointType::eChar16,
127 {.fChar16 = span<const char16_t>{reinterpret_cast<const char16_t*> (_fData.data ()), _fData.size ()}}};
128 }
129 else if constexpr (sizeof (CHAR_T) == 4) {
130 // reinterpret_cast needed cuz of wchar_t case
131 return PeekSpanData{PeekSpanData::StorageCodePointType::eChar32,
132 {.fChar32 = span<const char32_t>{reinterpret_cast<const char32_t*> (_fData.data ()), _fData.size ()}}};
133 }
134 }
135
136 // Overrides for Iterable<Character>
137 // @todo - MAYBE override Apply/Find and a few others to not use default 'iterator object' implementation that has lots of indirect virtual calls
138 public:
139 virtual shared_ptr<Iterable<Character>::_IRep> Clone () const override
140 {
141 AssertNotReached (); // Since String reps now immutable, this should never be called
142 return nullptr;
143 }
144 virtual Traversal::Iterator<value_type> MakeIterator () const override
145 {
146 // NOTE - UNDETECTED CALLER ERROR - if iterator constructed and used after string rep destroyed (never changed) -- LGP 2023-07-07
147 struct MyIterRep_ final : Iterator<Character>::IRep, public Memory::UseBlockAllocationIfAppropriate<MyIterRep_> {
148 span<const CHAR_T> fData_; // clone span (not underlying data)
149 size_t fIdx_{0};
150#if qStroika_Foundation_Debug_AssertionsChecked
151 const Rep* fOwningRep_;
152#endif
153 MyIterRep_ (span<const CHAR_T> data
155 ,
156 const Rep* dbgRep
157#endif
158 )
159 : fData_{data}
161 , fOwningRep_{dbgRep}
162#endif
163 {
164#if qStroika_Foundation_Debug_AssertionsChecked
165 ++fOwningRep_->fOutstandingIterators_;
166#endif
167 }
168#if qStroika_Foundation_Debug_AssertionsChecked
169 virtual ~MyIterRep_ () override
170 {
171 Require (fOwningRep_->fOutstandingIterators_ > 0); // if this fails, probably cuz fOwningRep_ destroyed
172 --fOwningRep_->fOutstandingIterators_;
173 }
174#endif
175
176 virtual unique_ptr<Iterator<Character>::IRep> Clone () const override
177 {
178 return make_unique<MyIterRep_> (fData_.subspan (fIdx_)
180 ,
181 fOwningRep_
182#endif
183 );
184 }
185 virtual void More (optional<Character>* result, bool advance) override
186 {
187 RequireNotNull (result);
188 if (advance) [[likely]] {
189 Require (fIdx_ < fData_.size ());
190 ++fIdx_;
191 }
192 if (fIdx_ < fData_.size ()) [[likely]] {
193 // NOTE - this is safe because we never construct this type with surrogates
194 *result = Character{static_cast<char32_t> (fData_[fIdx_])};
195 }
196 else {
197 *result = nullopt;
198 }
199 }
200 virtual bool Equals (const IRep* rhs) const override
201 {
202 RequireNotNull (rhs);
203 RequireMember (rhs, MyIterRep_);
204 const MyIterRep_* rrhs = Debug::UncheckedDynamicCast<const MyIterRep_*> (rhs);
205 return fData_.data () == rrhs->fData_.data () and fIdx_ == rrhs->fIdx_;
206 }
207 };
208 return Iterator<Character>{make_unique<MyIterRep_> (this->_fData
209
211 ,
212 this
213#endif
214
215 )};
216 }
217 virtual size_t size () const override
218 {
219 return _fData.size ();
220 }
221 virtual bool empty () const override
222 {
223 return _fData.empty ();
224 }
225 virtual Traversal::Iterator<value_type> Find (const function<bool (ArgByValueType<value_type> item)>& that,
226 Execution::SequencePolicy seq) const override
227 {
228 return inherited::Find (that, seq); // @todo rewrite FOR PERFORMANCE to operate on fData_
229 }
230 };
231 };
232
233 /**
234 * Simple string rep, which dynamically allocates its storage on the heap, through an indirect pointer reference.
235 * \note This class may assure nul-terminated (kAddNullTerminator_), and so 'capacity' always at least one greater than length.
236 */
237 struct DynamicallyAllocatedString : StringRepHelperAllFitInSize_ {
238 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
239 struct Rep final : public StringRepHelperAllFitInSize_::Rep<CHAR_T>, public Memory::UseBlockAllocationIfAppropriate<Rep<CHAR_T>> {
240 private:
241 using inherited = StringRepHelperAllFitInSize_::Rep<CHAR_T>;
242
243 public:
244 Rep (span<const CHAR_T> t1)
245 : inherited{mkBuf_ (t1)}
246 {
247 }
248 Rep () = delete;
249 Rep (const Rep&) = delete;
250
251 public:
252 nonvirtual Rep& operator= (const Rep&) = delete;
253
254 public:
255 virtual ~Rep () override
256 {
257 delete[] this->_fData.data ();
258 }
259
260 private:
261 static span<CHAR_T> mkBuf_ (size_t length)
262 {
263 size_t capacity = AdjustCapacity_ (length);
264 Assert (length <= capacity);
265 if constexpr (kAddNullTerminator_) {
266 Assert (length + 1 <= capacity);
267 }
268 CHAR_T* newBuf = new CHAR_T[capacity];
269 return span{newBuf, capacity};
270 }
271 static span<CHAR_T> mkBuf_ (span<const CHAR_T> t1)
272 {
273 size_t len = t1.size ();
274 span<CHAR_T> buf = mkBuf_ (len); // note buf span is over capacity, not size
275 Assert (buf.size () >= len);
276 auto result = Memory::CopyBytes (t1, buf);
277 if constexpr (kAddNullTerminator_) {
278 Assert (len + 1 <= buf.size ());
279 *(buf.data () + len) = '\0';
280 }
281 return result; // return span of just characters, even if we have extra NUL-byte (outside span)
282 }
283
284 public:
285 // String::_IRep OVERRIDES
286 virtual const wchar_t* c_str_peek () const noexcept override
287 {
288 // @todo NOTE DEPRECATED SINCE STROIKA v3.0d13, and same for kAddNullTerminator_
289 if constexpr (kAddNullTerminator_) {
290 Assert (*(this->_fData.data () + this->_fData.size ()) == '\0'); // dont index into buf cuz we cheat and go one past end on purpose
291 return reinterpret_cast<const wchar_t*> (this->_fData.data ());
292 }
293 else {
294 return nullptr;
295 }
296 }
297
298 private:
299 // Stick nul-terminator byte just past the end of the span
300 static constexpr bool kAddNullTerminator_ = sizeof (CHAR_T) == sizeof (wchar_t); // costs nothing to nul-terminate in this case
301
302 private:
303 static size_t AdjustCapacity_ (size_t initialCapacity)
304 {
305 size_t result = initialCapacity;
306 if constexpr (kAddNullTerminator_) {
307 ++result;
308 }
309 return result;
310 }
311 };
312 };
313
314 /**
315 * Most Stroika strings use this 'rep': FixedCapacityInlineStorageString_
316 *
317 * This String rep is like BufferedString_, except that the storage is inline in one struct/allocation
318 * for better memory allocation performance, and more importantly, better locality of data (more cpu cache friendly)
319 */
320 struct FixedCapacityInlineStorageString_ : StringRepHelperAllFitInSize_ {
321 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T, size_t CAPACITY>
322 struct Rep final : public StringRepHelperAllFitInSize_::Rep<CHAR_T>,
323 public Memory::UseBlockAllocationIfAppropriate<Rep<CHAR_T, CAPACITY>> {
324 private:
325 using inherited = StringRepHelperAllFitInSize_::Rep<CHAR_T>;
326
327 private:
328 bool IncludesNullTerminator_ () const
329 {
330 if constexpr (sizeof (CHAR_T) == sizeof (wchar_t)) {
331 return this->_fData.size () < CAPACITY; // else no room
332 }
333 else {
334 return false;
335 }
336 }
337
338 private:
339 CHAR_T fBuf_[CAPACITY];
340
341 public:
342 Rep (span<const CHAR_T> t1)
343 : inherited{}
344 {
345 // must do this logic after base construction since references data member which doesn't exist
346 // til after base class construction. SHOULDNT really matter (since uninitialized data), but on
347 // g++-11, and other compilers, detected as vptr UB violation if we access first
348 Require (t1.size () <= CAPACITY);
349 inherited::operator= (Memory::CopyBytes (t1, span<CHAR_T>{fBuf_}));
350 if (IncludesNullTerminator_ ()) {
351 Assert (t1.size () + 1 <= CAPACITY);
352 fBuf_[t1.size ()] = CHAR_T{'\0'};
353 }
354 }
355 Rep () = delete;
356 Rep (const Rep&) = delete;
357
358 public:
359 nonvirtual Rep& operator= (const Rep&) = delete;
360
361 public:
362 // String::_IRep OVERRIDES
363 virtual const wchar_t* c_str_peek () const noexcept override
364 {
365 if (IncludesNullTerminator_ ()) {
366 Assert (*(this->_fData.data () + this->_fData.size ()) == '\0'); // dont index into buf cuz we cheat and go one past end on purpose
367 return reinterpret_cast<const wchar_t*> (this->_fData.data ());
368 }
369 else {
370 return nullptr;
371 }
372 }
373 };
374 };
375
376 /**
377 * For static full app lifetime string constants...
378 */
379 struct StringConstant_ : public StringRepHelperAllFitInSize_ {
380 using inherited = String;
381
382 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
383 class DirectIndexRep final : public StringRepHelperAllFitInSize_::Rep<CHAR_T>,
384 public Memory::UseBlockAllocationIfAppropriate<Rep<CHAR_T>> {
385 private:
386 using inherited = StringRepHelperAllFitInSize_::Rep<CHAR_T>;
387
388 public:
389 DirectIndexRep (span<const CHAR_T> s)
390 : inherited{s} // don't copy memory - but copy raw pointers! So they MUST BE (externally promised) 'externally owned for the application lifetime and constant' - like c++ string constants
391 {
392 }
393
394 public:
395 // String::_IRep OVERRIDES
396 virtual const wchar_t* c_str_peek () const noexcept override
397 {
398 return nullptr;
399 }
400 };
401 };
402
403 /*
404 * Used for String{move(some_string)}
405 */
406 struct StdStringDelegator_ : public StringRepHelperAllFitInSize_ {
407 using inherited = String;
408
409 template <IStdBasicStringCompatibleCharacter CHAR_T>
410 class Rep final : public StringRepHelperAllFitInSize_::Rep<CHAR_T>, public Memory::UseBlockAllocationIfAppropriate<Rep<CHAR_T>> {
411 private:
412 using inherited = StringRepHelperAllFitInSize_::Rep<CHAR_T>;
413
414 public:
415 Rep (basic_string<CHAR_T>&& s)
416 : inherited{span<const CHAR_T>{}}
417 , fMovedData_{move (s)}
418 {
419 inherited::operator= (span{fMovedData_.data (), fMovedData_.size ()}); // must grab after move
420 }
421
422 public:
423 // String::_IRep OVERRIDES
424 virtual const wchar_t* c_str_peek () const noexcept override
425 {
426 if constexpr (same_as<CHAR_T, wchar_t>) {
427 return fMovedData_.c_str ();
428 }
429 else {
430 return nullptr;
431 }
432 }
433
434 private:
435 basic_string<CHAR_T> fMovedData_;
436 };
437 };
438
439 /**
440 * Delegate to original String::Rep, and add in support for c_str ()
441 */
442 struct StringWithCStr_ : public String {
443 public:
444 class Rep final : public _IRep, public Memory::UseBlockAllocationIfAppropriate<Rep> {
445 private:
446 shared_ptr<_IRep> fUnderlyingRep_;
447 wstring fCString_;
448
449 public:
450 // Caller MUST ASSURE generates right size of Rep based on size in underlyingRepPDS
451 Rep (const shared_ptr<_IRep>& underlyingRep)
452 : fUnderlyingRep_{underlyingRep}
453 , fCString_{}
454 {
455 Memory::StackBuffer<wchar_t> possibleUsedBuf;
456 auto wideSpan = String::GetData<wchar_t> (underlyingRep->PeekData (nullopt), &possibleUsedBuf);
457 fCString_.assign (wideSpan.begin (), wideSpan.end ());
458 }
459
460 // Overrides for Iterable<Character>
461 public:
462 virtual shared_ptr<Iterable<Character>::_IRep> Clone () const override
463 {
464 return fUnderlyingRep_->Clone ();
465 }
466 virtual Traversal::Iterator<value_type> MakeIterator () const override
467 {
468 return fUnderlyingRep_->MakeIterator ();
469 }
470 virtual size_t size () const override
471 {
472 return fUnderlyingRep_->size ();
473 }
474 virtual bool empty () const override
475 {
476 return fUnderlyingRep_->empty ();
477 }
478 virtual Traversal::Iterator<value_type> Find (const function<bool (ArgByValueType<value_type> item)>& that,
479 [[maybe_unused]] Execution::SequencePolicy seq) const override
480 {
481 return fUnderlyingRep_->Find (that, seq);
482 }
483
484 // String::_IRep overrides - delegate
485 public:
486 virtual Character GetAt (size_t index) const noexcept override
487 {
488 return fUnderlyingRep_->GetAt (index);
489 }
490 virtual PeekSpanData PeekData ([[maybe_unused]] optional<PeekSpanData::StorageCodePointType> preferred) const noexcept override
491 {
492 return fUnderlyingRep_->PeekData (preferred);
493 }
494 virtual const wchar_t* c_str_peek () const noexcept override
495 {
496 return fCString_.c_str ();
497 }
498 };
499 };
500}
501
502namespace {
/**
 *  Adapter giving a std::locale facet a public destructor, so the facet can be created and destroyed
 *  as an ordinary object. All constructor arguments are forwarded to FACET.
 */
template <typename FACET>
struct deletable_facet_ final : FACET {
    template <typename... CTOR_ARGS>
    deletable_facet_ (CTOR_ARGS&&... ctorArgs)
        : FACET{std::forward<CTOR_ARGS> (ctorArgs)...}
    {
    }
    ~deletable_facet_ () = default;
};
512}
513
514/*
515 ********************************************************************************
516 ******* Characters::Private_::RegularExpression_GetCompiled ********************
517 ********************************************************************************
518 */
519const wregex& Characters::Private_::RegularExpression_GetCompiled (const RegularExpression& regExp)
520{
521 return regExp.GetCompiled ();
522}
523
524/*
525 ********************************************************************************
526 ************************************* String ***********************************
527 ********************************************************************************
528 */
529shared_ptr<String::_IRep> String::CTORFromBasicStringView_ (const basic_string_view<ASCII>& str)
530{
531 RequireExpression (Character::IsASCII (span{str.data (), str.size ()}));
532 return Memory::MakeSharedPtr<StringConstant_::DirectIndexRep<ASCII>> (span{str.data (), str.size ()});
533}
534
535shared_ptr<String::_IRep> String::CTORFromBasicStringView_ (const basic_string_view<char8_t>& str)
536{
537 if (Character::IsASCII (span{str.data (), str.size ()})) {
538 return Memory::MakeSharedPtr<StringConstant_::DirectIndexRep<ASCII>> (Memory::SpanBytesCast<span<const ASCII>> (span{str.data (), str.size ()}));
539 }
540 else {
541 return mk_ (span<const char8_t>{str.data (), str.size ()}); // copies data
542 }
543}
544
545shared_ptr<String::_IRep> String::CTORFromBasicStringView_ (const basic_string_view<char16_t>& str)
546{
547 if (UTFConvert::AllFitsInTwoByteEncoding (span{str})) {
548 return Memory::MakeSharedPtr<StringConstant_::DirectIndexRep<char16_t>> (span{str.data (), str.size ()});
549 }
550 else {
551 return mk_ (span<const char16_t>{str.data (), str.size ()}); // copies data
552 }
553}
554
555shared_ptr<String::_IRep> String::CTORFromBasicStringView_ (const basic_string_view<char32_t>& str)
556{
557 return Memory::MakeSharedPtr<StringConstant_::DirectIndexRep<char32_t>> (span{str.data (), str.size ()});
558}
559
560shared_ptr<String::_IRep> String::CTORFromBasicStringView_ (const basic_string_view<wchar_t>& str)
561{
562 return Memory::MakeSharedPtr<StringConstant_::DirectIndexRep<wchar_t>> (span{str.data (), str.size ()});
563}
564
565String String::FromStringConstant (span<const ASCII> s)
566{
567 Require (Character::IsASCII (s));
568 return String{Memory::MakeSharedPtr<StringConstant_::DirectIndexRep<ASCII>> (s)};
569}
570
571String String::FromStringConstant (span<const char16_t> s)
572{
574 return String{Memory::MakeSharedPtr<StringConstant_::DirectIndexRep<char16_t>> (s)};
575 }
576 else {
577 return String{s};
578 }
579}
580
581String String::FromStringConstant (span<const char32_t> s)
582{
583 return String{Memory::MakeSharedPtr<StringConstant_::DirectIndexRep<char32_t>> (s)};
584}
585
586String String::FromNarrowString (span<const char> s, const locale& l)
587{
588 // Note: this could use CodeCvt, but directly using std::codecvt in this case pretty simple, and
589 // more efficient this way --LGP 2023-02-14
590
591 // See http://en.cppreference.com/w/cpp/locale/codecvt/~codecvt
592 using Destructible_codecvt_byname = deletable_facet_<codecvt_byname<wchar_t, char, mbstate_t>>;
593 Destructible_codecvt_byname cvt{l.name ()};
594
595 // http://en.cppreference.com/w/cpp/locale/codecvt/in
596 mbstate_t mbstate{};
597 Memory::StackBuffer<wchar_t> targetBuf{s.size ()};
598 const char* from_next;
599 wchar_t* to_next;
600 codecvt_base::result result =
601 cvt.in (mbstate, s.data (), s.data () + s.size (), from_next, targetBuf.data (), targetBuf.data () + targetBuf.size (), to_next);
602 if (result != codecvt_base::ok) [[unlikely]] {
603 static const auto kException_ = Execution::RuntimeErrorException{"Error converting locale multibyte string to UNICODE"sv};
604 Execution::Throw (kException_);
605 }
606 return String{span<const wchar_t>{targetBuf.data (), static_cast<size_t> (to_next - targetBuf.data ())}};
607}
608
609shared_ptr<String::_IRep> String::mkEmpty_ ()
610{
611 static constexpr wchar_t kEmptyCStr_[] = L"";
612 static const shared_ptr<_IRep> s_ = Memory::MakeSharedPtr<StringConstant_::DirectIndexRep<wchar_t>> (span{std::begin (kEmptyCStr_), 0});
613 return s_;
614}
615
616template <typename CHAR_T>
617inline auto String::mk_nocheck_ (span<const CHAR_T> s) -> shared_ptr<_IRep>
618 requires (same_as<CHAR_T, ASCII> or same_as<CHAR_T, Latin1> or same_as<CHAR_T, char16_t> or same_as<CHAR_T, char32_t>)
619{
620 // No check means needed checking done before, so these assertions just help enforce that
621 if constexpr (same_as<CHAR_T, ASCII>) {
622 Require (Character::IsASCII (s)); // avoid later assertion error
623 }
624 else if constexpr (same_as<CHAR_T, Latin1>) {
625 // nothing to check
626 }
627 else if constexpr (sizeof (CHAR_T) == 2) {
628 Require (UTFConvert::AllFitsInTwoByteEncoding (s)); // avoid later assertion error
629 }
630 else {
631 // again - if larger, nothing to check
632 }
633
634 /**
635 * We want to TARGET using block-allocator of 64 bytes. This works well for typical (x86) machine
636 * caches, and divides up nicely, and leaves enuf room for a decent number of characters typically.
637 *
638 * So compute/guestimate a few sizes, and add static_asserts to check where we can. Often if these fail
639 * you can just get rid/or fix them. Not truly counted on, just trying ot generate vaguely reasonable
640 * number of characters to use.
641 */
642 constexpr size_t kBaseOfFixedBufSize_ = sizeof (StringRepHelperAllFitInSize_::Rep<CHAR_T>);
643 static_assert (kBaseOfFixedBufSize_ < 64); // this code below assumes, so must re-tune if this ever fails
644 if constexpr (qStroika_Foundation_Common_Platform_Windows and not qStroika_Foundation_Debug_AssertionsChecked) {
645 static_assert (kBaseOfFixedBufSize_ == 3 * sizeof (void*));
646 if constexpr (sizeof (void*) == 4) {
647 static_assert (kBaseOfFixedBufSize_ == 12);
648 }
649 else if constexpr (sizeof (void*) == 8) {
650 static_assert (kBaseOfFixedBufSize_ == 24);
651 }
652 }
653 constexpr size_t kOverheadSizeForMakeShared_ =
654 qStroika_Foundation_Common_Platform_Windows ? (sizeof (void*) == 4 ? 12 : 16) : sizeof (unsigned long) * 2;
655#if qStroika_Foundation_Common_Platform_Windows
656 static_assert (kOverheadSizeForMakeShared_ == sizeof (_Ref_count_base)); // not critically counted on, just to debug/fix sizes
657#endif
658 static constexpr size_t kNElts1_ = (64 - kBaseOfFixedBufSize_ - kOverheadSizeForMakeShared_) / sizeof (CHAR_T);
659 static constexpr size_t kNElts2_ = (96 - kBaseOfFixedBufSize_ - kOverheadSizeForMakeShared_) / sizeof (CHAR_T);
660 static constexpr size_t kNElts3_ = (128 - kBaseOfFixedBufSize_ - kOverheadSizeForMakeShared_) / sizeof (CHAR_T);
661
662 // These checks are NOT important, just for documentation/reference
663 if constexpr (qStroika_Foundation_Common_Platform_Windows and sizeof (CHAR_T) == 1 and not qStroika_Foundation_Debug_AssertionsChecked) {
664 if constexpr (sizeof (void*) == 4) {
665 static_assert (kNElts1_ == 40);
666 static_assert (kNElts2_ == 72);
667 static_assert (kNElts3_ == 104);
668 }
669 if constexpr (sizeof (void*) == 8) {
670 static_assert (kNElts1_ == 24);
671 static_assert (kNElts2_ == 56);
672 static_assert (kNElts3_ == 88);
673 }
674 }
675
676 static_assert (qStroika_Foundation_Debug_AssertionsChecked or kNElts1_ >= 6); // crazy otherwise
677 static_assert (kNElts2_ > kNElts1_); // ""
678 static_assert (kNElts3_ > kNElts2_); // ""
679
680 static_assert (sizeof (FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts1_>) == 64 - kOverheadSizeForMakeShared_); // not quite guaranteed but close
681 static_assert (sizeof (FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts2_>) == 96 - kOverheadSizeForMakeShared_); // ""
682 static_assert (sizeof (FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts3_>) == 128 - kOverheadSizeForMakeShared_); // ""
683
684 size_t sz = s.size ();
685 if (sz <= kNElts1_) {
686 return Memory::MakeSharedPtr<FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts1_>> (s);
687 }
688 else if (sz <= kNElts2_) {
689 return Memory::MakeSharedPtr<FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts2_>> (s);
690 }
691 else if (sz <= kNElts3_) {
692 return Memory::MakeSharedPtr<FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts3_>> (s);
693 }
694 return Memory::MakeSharedPtr<DynamicallyAllocatedString::Rep<CHAR_T>> (s);
695}
696
697template <>
698auto String::mk_ (basic_string<char>&& s) -> shared_ptr<_IRep>
699{
700 Character::CheckASCII (span{s.data (), s.size ()});
701 return Memory::MakeSharedPtr<StdStringDelegator_::Rep<ASCII>> (move (s));
702}
703
704template <>
705auto String::mk_ (basic_string<char16_t>&& s) -> shared_ptr<_IRep>
706{
707 if (UTFConvert::AllFitsInTwoByteEncoding (Memory::ConstSpan (span{s.data (), s.size ()}))) {
708 return Memory::MakeSharedPtr<StdStringDelegator_::Rep<char16_t>> (move (s));
709 }
710 // copy the data if any surrogates
711 Memory::StackBuffer<char32_t> wideUnicodeBuf{Memory::eUninitialized, UTFConvert::ComputeTargetBufferSize<char32_t> (span{s.data (), s.size ()})};
712 return mk_nocheck_ (Memory::ConstSpan (UTFConvert::kThe.ConvertSpan (span{s.data (), s.size ()}, span{wideUnicodeBuf})));
713}
714
715template <>
716auto String::mk_ (basic_string<char32_t>&& s) -> shared_ptr<_IRep>
717{
718 return Memory::MakeSharedPtr<StdStringDelegator_::Rep<char32_t>> (move (s));
719}
720
721template <>
722auto String::mk_ (basic_string<wchar_t>&& s) -> shared_ptr<_IRep>
723{
724 if constexpr (sizeof (wchar_t) == 2) {
725 if (UTFConvert::AllFitsInTwoByteEncoding (Memory::ConstSpan (span{s.data (), s.size ()}))) {
726 return Memory::MakeSharedPtr<StdStringDelegator_::Rep<wchar_t>> (move (s));
727 }
728 // copy the data if any surrogates
729 Memory::StackBuffer<char32_t> wideUnicodeBuf{Memory::eUninitialized,
730 UTFConvert::ComputeTargetBufferSize<char32_t> (span{s.data (), s.size ()})};
731 return mk_nocheck_ (Memory::ConstSpan (UTFConvert::kThe.ConvertSpan (span{s.data (), s.size ()}, span{wideUnicodeBuf})));
732 }
733 else {
734 return Memory::MakeSharedPtr<StdStringDelegator_::Rep<wchar_t>> (move (s));
735 }
736}
737
738String String::Concatenate_ (const String& rhs) const
739{
740 // KISS, simple default 'fall-thru' case
742 span leftSpan = GetData (&ignoredA);
744 span rightSpan = rhs.GetData (&ignoredB);
745 Memory::StackBuffer<char32_t> buf{Memory::eUninitialized, leftSpan.size () + rightSpan.size ()};
746 copy (leftSpan.begin (), leftSpan.end (), buf.data ());
747 copy (rightSpan.begin (), rightSpan.end (), buf.data () + leftSpan.size ());
748 return mk_ (span{buf});
749}
750
751void String::SetCharAt (Character c, size_t i)
752{
753 // @Todo - redo with check if char is actually changing and if so use
754 // mk/4 4 arg string maker instead.??? Or some such...
755 Require (i >= 0);
756 Require (i < size ());
757 // Expensive, but you can use StringBuilder directly to avoid the performance costs
758 StringBuilder sb{*this};
759 Require (i < size ());
760 sb.SetAt (c, i);
761 *this = sb;
762}
763
764String String::InsertAt (span<const Character> s, size_t at) const
765{
766 Require (at >= 0);
767 Require (at <= size ());
768 if (s.empty ()) {
769 return *this;
770 }
772 span<const Character> thisStrData = GetData (&ignored1);
773 StringBuilder sb{thisStrData.subspan (0, at)};
774 sb.Append (s);
775 sb.Append (thisStrData.subspan (at));
776 return sb;
777}
778
779String String::RemoveAt (size_t from, size_t to) const
780{
781 Require (from <= to);
782 Require (to <= size ());
783 if (from == to) {
784 return *this;
785 }
786 if (from == 0) {
787 return SubString (to);
788 }
789 _SafeReadRepAccessor accessor{this};
790 size_t length = accessor._ConstGetRep ().size ();
791 if (to == length) {
792 return SubString (0, from);
793 }
794 else {
796 span d = GetData (&ignored1);
797 Memory::StackBuffer<char32_t> buf{Memory::eUninitialized, d.size () - (to - from)};
798 span<char32_t> bufSpan{buf.data (), buf.size ()};
799 span s1 = d.subspan (0, from);
800 span s2 = d.subspan (to);
801 Memory::CopyBytes (s1, bufSpan);
802 Memory::CopyBytes (s2, bufSpan.subspan (s1.size ()));
803 return String{mk_ (bufSpan)};
804 }
805}
806
808{
809 String tmp = {*this};
810 if (auto o = tmp.Find (c, eWithCase)) {
811 return tmp.RemoveAt (*o);
812 }
813 return tmp;
814}
815String String::RemoveFirstIf (const String& subString) const
816{
817 if (auto o = this->Find (subString, eWithCase)) {
818 return this->SubString (0, *o) + this->SubString (*o + subString.length ());
819 }
820 return *this;
821}
822
824{
825 // @todo REIMPL WITH STRINGBUILDER
826 // quick and dirty inefficient implementation
827 String tmp = {*this};
828 while (auto o = tmp.Find (c, eWithCase)) {
829 tmp = tmp.RemoveAt (*o);
830 }
831 return tmp;
832}
833String String::RemoveAll (const String& subString) const
834{
835 // @todo REIMPL WITH STRINGBUILDER
836 // quick and dirty inefficient implementation
837 String tmp = {*this};
838 while (auto o = tmp.Find (subString, eWithCase)) {
839 tmp = tmp.SubString (0, *o) + tmp.SubString (*o + subString.length ());
840 }
841 return tmp;
842}
843
844optional<size_t> String::Find (Character c, size_t startAt, CompareOptions co) const
845{
846 PeekSpanData pds = GetPeekSpanData<ASCII> ();
847 // OPTIMIZED PATHS: Common case(s) and should be fast
849 if (c.IsASCII ()) {
850 span<const char> examineSpan = pds.fAscii.subspan (startAt);
851 if (co == eWithCase) {
852 if (auto i = std::find (examineSpan.begin (), examineSpan.end (), c.GetAsciiCode ()); i != examineSpan.end ()) {
853 return i - examineSpan.begin () + startAt;
854 }
855 }
856 else {
857 char lc = c.ToLowerCase ().GetAsciiCode ();
858 size_t reportIdx = startAt;
859 for (auto ci : examineSpan) {
860 if (tolower (ci) == lc) {
861 return reportIdx;
862 }
863 ++reportIdx;
864 }
865 }
866 return nullopt; // not found, possibly cuz not ascii
867 }
868 }
869 // fallback on more generic algorithm - and copy to full character objects
870 //
871 // performance notes
872 // Could iterate using CharAt() and that would perform better in the case where you find c early
873 // in a string, and the string is short. The problem with the current code is that it converts the
874 // entire string (could be long) and then might not look at much of the converted data.
875 // on the other hand, if our reps are either 'ascii or char32_t wide' - which we may end up with - then
876 // this isn't too bad - cuz no copying for char32_ case either...
877 Memory::StackBuffer<Character> maybeIgnoreBuf;
878 span<const Character> charSpan = GetData (pds, &maybeIgnoreBuf);
879 Require (startAt <= charSpan.size ());
880 span<const Character> examineSpan = charSpan.subspan (startAt);
881 switch (co) {
882 case eCaseInsensitive: {
883 Character lcc = c.ToLowerCase ();
884 for (auto i = examineSpan.begin (); i != examineSpan.end (); ++i) {
885 if (i->ToLowerCase () == lcc) {
886 return startAt + (i - examineSpan.begin ());
887 }
888 }
889 } break;
890 case eWithCase: {
891 if (auto i = std::find (examineSpan.begin (), examineSpan.end (), c); i != examineSpan.end ()) {
892 return startAt + i - examineSpan.begin ();
893 }
894 } break;
895 }
896 return nullopt; // not found any which way
897}
898
899optional<size_t> String::Find (const String& subString, size_t startAt, CompareOptions co) const
900{
901 //@todo: FIX HORRIBLE PERFORMANCE!!!
902 _SafeReadRepAccessor accessor{this};
903 Require (startAt <= accessor._ConstGetRep ().size ());
904
905 size_t subStrLen = subString.size ();
906 if (subStrLen == 0) {
907 return (accessor._ConstGetRep ().size () == 0) ? optional<size_t>{} : 0;
908 }
909 if (accessor._ConstGetRep ().size () < subStrLen) {
910 return {}; // important test cuz size_t is unsigned
911 }
912
913 size_t limit = accessor._ConstGetRep ().size () - subStrLen;
914 switch (co) {
915 case eCaseInsensitive: {
916 for (size_t i = startAt; i <= limit; ++i) {
917 for (size_t j = 0; j < subStrLen; ++j) {
918 if (accessor._ConstGetRep ().GetAt (i + j).ToLowerCase () != subString[j].ToLowerCase ()) {
919 goto nogood1;
920 }
921 }
922 return i;
923 nogood1:;
924 }
925 } break;
926 case eWithCase: {
927 for (size_t i = startAt; i <= limit; ++i) {
928 for (size_t j = 0; j < subStrLen; ++j) {
929 if (accessor._ConstGetRep ().GetAt (i + j) != subString[j]) {
930 goto nogood2;
931 }
932 }
933 return i;
934 nogood2:;
935 }
936 } break;
937 }
938 return {};
939}
940
941optional<pair<size_t, size_t>> String::Find (const RegularExpression& regEx, size_t startAt) const
942{
943 Require (startAt <= size ());
944 wstring tmp = As<wstring> ();
945 Require (startAt < tmp.size ());
946 tmp = tmp.substr (startAt);
947 wsmatch res;
948 regex_search (tmp, res, regEx.GetCompiled ());
949 if (res.size () >= 1) {
950 size_t startOfMatch = startAt + res.position ();
951 return pair<size_t, size_t>{startOfMatch, startOfMatch + res.length ()};
952 }
953 return {};
954}
955
956Containers::Sequence<size_t> String::FindEach (const String& string2SearchFor, CompareOptions co) const
957{
958 vector<size_t> result;
959 for (optional<size_t> i = Find (string2SearchFor, 0, co); i; i = Find (string2SearchFor, *i, co)) {
960 result.push_back (*i);
961 *i += string2SearchFor.length (); // this cannot point past end of this string because we FOUND string2SearchFor
962 }
963 return Containers::Concrete::Sequence_stdvector{move (result)};
964}
965
967{
 // NOTE(review): the signature line of this definition is missing from this extract. From the body it takes
 // a regular expression `regEx` and returns a sequence of (position, length) pairs - presumably the
 // RegularExpression overload of FindEach; confirm against the header.
 968 vector<pair<size_t, size_t>> result;
 969 //@TODO - FIX - IF we get back zero length match
 970 wstring tmp{As<wstring> ()};
 971 wsmatch res;
 // Only ONE regex_search is performed, so `res` holds the sub-matches of the first match (index 0 is the
 // whole match, later indexes are capture groups); the loop below walks those, not successive matches.
 972 regex_search (tmp, res, regEx.GetCompiled ());
 973 size_t nMatches = res.size ();
 974 result.reserve (nMatches);
 975 for (size_t mi = 0; mi < nMatches; ++mi) {
 976 size_t matchLen = res.length (mi); // avoid populating with lots of empty matches - special case of empty search
 977 if (matchLen != 0) {
 // NOTE(review): the second element stored here is a LENGTH, whereas Find (const RegularExpression&, size_t)
 // in this file returns (start, end) - confirm which convention callers expect.
 978 result.push_back (pair<size_t, size_t>{res.position (mi), matchLen});
 979 }
 980 }
 981 return Containers::Concrete::Sequence_stdvector{move (result)};
 982}
983
985{
 // NOTE(review): the signature line of this definition is missing from this extract. From the body it takes
 // a regular expression `regEx` and returns one RegularExpressionMatch per match found in this string.
 986 vector<RegularExpressionMatch> result;
 987 wstring tmp{As<wstring> ()};
 // wsregex_iterator visits each successive match of the expression in tmp
 988 for (wsregex_iterator i = wsregex_iterator{tmp.begin (), tmp.end (), regEx.GetCompiled ()}; i != wsregex_iterator (); ++i) {
 989 wsmatch match{*i};
 990 Assert (match.size () != 0);
 991 size_t n = match.size ();
 // NOTE(review): the declaration of `s` (the container that receives the capture groups) is missing from
 // this extract. Sub-match 0 is the whole match; sub-matches 1..n-1 are appended to `s`.
 993 for (size_t j = 1; j < n; ++j) {
 994 s.Append (match.str (j));
 995 }
 996 result.push_back (RegularExpressionMatch{match.str (0), s});
 997 }
 998 return Containers::Concrete::Sequence_stdvector{move (result)};
 999}
1000
1002{
 // NOTE(review): the signature line of this definition is missing from this extract. From the body it takes
 // a regular expression `regEx` and returns the text of every match, in order of occurrence.
 1003 vector<String> result;
 1004 wstring tmp{As<wstring> ()};
 // wsregex_iterator visits each successive match; i->str () is the full text of that match
 1005 for (wsregex_iterator i = wsregex_iterator{tmp.begin (), tmp.end (), regEx.GetCompiled ()}; i != wsregex_iterator (); ++i) {
 1006 result.push_back (String{i->str ()});
 1007 }
 1008 return Containers::Concrete::Sequence_stdvector{move (result)};
 1009}
1010
1011optional<size_t> String::RFind (Character c) const noexcept
1012{
1013 //@todo: FIX HORRIBLE PERFORMANCE!!!
1014 _SafeReadRepAccessor accessor{this};
1015 const _IRep& useRep = accessor._ConstGetRep ();
1016 size_t length = useRep.size ();
1017 for (size_t i = length; i > 0; --i) {
1018 if (useRep.GetAt (i - 1) == c) {
1019 return i - 1;
1020 }
1021 }
1022 return nullopt;
1023}
1024
1025optional<size_t> String::RFind (const String& subString) const
1026{
1027 //@todo: FIX HORRIBLE PERFORMANCE!!!
1028 /*
1029 * Do quickie implementation, and don't worry about efficiency...
1030 */
1031 size_t subStrLen = subString.size ();
1032 if (subStrLen == 0) {
1033 return ((size () == 0) ? optional<size_t>{} : size () - 1);
1034 }
1035
1036 size_t limit = size () - subStrLen + 1;
1037 for (size_t i = limit; i > 0; --i) {
1038 if (SubString (i - 1, i - 1 + subStrLen) == subString) {
1039 return i - 1;
1040 }
1041 }
1042 return nullopt;
1043}
1044
1045String String::Replace (size_t from, size_t to, const String& replacement) const
1046{
1048 span<const wchar_t> thisSpan = GetData (&ignored);
1049 Require (from <= to);
1050 Require (to <= this->size ());
1051 Assert (to < thisSpan.size ());
1052 StringBuilder sb{thisSpan.subspan (0, from)};
1053 sb.Append (replacement);
1054 sb.Append (thisSpan.subspan (to));
1055 Ensure (sb == SubString (0, from) + replacement + SubString (to));
1056 return sb;
1057}
1058
1059bool String::StartsWith (const Character& c, CompareOptions co) const
1060{
1061 _SafeReadRepAccessor accessor{this};
1062 if (accessor._ConstGetRep ().size () == 0) {
1063 return false;
1064 }
1065 return Character::EqualsComparer{co}(accessor._ConstGetRep ().GetAt (0), c);
1066}
1067
1068bool String::StartsWith (const String& subString, CompareOptions co) const
1069{
1070 Require (not subString.empty ());
1071 if (subString.size () > size ()) {
1072 return false;
1073 }
1074#if qStroika_Foundation_Debug_AssertionsChecked
1075 bool referenceResult = ThreeWayComparer{co}(SubString (0, subString.size ()), subString) == 0;
1076#endif
1077 Memory::StackBuffer<Character> maybeIgnoreBuf1;
1078 Memory::StackBuffer<Character> maybeIgnoreBuf2;
1079 span<const Character> subStrData = subString.GetData (&maybeIgnoreBuf1);
1080 span<const Character> thisData = GetData (&maybeIgnoreBuf2);
1081 bool result = Character::Compare (thisData.subspan (0, subStrData.size ()), subStrData, co) == 0;
1082#if qStroika_Foundation_Debug_AssertionsChecked
1083 Ensure (result == referenceResult);
1084#endif
1085 return result;
1086}
1087
1088bool String::EndsWith (const Character& c, CompareOptions co) const
1089{
1090 _SafeReadRepAccessor accessor{this};
1091 const _IRep& useRep = accessor._ConstGetRep ();
1092 size_t thisStrLen = useRep.size ();
1093 if (thisStrLen == 0) {
1094 return false;
1095 }
1096 return Character::EqualsComparer{co}(useRep.GetAt (thisStrLen - 1), c);
1097}
1098
1099bool String::EndsWith (const String& subString, CompareOptions co) const
1100{
1101 Require (not subString.empty ());
1102 _SafeReadRepAccessor subStrAccessor{&subString};
1103 _SafeReadRepAccessor accessor{this};
1104 size_t thisStrLen = accessor._ConstGetRep ().size ();
1105 size_t subStrLen = subString.size ();
1106 if (subStrLen > thisStrLen) {
1107 return false;
1108 }
1109#if qStroika_Foundation_Debug_AssertionsChecked
1110 bool referenceResult = String::EqualsComparer{co}(SubString (thisStrLen - subStrLen, thisStrLen), subString);
1111#endif
1112 Memory::StackBuffer<Character> maybeIgnoreBuf1;
1113 Memory::StackBuffer<Character> maybeIgnoreBuf2;
1114 span<const Character> subStrData = subString.GetData (&maybeIgnoreBuf1);
1115 span<const Character> thisData = GetData (&maybeIgnoreBuf2);
1116 bool result = Character::Compare (thisData.subspan (thisStrLen - subStrLen), subStrData, co) == 0;
1117#if qStroika_Foundation_Debug_AssertionsChecked
1118 Ensure (result == referenceResult);
1119#endif
1120 return result;
1121}
1122
1123String String::AssureEndsWith (const Character& c, CompareOptions co) const
1124{
1125 if (EndsWith (c, co)) {
1126 return *this;
1127 }
1128 StringBuilder sb = *this;
1129 sb.Append (c);
1130 return sb;
1131}
1132
1133bool String::Matches (const RegularExpression& regEx) const
1134{
1135 wstring tmp{As<wstring> ()};
1136 return regex_match (tmp.begin (), tmp.end (), regEx.GetCompiled ());
1137}
1138
1139bool String::Matches (const RegularExpression& regEx, Sequence<String>* matches) const
1140{
1141 RequireNotNull (matches);
1142 //tmphack
1143 wstring tmp{As<wstring> ()};
1144 wsmatch base_match;
1145 if (regex_match (tmp, base_match, regEx.GetCompiled ())) {
1146 matches->clear ();
1147 for (size_t i = 1; i < base_match.size (); ++i) {
1148 matches->Append (base_match[i].str ());
1149 }
1150 return true;
1151 }
1152 return false;
1153}
1154
1155String String::ReplaceAll (const RegularExpression& regEx, const String& with) const
1156{
1157 return String{regex_replace (As<wstring> (), regEx.GetCompiled (), with.As<wstring> ())};
1158}
1159
1160String String::ReplaceAll (const String& string2SearchFor, const String& with, CompareOptions co) const
1161{
1162 Require (not string2SearchFor.empty ());
1163 // simplistic quickie impl...
1164 String result{*this};
1165 optional<size_t> i{0};
1166 while ((i = result.Find (string2SearchFor, *i, co))) {
1167 result = result.SubString (0, *i) + with + result.SubString (*i + string2SearchFor.length ());
1168 *i += with.length ();
1169 }
1170 return result;
1171}
1172
1173String String::ReplaceAll (const function<bool (Character)>& replaceCharP, const String& with) const
1174{
1175 StringBuilder sb;
1176 for (Character i : *this) {
1177 if (replaceCharP (i)) {
1178 sb << with;
1179 }
1180 else {
1181 sb << i;
1182 }
1183 }
1184 return sb;
1185}
1186
1187String String::ReplaceAll (const Set<Character>& charSet, const String& with) const
1188{
1189 StringBuilder sb;
1190 for (Character i : *this) {
1191 if (charSet.Contains (i)) {
1192 sb << with;
1193 }
1194 else {
1195 sb << i;
1196 }
1197 }
1198 return sb;
1199}
1200
1202{
 // NOTE(review): the signature line of this definition is missing from this extract. From the body: returns
 // a copy of this string with every "\r\n" pair and every lone '\r' turned into a single '\n', or this
 // string itself when it contains no '\r'.
 1203 PeekSpanData pds = GetPeekSpanData<ASCII> ();
 1204 Memory::StackBuffer<Character> maybeIgnoreBuf;
 1205 span<const Character> charSpan = GetData (pds, &maybeIgnoreBuf);
 1206 StringBuilder sb;
 // everChanged records whether any '\r' was seen; if not, sb is discarded and *this is returned
 1207 bool everChanged{false};
 1208 for (auto ci = charSpan.begin (); ci != charSpan.end (); ++ci) {
 1209 Character c = *ci;
 1210 if (c == '\r') {
 1211 // peek at next character - and if we have a CRLF sequence - then advance pointer
 1212 // (so we skip next NL) and pretend this was an NL..
 1213 if (ci + 1 != charSpan.end () and *(ci + 1) == '\n') {
 1214 ++ci;
 1215 }
 1216 everChanged = true;
 1217 c = '\n';
 1218 }
 1219 sb << c;
 1220 }
 1221 if (everChanged) {
 1222 return sb;
 1223 }
 1224 else {
 1225 return *this;
 1226 }
 1227}
1228
1229String String::NormalizeSpace (Character useSpaceCharacter) const
1230{
1231 return ReplaceAll ("\\s+"_RegEx, String{useSpaceCharacter});
1232}
1233
1238Sequence<String> String::Tokenize (const function<bool (Character)>& isTokenSeparator) const
1239{
1241 bool inToken = false;
1242 StringBuilder curToken;
1243 size_t len = size ();
1244 for (size_t i = 0; i != len; ++i) {
1245 Character c = GetCharAt (i);
1246 bool newInToken = not isTokenSeparator (c);
1247 if (inToken != newInToken) {
1248 if (inToken) {
1249 String s{curToken.str ()};
1250 r += s;
1251 curToken.clear ();
1252 inToken = false;
1253 }
1254 else {
1255 inToken = true;
1256 }
1257 }
1258 if (inToken) {
1259 curToken << c;
1260 }
1261 }
1262 if (inToken) {
1263 String s{curToken.str ()};
1264 r += s;
1265 }
1266 return r;
1267}
1268
1269Sequence<String> String::Tokenize (const RegularExpression& isSeparator) const
1270{
1272 size_t len = this->length ();
1273 for (size_t startAt = 0; startAt < len;) {
1274 if (optional<pair<size_t, size_t>> ofi = Find (isSeparator, startAt)) {
1275 Assert (ofi->first >= startAt);
1276 Assert (ofi->first <= ofi->second);
1277 if (ofi->first == ofi->second) [[unlikely]] {
1278 static const auto kException_ =
1279 Execution::RuntimeErrorException{"separator regular expression argument to Tokenize must be non-empty or not match"sv};
1280 Execution::Throw (kException_);
1281 }
1282 if (ofi->first > startAt) {
1283 r += SubString (startAt, ofi->first);
1284 }
1285 else {
1286 Assert (startAt == 0); // special case - start of string
1287 }
1288 startAt = ofi->second;
1289 Assert (startAt <= len);
1290 }
1291 else {
1292 r += SubString (startAt); // if no match, the rest of the string is a non-separator
1293 break;
1294 }
1295 }
1296 return r;
1297}
1298Sequence<String> String::Tokenize (const Set<Character>& delimiters) const
1299{
1300 /*
1301 * @todo Inefficient impl, to encourage code saving. Do more efficiently.
1302 */
1303 return Tokenize ([delimiters] (Character c) -> bool { return delimiters.Contains (c); });
1304}
1305
1307{
 // NOTE(review): the signature line of this definition and the declaration of the result container `r` are
 // missing from this extract. From the body: splits this string into lines on "\r\n", lone '\r' or '\n'
 // (the terminators themselves are not included in the lines) and returns the collected lines.
 1309 StringBuilder curLineSB;
 1310 for (auto i = this->MakeIterator (); i; ++i) {
 1311 Character c = *i;
 1312 // look for \r, \r\n, or \n
 1313 switch (c.GetCharacterCode ()) {
 1314 case '\r': {
 // look one character ahead on a copy of the iterator; consume the '\n' of a "\r\n" pair
 1315 auto ii = i;
 1316 ++ii;
 1317 if (ii and *ii == '\n') {
 1318 i = ii;
 1319 }
 1320 r += curLineSB.str ();
 1321 curLineSB.clear ();
 1322 break;
 1323 }
 1324 case '\n': {
 1325 r += curLineSB.str ();
 1326 curLineSB.clear ();
 1327 break;
 1328 }
 1329 default: {
 1330 curLineSB.push_back (c);
 1331 break;
 1332 }
 1333 }
 1334 }
 // a trailing terminator does NOT produce an extra empty line; only leftover text is flushed
 1335 if (not curLineSB.empty ()) { // non-terminated lines included
 1336 r += curLineSB.str ();
 1337 }
 1338 return r;
 1339}
1340
1341Sequence<String> String::Grep (const String& fgrepArg) const
1342{
1344 for (auto i : AsLines ()) {
1345 if (i.Contains (fgrepArg)) {
1346 r += i;
1347 }
1348 }
1349 return r;
1350}
1351Sequence<String> String::Grep (const RegularExpression& egrepArg) const
1352{
1354 for (auto i : AsLines ()) {
1355 if (i.Matches (egrepArg)) {
1356 r += i;
1357 }
1358 }
1359 return r;
1360}
1361
1362optional<String> String::Col (size_t i) const
1363{
1364 static const RegularExpression kWS_ = "\\s+"_RegEx;
1365 return Col (i, kWS_);
1366}
1367
1368optional<String> String::Col (size_t i, const RegularExpression& separator) const
1369{
1370 return Tokenize (separator).Nth (i);
1371}
1372
/*
 *  Build the substring [from, to) of this string, using the already-acquired accessor.
 *  Dispatches on the representation reported by PeekData so the characters can be sliced
 *  directly out of the underlying span.
 *
 *  \req from <= to
 *  \req to <= size ()
 */
1373String String::SubString_ (const _SafeReadRepAccessor& thisAccessor, size_t from, size_t to) const
1374{
 // compile-time switch: when true, asking for the whole string returns *this instead of building a new one
 1375 constexpr bool kWholeStringOptionization_ =
 1376 false; // empirically, this costs about 1%. My WAG is that 1% cost not a good tradeoff cuz I dont think this gets triggered that often - LGP 2023-09-26
 1377 Require (from <= to);
 1378 Require (to <= this->size ());
 1379
 1380 // Could do this more simply, but since this function is a bottleneck, handle each representation case separately
 1381 if (from == to) [[unlikely]] {
 1382 return mkEmpty_ ();
 1383 }
 1384 PeekSpanData psd = thisAccessor._ConstGetRep ().PeekData (nullopt);
 1385 switch (psd.fInCP) {
 1386 case PeekSpanData::eAscii: {
 1387 if constexpr (kWholeStringOptionization_) {
 1388 if (from == 0 and to == psd.fAscii.size ()) [[unlikely]] {
 1389 return *this; // unclear if this optimization is worthwhile
 1390 }
 1391 }
 1392 return mk_nocheck_ (psd.fAscii.subspan (from, to - from)); // no check cuz we already know its all ASCII and nothing smaller
 1393 }
 // NOTE(review): the `case` label opening the single-byte Latin1 branch is missing from this extract.
 1395 if constexpr (kWholeStringOptionization_) {
 1396 if (from == 0 and to == psd.fSingleByteLatin1.size ()) [[unlikely]] {
 1397 return *this; // unclear if this optimization is worthwhile
 1398 }
 1399 }
 1400 return mk_ (psd.fSingleByteLatin1.subspan (from, to - from)); // note still needs to re-examine text, cuz subset maybe pure ascii (etc)
 1401 }
 1402 case PeekSpanData::eChar16: {
 1403 if constexpr (kWholeStringOptionization_) {
 1404 if (from == 0 and to == psd.fChar16.size ()) [[unlikely]] {
 1405 return *this; // unclear if this optimization is worthwhile
 1406 }
 1407 }
 1408 return mk_ (psd.fChar16.subspan (from, to - from)); // note still needs to re-examine text, cuz subset maybe pure ascii (etc)
 1409 }
 1410 case PeekSpanData::eChar32: {
 1411 if constexpr (kWholeStringOptionization_) {
 1412 if (from == 0 and to == psd.fChar32.size ()) [[unlikely]] {
 1413 return *this; // unclear if this optimization is worthwhile
 1414 }
 1415 }
 1416 return mk_ (psd.fChar32.subspan (from, to - from)); // note still needs to re-examine text, cuz subset maybe pure ascii (etc)
 1417 }
 1418 default:
 // NOTE(review): a line is missing from this extract here; an unrecognized representation yields an empty string.
 1420 return String{};
 1421 }
 1422}
1423
1424String String::Repeat (unsigned int count) const
1425{
1426 switch (count) {
1427 case 0:
1428 return String{};
1429 case 1:
1430 return *this;
1431 case 2:
1432 return *this + *this;
1433 default: {
1434 StringBuilder result;
1435 for (unsigned int i = 0; i < count; ++i) {
1436 result << *this;
1437 }
1438 return result;
1439 }
1440 }
1441}
1442
/*
 *  Return this string with every leading character for which shouldBeTrimmed returns true removed.
 *  Returns *this when nothing is trimmed and an empty string when everything is trimmed.
 *
 *  \req shouldBeTrimmed != nullptr
 */
1443String String::LTrim (bool (*shouldBeTrimmed) (Character)) const
1444{
 1445 RequireNotNull (shouldBeTrimmed);
 // referenceImpl: simple per-character version (GetAt/SubString). Used for the representation not handled
 // by the switch below, and as the cross-check for commonAlgorithm when assertions are enabled.
 1446 auto referenceImpl = [&] () {
 1447 _SafeReadRepAccessor accessor{this};
 1448 size_t length = accessor._ConstGetRep ().size ();
 1449 for (size_t i = 0; i < length; ++i) {
 1450 if (not(*shouldBeTrimmed) (accessor._ConstGetRep ().GetAt (i))) {
 1451 if (i == 0) {
 1452 return *this; // no change in string
 1453 }
 1454 else {
 1455 return SubString (i, length);
 1456 }
 1457 }
 1458 }
 1459 return String{}; // all trimmed
 1460 };
 // commonAlgorithm: same logic, but scanning the raw character span directly (one array element per character)
 1461 auto commonAlgorithm = [&]<typename T> (span<const T> lowLevelCharSpan) -> String {
 1462 size_t length = lowLevelCharSpan.size ();
 1463 for (size_t i = 0; i < length; ++i) {
 1464 static_assert (Common::IAnyOf<T, ASCII, Latin1, char32_t>); // this works for ASCII, Latin1, char32_t, but for char16_t - not so much - trickier
 1465 Character c{lowLevelCharSpan[i]};
 1466 // drop not-so-subtle hint to optimizer this is likely the function, and can be called, and hopefully hoisted outside the loop, and inlined
 1467 bool thisCharacterTrimmed = [&] () {
 1468 if (shouldBeTrimmed == (bool (*) (Character))Character::IsWhitespace) [[likely]] {
 1469 return Character::IsWhitespace (c);
 1470 }
 1471 else {
 1472 return shouldBeTrimmed (c);
 1473 }
 1474 }();
 1475 if (not thisCharacterTrimmed) {
 1476 if (i == 0) {
 1477#if qStroika_Foundation_Debug_AssertionsChecked
 1478 Assert (*this == referenceImpl ());
 1479#endif
 1480 return *this; // no change in string
 1481 }
 1482 else {
 1483#if qStroika_Foundation_Debug_AssertionsChecked
 1484 Assert (mk_ (lowLevelCharSpan.subspan (i)) == referenceImpl ());
 1485#endif
 1486 return mk_ (lowLevelCharSpan.subspan (i));
 1487 }
 1488 }
 1489 }
 1490 return String{}; // all trimmed
 1491 };
 1492 _SafeReadRepAccessor accessor{this};
 1493 PeekSpanData psd = accessor._ConstGetRep ().PeekData (nullopt);
 1494 switch (psd.fInCP) {
 1495 case PeekSpanData::eAscii: {
 1496 return commonAlgorithm (psd.fAscii);
 1497 }
 // NOTE(review): the `case` label opening the single-byte Latin1 branch is missing from this extract.
 1499 return commonAlgorithm (psd.fSingleByteLatin1);
 1500 }
 1501 case PeekSpanData::eChar32: {
 1502 return commonAlgorithm (psd.fChar32);
 1503 }
 1504 }
 1505 return referenceImpl (); // due to tricks with surrogates, and rarity, not worth worrying about char16_t case
 1506}
1507
/*
 *  Return this string with every trailing character for which shouldBeTrimmed returns true removed.
 *  Returns *this when nothing is trimmed and an empty string when everything is trimmed.
 *
 *  \req shouldBeTrimmed != nullptr
 */
1508String String::RTrim (bool (*shouldBeTrimmed) (Character)) const
1509{
 1510 RequireNotNull (shouldBeTrimmed);
 // referenceImpl: simple per-character version (GetAt/SubString). Used for the representation not handled
 // by the switch below, and as the cross-check for commonAlgorithm when assertions are enabled.
 1511 auto referenceImpl = [&] () {
 1512 _SafeReadRepAccessor accessor{this};
 1513 ptrdiff_t length = accessor._ConstGetRep ().size ();
 // endOfFirstTrim is one past the last character kept; it walks backwards over trimmable characters
 1514 ptrdiff_t endOfFirstTrim = length;
 1515 for (; endOfFirstTrim != 0; --endOfFirstTrim) {
 1516 if ((*shouldBeTrimmed) (accessor._ConstGetRep ().GetAt (endOfFirstTrim - 1))) {
 1517 // keep going backwards
 1518 }
 1519 else {
 1520 break;
 1521 }
 1522 }
 1523 if (endOfFirstTrim == 0) {
 1524 return String{}; // all trimmed
 1525 }
 1526 else if (endOfFirstTrim == length) {
 1527 return *this; // nothing trimmed
 1528 }
 1529 else {
 1530 return SubString (0, endOfFirstTrim);
 1531 }
 1532 };
 1533
 // commonAlgorithm: same logic, but scanning the raw character span directly (one array element per character)
 1534 auto commonAlgorithm = [&]<typename T> (span<const T> lowLevelCharSpan) -> String {
 1535 size_t length = lowLevelCharSpan.size ();
 1536 ptrdiff_t endOfFirstTrim = length;
 1537 for (; endOfFirstTrim != 0; --endOfFirstTrim) {
 1538 static_assert (Common::IAnyOf<T, ASCII, Latin1, char32_t>); // this works for ASCII, Latin1, char32_t, but for char16_t - not so much - trickier
 1539 Character c{lowLevelCharSpan[endOfFirstTrim - 1]};
 1540 // drop not-so-subtle hint to optimizer this is likely the function, and can be called, and hopefully hoisted outside the loop, and inlined
 1541 bool thisCharacterTrimmed = [&] () {
 1542 if (shouldBeTrimmed == (bool (*) (Character))Character::IsWhitespace) [[likely]] {
 1543 return Character::IsWhitespace (c);
 1544 }
 1545 else {
 1546 return shouldBeTrimmed (c);
 1547 }
 1548 }();
 1549 if (thisCharacterTrimmed) {
 1550 // keep going backwards
 1551 }
 1552 else {
 1553 break;
 1554 }
 1555 }
 1556 if (endOfFirstTrim == 0) {
 1557#if qStroika_Foundation_Debug_AssertionsChecked
 1558 Assert (String{} == referenceImpl ());
 1559#endif
 1560 return String{}; // all trimmed
 1561 }
 1562 else if (static_cast<size_t> (endOfFirstTrim) == length) {
 1563#if qStroika_Foundation_Debug_AssertionsChecked
 1564 Assert (*this == referenceImpl ());
 1565#endif
 1566 return *this; // nothing trimmed
 1567 }
 1568 else {
 1569#if qStroika_Foundation_Debug_AssertionsChecked
 1570 Assert (mk_ (lowLevelCharSpan.subspan (0, endOfFirstTrim)) == referenceImpl ());
 1571#endif
 1572 return mk_ (lowLevelCharSpan.subspan (0, endOfFirstTrim)); //return SubString (0, endOfFirstTrim);
 1573 }
 1574 };
 1575
 1576 _SafeReadRepAccessor accessor{this};
 1577 PeekSpanData psd = accessor._ConstGetRep ().PeekData (nullopt);
 1578 switch (psd.fInCP) {
 1579 case PeekSpanData::eAscii: {
 1580 return commonAlgorithm (psd.fAscii);
 1581 }
 // NOTE(review): the `case` label opening the single-byte Latin1 branch is missing from this extract.
 1583 return commonAlgorithm (psd.fSingleByteLatin1);
 1584 }
 1585 case PeekSpanData::eChar32: {
 1586 return commonAlgorithm (psd.fChar32);
 1587 }
 1588 }
 1589 return referenceImpl (); // due to tricks with surrogates, and rarity, not worth worrying about char16_t case
 1590}
1591
/*
 *  Return this string with leading AND trailing characters for which shouldBeTrimmed returns true
 *  removed. Returns *this when nothing is trimmed and an empty string when everything is trimmed.
 *
 *  \req shouldBeTrimmed != nullptr
 */
1592String String::Trim (bool (*shouldBeTrimmed) (Character)) const
1593{
 1594 RequireNotNull (shouldBeTrimmed);
 1595
 // referenceImpl: composition of LTrim and RTrim. Used for the representation not handled by the switch
 // below, and as the cross-check for commonAlgorithm when assertions are enabled.
 1596 auto referenceImpl = [&] () { return LTrim (shouldBeTrimmed).RTrim (shouldBeTrimmed); };
 1597
 1598 // declared here to encourage inlining the common case of Character::IsWhitespace
 1599 auto useCharTrimmedFunc = [&] (Character c) {
 1600 if (shouldBeTrimmed == (bool (*) (Character))Character::IsWhitespace) [[likely]] {
 1601 return Character::IsWhitespace (c);
 1602 }
 1603 else {
 1604 return shouldBeTrimmed (c);
 1605 }
 1606 };
 1607
 // commonAlgorithm: single pass from each end over the raw character span (one array element per character)
 1608 auto commonAlgorithm = [&]<typename T> (span<const T> lowLevelCharSpan) -> String {
 1609 size_t length = lowLevelCharSpan.size ();
 // firstKeptIdx: index of the first character NOT trimmed (== length when everything is trimmed)
 1610 size_t firstKeptIdx = 0;
 1611 for (; firstKeptIdx < length; ++firstKeptIdx) {
 1612 static_assert (Common::IAnyOf<T, ASCII, Latin1, char32_t>); // this works for ASCII, Latin1, char32_t, but for char16_t - not so much - trickier
 1613 Character c{lowLevelCharSpan[firstKeptIdx]};
 1614 if (not useCharTrimmedFunc (c)) {
 1615 break;
 1616 }
 1617 }
 // endOfFirstTrim: one past the last character kept; the backward scan stops at firstKeptIdx at the latest
 1618 ptrdiff_t endOfFirstTrim = length;
 1619 for (; static_cast<size_t> (endOfFirstTrim) != firstKeptIdx; --endOfFirstTrim) {
 1620 static_assert (Common::IAnyOf<T, ASCII, Latin1, char32_t>); // this works for ASCII, Latin1, char32_t, but for char16_t - not so much - trickier
 1621 Character c{lowLevelCharSpan[endOfFirstTrim - 1]};
 1622 if (useCharTrimmedFunc (c)) {
 1623 // keep going backwards
 1624 }
 1625 else {
 1626 break;
 1627 }
 1628 }
 1629 if (firstKeptIdx == 0 and static_cast<size_t> (endOfFirstTrim) == length) {
 1630#if qStroika_Foundation_Debug_AssertionsChecked
 1631 Assert (*this == referenceImpl ());
 1632#endif
 1633 return *this; // nothing changed, just bump reference count on shared_ptr
 1634 }
 1635 if (firstKeptIdx == length) {
 1636#if qStroika_Foundation_Debug_AssertionsChecked
 1637 Assert (String{} == referenceImpl ());
 1638#endif
 1639 return String{}; // trimmed everything way
 1640 }
 1641 Assert (static_cast<ptrdiff_t> (firstKeptIdx) < endOfFirstTrim);
 1642#if qStroika_Foundation_Debug_AssertionsChecked
 1643 Assert (mk_ (lowLevelCharSpan.subspan (firstKeptIdx, endOfFirstTrim - firstKeptIdx)) == referenceImpl ());
 1644#endif
 1645 return mk_ (lowLevelCharSpan.subspan (firstKeptIdx, endOfFirstTrim - firstKeptIdx));
 1646 };
 1647
 1648 _SafeReadRepAccessor accessor{this};
 1649 PeekSpanData psd = accessor._ConstGetRep ().PeekData (nullopt);
 1650 switch (psd.fInCP) {
 1651 case PeekSpanData::eAscii: {
 1652 return commonAlgorithm (psd.fAscii);
 1653 }
 // NOTE(review): the `case` label opening the single-byte Latin1 branch is missing from this extract.
 1655 return commonAlgorithm (psd.fSingleByteLatin1);
 1656 }
 1657 case PeekSpanData::eChar32: {
 1658 return commonAlgorithm (psd.fChar32);
 1659 }
 1660 }
 1661 return referenceImpl (); // due to tricks with surrogates, and rarity, not worth worrying about char16_t case
 1662}
1663
1664String String::StripAll (bool (*removeCharIf) (Character)) const
1665{
1666 RequireNotNull (removeCharIf);
1667
1668 // NB: optimize special case where removeCharIf is always false
1669 //
1670 // Walk string and find first character we need to remove
1671 StringBuilder<StringBuilder_Options<char32_t>> result{*this}; // StringBuilder_Options<char32_t> so operator[] is fast
1672 size_t n = result.size ();
1673 for (size_t i = 0; i < n; ++i) {
1674 Character c = result[i];
1675 if (removeCharIf (c)) {
1676 // on first removal, clone part of string done so far, and start appending
1677 StringBuilder tmp = result.As<String> ().SubString (0, i);
1678 // Now keep iterating IN THIS LOOP appending characters and return at the end of this loop
1679 ++i;
1680 for (; i < n; ++i) {
1681 c = result[i];
1682 if (not removeCharIf (c)) {
1683 tmp += c;
1684 }
1685 }
1686 return tmp;
1687 }
1688 }
1689 return *this; // if we NEVER get removeCharIf return false, just clone this
1690}
1691
1692String String::Join (const Iterable<String>& list, const String& separator)
1693{
1694 StringBuilder result;
1695 for (const String& i : list) {
1696 result << i << separator;
1697 }
1698 if (result.empty ()) {
1699 return result.str ();
1700 }
1701 else {
1702 return result.str ().SubString (0, -static_cast<int> (separator.size ()));
1703 }
1704}
1705
1707{
 // NOTE(review): the signature line of this definition is missing from this extract. From the body: returns
 // a copy of this string with every upper-case character replaced by its lower-case form, or this string
 // itself if no character needed changing.
 1708 StringBuilder result;
 1709 bool changed{false}; // if no change, no need to allocate new object
 1710 _SafeReadRepAccessor accessor{this};
 1711 PeekSpanData psd = accessor._ConstGetRep ().PeekData (nullopt);
 1712 if (psd.fInCP == PeekSpanData::eAscii) [[likely]] {
 1713 // optimization but other case would work no matter what
 // ASCII fast path: uses the C library isupper/tolower on the raw characters
 1714 for (auto c : psd.fAscii) {
 1715 if (isupper (c)) {
 1716 changed = true;
 1717 result.push_back (static_cast<ASCII> (tolower (c)));
 1718 }
 1719 else {
 1720 result.push_back (c);
 1721 }
 1722 }
 1723 }
 1724 else {
 // general path: go through Character::IsUpperCase / ToLowerCase
 1725 Memory::StackBuffer<Character> maybeIgnoreBuf1;
 1726 for (Character c : GetData (psd, &maybeIgnoreBuf1)) {
 1727 if (c.IsUpperCase ()) {
 1728 changed = true;
 1729 result.push_back (c.ToLowerCase ());
 1730 }
 1731 else {
 1732 result.push_back (c);
 1733 }
 1734 }
 1735 }
 1736 if (changed) {
 1737 return result.str ();
 1738 }
 1739 else {
 1740 return *this;
 1741 }
 1742}
1743
1745{
 // NOTE(review): the signature line of this definition is missing from this extract. From the body: returns
 // a copy of this string with every lower-case character replaced by its upper-case form, or this string
 // itself if no character needed changing.
 1746 StringBuilder result;
 1747 bool changed{false}; // if no change, no need to allocate new object
 1748 _SafeReadRepAccessor accessor{this};
 1749 PeekSpanData psd = accessor._ConstGetRep ().PeekData (nullopt);
 1750 if (psd.fInCP == PeekSpanData::eAscii) [[likely]] {
 1751 // optimization but other case would work no matter what
 // ASCII fast path: uses the C library islower/toupper on the raw characters
 1752 for (auto c : psd.fAscii) {
 1753 if (islower (c)) {
 1754 changed = true;
 1755 result.push_back (static_cast<ASCII> (toupper (c)));
 1756 }
 1757 else {
 1758 result.push_back (c);
 1759 }
 1760 }
 1761 }
 1762 else {
 // general path: go through Character::IsLowerCase / ToUpperCase
 1763 Memory::StackBuffer<Character> maybeIgnoreBuf1;
 1764 for (Character c : GetData (psd, &maybeIgnoreBuf1)) {
 1765 if (c.IsLowerCase ()) {
 1766 changed = true;
 1767 result.push_back (c.ToUpperCase ());
 1768 }
 1769 else {
 1770 result.push_back (c);
 1771 }
 1772 }
 1773 }
 1774 if (changed) {
 1775 return result.str ();
 1776 }
 1777 else {
 1778 return *this;
 1779 }
 1780}
1781
1783{
 // NOTE(review): the signature line of this definition is missing from this extract. From the body: true iff
 // no character of this string is non-whitespace (so an empty string also yields true).
 1784 // It is all whitespace if the first non-whitespace character is 'EOF'
 1785 return not Find ([] (Character c) -> bool { return not c.IsWhitespace (); });
 1786}
1787
/*
 *  Shorten this string to fit maxLen, marking the cut with ellipsis.
 *
 *  keepPref selects which part of the text is kept: the left end (ellipsis appended), the right
 *  end (ellipsis prepended) or the middle (ellipsis on both sides). Before cutting, the string is
 *  trimmed (LTrim, RTrim or Trim depending on keepPref); if the trimmed text fits, it is returned
 *  without any ellipsis.
 */
1788String String::LimitLength (size_t maxLen, StringShorteningPreference keepPref, const String& ellipsis) const
1789{
 1790 // @todo Consider making this the 'REFERENCE' impl, and doing a specific one with a specific StringBuilder, and doing
 1791 // the trim/split directly, if I see this show up in a profile, for performance sake --LGP 2023-12-11
 // NOTE(review): this fast path uses `<` while the check after trimming uses `<=`, so a string of exactly
 // maxLen characters is still trimmed before being returned - confirm that is intended.
 1792 if (length () < maxLen) [[likely]] {
 1793 return *this; // frequent optimization
 1794 }
 1795 String operateOn = [&] () {
 1796 switch (keepPref) {
 1797 case StringShorteningPreference::ePreferKeepLeft:
 1798 return LTrim ();
 1799 case StringShorteningPreference::ePreferKeepRight:
 1800 return RTrim ();
 1801 case StringShorteningPreference::ePreferKeepMid:
 1802 return Trim (); // not sure we need to trim - but probably best
 1803 default:
 // NOTE(review): a line is missing from this extract here.
 1805 return *this;
 1806 }
 1807 }();
 1808 if (operateOn.length () <= maxLen) {
 1809 return operateOn;
 1810 }
 // useLen: how many characters of the text itself survive once room for the ellipsis (counted twice when
 // keeping the middle) is reserved; 0 when the ellipsis alone would use up maxLen
 1811 size_t useLen = [&] () {
 1812 size_t useLen = maxLen;
 1813 size_t ellipsisTotalLen = ellipsis.length ();
 1814 if (keepPref == StringShorteningPreference::ePreferKeepMid) {
 1815 ellipsisTotalLen *= 2;
 1816 }
 1817 if (useLen > ellipsisTotalLen) {
 1818 useLen -= ellipsisTotalLen;
 1819 }
 1820 else {
 1821 useLen = 0;
 1822 }
 1823 return useLen;
 1824 }();
 1825 switch (keepPref) {
 1826 case StringShorteningPreference::ePreferKeepLeft:
 1827 return operateOn.substr (0, useLen) + ellipsis;
 1828 case StringShorteningPreference::ePreferKeepRight:
 1829 return ellipsis + operateOn.substr (operateOn.length () - useLen);
 1830 case StringShorteningPreference::ePreferKeepMid:
 // take useLen characters centered on the midpoint of the trimmed text
 1831 return ellipsis + operateOn.substr (operateOn.length () / 2 - useLen / 2, useLen) + ellipsis;
 1832 default:
 // NOTE(review): a line is missing from this extract here.
 1834 return *this;
 1835 }
 1836}
1837
1838string String::AsNarrowString (const locale& l) const
1839{
1840 // Note: this could use CodeCvt, but directly using std::codecvt in this case pretty simple, and
1841 // more efficient this way --LGP 2023-02-14
1842
1843 // See http://en.cppreference.com/w/cpp/locale/codecvt/~codecvt
1844 using Destructible_codecvt_byname = deletable_facet_<codecvt_byname<wchar_t, char, mbstate_t>>;
1845 Destructible_codecvt_byname cvt{l.name ()};
1846
1847 Memory::StackBuffer<wchar_t> maybeIgnoreBuf1;
1848 span<const wchar_t> thisData = GetData (&maybeIgnoreBuf1);
1849 // http://en.cppreference.com/w/cpp/locale/codecvt/out
1850 mbstate_t mbstate{};
1851 const wchar_t* from_next;
1852 char* to_next;
1853 Memory::StackBuffer<char> into{Memory::eUninitialized, thisData.size () * 5}; // not sure what size is always big enuf
1854 codecvt_base::result result =
1855 cvt.out (mbstate, thisData.data (), thisData.data () + thisData.size (), from_next, into.data (), into.end (), to_next);
1856 if (result != codecvt_base::ok) [[unlikely]] {
1857 static const auto kException_ = Execution::RuntimeErrorException{"Error converting locale multibyte string to UNICODE"sv};
1858 Execution::Throw (kException_);
1859 }
1860 return string{into.data (), to_next};
1861}
1862
1863string String::AsNarrowString (const locale& l, AllowMissingCharacterErrorsFlag) const
1864{
1865 // Note: this could use CodeCvt, but directly using std::codecvt in this case pretty simple, and
1866 // more efficient this way --LGP 2023-02-14
1867
1868 // See http://en.cppreference.com/w/cpp/locale/codecvt/~codecvt
1869 using Destructible_codecvt_byname = deletable_facet_<codecvt_byname<wchar_t, char, mbstate_t>>;
1870 Destructible_codecvt_byname cvt{l.name ()};
1871
1872 Memory::StackBuffer<wchar_t> maybeIgnoreBuf1;
1873 span<const wchar_t> thisData = GetData (&maybeIgnoreBuf1);
1874 // http://en.cppreference.com/w/cpp/locale/codecvt/out
1875 mbstate_t mbstate{};
1876 Memory::StackBuffer<char> into{Memory::eUninitialized, thisData.size () * 5}; // not sure what size is always big enuf
1877 const wchar_t* readFrom = thisData.data ();
1878 char* intoIndex = into.data ();
1879Again:
1880 const wchar_t* from_next{nullptr};
1881 char* to_next{nullptr};
1882 codecvt_base::result result = cvt.out (mbstate, readFrom, thisData.data () + thisData.size (), from_next, intoIndex, into.end (), to_next);
1883 if (result != codecvt_base::ok) [[unlikely]] {
1884 if (from_next != thisData.data () + thisData.size ()) {
1885 readFrom = from_next + 1; // unclear how much to skip (due to surrogates) - but likely this is a good guess
1886 *to_next = '?'; // write 'bad' character
1887 intoIndex = to_next + 1;
1888 goto Again;
1889 }
1890 }
1891 return string{into.data (), to_next};
1892}
1893
1894void String::erase (size_t from)
1895{
1896 *this = RemoveAt (from, size ());
1897}
1898
1899void String::erase (size_t from, size_t count)
1900{
1901 // http://stroika-bugs.sophists.com/browse/STK-445
1902 // @todo - NOT ENVELOPE THREADSAFE
1903 // MUST ACQUIRE ACCESSOR HERE - not just that RemoteAt threadsafe - but must SYNC at this point - need AssureExternallySycnonized stuff here!!!
1904 //
1905 // TODO: Double check STL definition - but I think they allow for count to be 'too much' - and silently trim to end...
1906 size_t max2Erase = static_cast<size_t> (max (static_cast<ptrdiff_t> (0), static_cast<ptrdiff_t> (size ()) - static_cast<ptrdiff_t> (from)));
1907 *this = RemoveAt (from, from + min (count, max2Erase));
1908}
1909
1910const wchar_t* String::c_str () const noexcept
1911{
1912 // UNSAFE - DEPRECATED - lose before v3 actually released -- LGP 2023-06-28
1914 DISABLE_COMPILER_GCC_WARNING_START ("GCC diagnostic ignored \"-Wdeprecated-declarations\"");
1915 DISABLE_COMPILER_CLANG_WARNING_START ("clang diagnostic ignored \"-Wdeprecated-declarations\"");
1916 return const_cast<String*> (this)->c_str ();
1917 DISABLE_COMPILER_MSC_WARNING_END (4996);
1918 DISABLE_COMPILER_GCC_WARNING_END ("GCC diagnostic ignored \"-Wdeprecated-declarations\"");
1919 DISABLE_COMPILER_CLANG_WARNING_END ("clang diagnostic ignored \"-Wdeprecated-declarations\"");
1920}
1921const wchar_t* String::c_str ()
1922{
1923 // DEPRECATED SINCE STROIKA v3.0d13
1924 // Rarely used mechanism, of replacing the underlying rep, for the iterable, as needed
1925 _SafeReadRepAccessor accessor{this};
1926 const wchar_t* result = accessor._ConstGetRep ().c_str_peek ();
1927 if (result == nullptr) {
1928 _fRep = Memory::MakeSharedPtr<StringWithCStr_::Rep> (accessor._ConstGetRepSharedPtr ());
1929 result = _SafeReadRepAccessor{this}._ConstGetRep ().c_str_peek ();
1930 AssertNotNull (result);
1931 }
1932 EnsureNotNull (result);
1933 Ensure (result[size ()] == '\0' or (::wcslen (result) > size () and sizeof (wchar_t) == 2)); // if there are surrogates, wcslen () might be larger than size
1934 return result;
1935}
1936
1937[[noreturn]] void String::ThrowInvalidAsciiException_ ()
1938{
1939 static const auto kException_ = Execution::RuntimeErrorException{"Error converting non-ascii text to string"sv};
1940 Execution::Throw (kException_);
1941}
1942
#if qStroika_Foundation_Characters_AsPathAutoMapMSYSAndCygwin
/*
 *  Convert this String to a std::filesystem::path, mapping Cygwin and MSYS style drive paths to
 *  drive-letter form:
 *      CYGWIN creates paths like /cygdrive/c/folder for c:/folder
 *      MSYS creates paths like /c/folder for c:/folder
 *  Anything that does not match either pattern is converted unchanged.
 */
template <>
std::filesystem::path String::As<std::filesystem::path> () const
{
    static const String kMSYSDrivePrefix_ = "/"sv;
    static const String kCygrivePrefix_   = "/cygdrive/"sv;
    // Try the longer (cygwin) prefix first, then the MSYS one
    for (const String* prefix : {&kCygrivePrefix_, &kMSYSDrivePrefix_}) {
        if (not StartsWith (*prefix)) {
            continue;
        }
        String afterPrefix = SubString (prefix->length ());
        // must look like "<ascii letter>/..."
        const bool looksLikeDrive =
            afterPrefix.length () > 1 and afterPrefix[0].IsASCII () and afterPrefix[0].IsAlphabetic () and afterPrefix[1] == '/';
        if (looksLikeDrive) {
            wstring mapped = afterPrefix.As<wstring> (); // now map c/folder to c:/folder
            mapped.insert (mapped.begin () + 1, ':');
            return filesystem::path{mapped};
        }
    }
    return filesystem::path{As<wstring> ()};
}
#endif
1970
1971/*
1972 ********************************************************************************
1973 ****************************** StringCombiner **********************************
1974 ********************************************************************************
1975 */
1976template <>
1977String StringCombiner<String>::operator() (const String& lhs, const String& rhs, bool isLast) const
1978{
1979 StringBuilder sb{lhs};
1980 if (isLast and fSpecialSeparatorForLastPair) [[unlikely]] {
1981 sb << *fSpecialSeparatorForLastPair;
1982 }
1983 else {
1984 sb << fSeparator;
1985 }
1986 sb << rhs;
1987 return sb;
1988}
1989
1990/*
1991 ********************************************************************************
1992 ******************* Iterable<Characters::String>::Join *************************
1993 ********************************************************************************
1994 */
1995namespace Stroika::Foundation::Traversal {
1996 // specialized as performance optimization
1997 template <>
1998 Characters::String Iterable<Characters::String>::Join (const Characters::String& separator, const optional<Characters::String>& finalSeparator) const
1999 {
2000 using namespace Characters;
2001#if qStroika_Foundation_Debug_AssertionsChecked
2002 String referenceResult =
2004 Characters::StringCombiner<String>{.fSeparator = separator, .fSpecialSeparatorForLastPair = finalSeparator});
2005#endif
2006 StringBuilder sb;
2007 size_t cnt = this->size ();
2008 this->Apply ([&, idx = 0u] (const String& i) mutable {
2009 if (idx == 0) {
2010 sb = i;
2011 }
2012 else {
2013 if (finalSeparator and idx + 1 == cnt) [[unlikely]] {
2014 sb << *finalSeparator;
2015 }
2016 else {
2017 sb << separator;
2018 }
2019 sb << i;
2020 }
2021 ++idx;
2022 });
2023#if qStroika_Foundation_Debug_AssertionsChecked
2024 Ensure (sb == referenceResult);
2025#endif
2026 return sb;
2027 }
2028}
2029
2030/*
2031 ********************************************************************************
2032 ********************************** operator<< **********************************
2033 ********************************************************************************
2034 */
2035wostream& Characters::operator<< (wostream& out, const String& s)
2036{
2037 Memory::StackBuffer<wchar_t> maybeIgnoreBuf1;
2038 span<const wchar_t> sData = s.GetData (&maybeIgnoreBuf1);
2039 out.write (sData.data (), sData.size ());
2040 return out;
2041}
2042ostream& Characters::operator<< (ostream& out, const String& s)
2043{
2044 return out << s.AsNarrowSDKString (eIgnoreErrors);
2045}
2046
2047/*
2048 ********************************************************************************
2049 *********** hash<Stroika::Foundation::Characters::String> **********************
2050 ********************************************************************************
2051 */
2052size_t std::hash<String>::operator() (const String& arg) const
2053{
2054 using namespace Cryptography::Digest;
2055 using DIGESTER = Digester<Algorithm::SuperFastHash>; // pick arbitrarily which algorithm to use for now -- err on the side of quick and dirty
2056 static constexpr DIGESTER kDigester_{};
2057 // Note this could easily use char8_t, wchar_t, char32_t, or whatever. Choose char8_t on the theory that
2058 // this will most often avoid a copy, and making the most often case faster is probably a win. Also, even close, it
2059 // will have less 'empty space' and be more compact, so will digest faster.
2060 Memory::StackBuffer<char8_t> maybeIgnoreBuf1;
2061 span<const char8_t> s = arg.GetData (&maybeIgnoreBuf1);
2062 if (s.empty ()) {
2063 static const size_t kZeroDigest_ = kDigester_ (nullptr, nullptr);
2064 return kZeroDigest_;
2065 }
2066 else {
2067 return kDigester_ (as_bytes (s));
2068 }
2069}
2070
2071/*
2072 ********************************************************************************
2073 ******************** DataExchange::DefaultSerializer<String> *******************
2074 ********************************************************************************
2075 */
2077{
2078 //
2079 // Could have used char8_t, char16_t, or char32_t here quite plausibly. Chose char8_t for several reasons:
2080 // > Nearly always smallest representation (assuming most data is ascii)
2081 // > It is cross-platform/portable - not byte order dependent (NOT a promise going forward, so maybe
2082 // not a good thing - but a thing)
2083 // > Since we expect most data reps to be ascii, this will involve the least copying, most likely, in
2084 // the GetData call
2085 //
2086 Memory::StackBuffer<char8_t> maybeIgnoreBuf1;
2087 return Memory::BLOB{as_bytes (arg.GetData (&maybeIgnoreBuf1))};
2088}
#define AssertNotNull(p)
Definition Assertions.h:333
#define EnsureNotNull(p)
Definition Assertions.h:340
#define RequireMember(p, c)
Definition Assertions.h:326
#define RequireNotReached()
Definition Assertions.h:385
#define qStroika_Foundation_Debug_AssertionsChecked
The qStroika_Foundation_Debug_AssertionsChecked flag determines if assertions are checked and validat...
Definition Assertions.h:48
#define RequireNotNull(p)
Definition Assertions.h:347
#define RequireExpression(c)
Definition Assertions.h:267
#define AssertNotReached()
Definition Assertions.h:355
conditional_t< qStroika_Foundation_Memory_PreferBlockAllocation and andTrueCheck, BlockAllocationUseHelper< T >, Common::Empty > UseBlockAllocationIfAppropriate
Use this to enable block allocation for a particular class. Beware of subclassing.
bool Equals(const T *lhs, const T *rhs)
strcmp() or wcscmp() as appropriate == 0
constexpr bool IsASCII() const noexcept
Return true iff the given character (or all in span) is (are) in the ascii range [0....
static constexpr void CheckASCII(span< const CHAR_T > s)
if not IsASCII (arg) throw RuntimeException...
nonvirtual Character ToLowerCase() const noexcept
nonvirtual ASCII GetAsciiCode() const noexcept
static constexpr strong_ordering Compare(span< const CHAR_T, E1 > lhs, span< const CHAR_T, E2 > rhs, CompareOptions co) noexcept
nonvirtual bool IsLowerCase() const noexcept
constexpr char32_t GetCharacterCode() const noexcept
Return the char32_t UNICODE code-point associated with this character.
nonvirtual Character ToUpperCase() const noexcept
constexpr bool IsWhitespace() const noexcept
nonvirtual bool IsUpperCase() const noexcept
RegularExpression is a compiled regular expression which can be used to match on a String class.
virtual Character GetAt(size_t index) const noexcept=0
Similar to String, but intended to more efficiently construct a String. Mutable type (String is large...
nonvirtual size_t size() const noexcept
nonvirtual void Append(span< const CHAR_T > s)
String is like std::u32string, except it is much easier to use, often much more space efficient,...
Definition String.h:201
nonvirtual size_t length() const noexcept
Definition String.inl:1045
nonvirtual String ToUpperCase() const
Definition String.cpp:1744
static String FromNarrowString(const char *from, const locale &l)
Definition String.inl:340
nonvirtual bool Matches(const RegularExpression &regEx) const
Definition String.cpp:1133
nonvirtual bool IsWhitespace() const
Definition String.cpp:1782
nonvirtual String NormalizeTextToNL() const
Definition String.cpp:1201
static String Join(const Iterable< String > &list, const String &separator=", "sv)
Definition String.cpp:1692
static String FromStringConstant(const CHAR_T(&cString)[SIZE])
Take the given argument data (constant span) - which must remain unchanged - constant - for the appli...
Definition String.inl:386
nonvirtual String NormalizeSpace(Character useSpaceCharacter=' ') const
Replace sequences of whitespace characters (space, tab, newline etc) with a single space (or argument...
Definition String.cpp:1229
nonvirtual Containers::Sequence< pair< size_t, size_t > > FindEach(const RegularExpression &regEx) const
Definition String.cpp:966
nonvirtual String Repeat(unsigned int count) const
Definition String.cpp:1424
nonvirtual String LimitLength(size_t maxLen, StringShorteningPreference keepPref=StringShorteningPreference::ePreferKeepLeft) const
return the first maxLen (or fewer if string shorter) characters of this string (adding ellipsis if tr...
Definition String.inl:741
nonvirtual String RemoveAll(Character c) const
Definition String.cpp:823
nonvirtual Containers::Sequence< RegularExpressionMatch > FindEachMatch(const RegularExpression &regEx) const
Definition String.cpp:984
nonvirtual String RemoveFirstIf(Character c) const
Definition String.cpp:807
nonvirtual string AsNarrowSDKString() const
Definition String.inl:830
nonvirtual optional< String > Col(size_t i) const
Useful to replace 'awk print $3' - replace with Col(2) - zero based.
Definition String.cpp:1362
nonvirtual String InsertAt(Character c, size_t at) const
Definition String.inl:715
nonvirtual string AsNarrowString(const locale &l) const
Definition String.cpp:1838
nonvirtual size_t size() const noexcept
Definition String.inl:534
nonvirtual bool EndsWith(const Character &c, CompareOptions co=eWithCase) const
Definition String.cpp:1088
nonvirtual String ToLowerCase() const
Definition String.cpp:1706
nonvirtual String ReplaceAll(const RegularExpression &regEx, const String &with) const
Definition String.cpp:1155
nonvirtual String Replace(size_t from, size_t to, const String &replacement) const
Definition String.cpp:1045
nonvirtual String SubString(SZ from) const
nonvirtual String Trim(bool(*shouldBeTrimmed)(Character)=Character::IsWhitespace) const
Definition String.cpp:1592
nonvirtual bool StartsWith(const Character &c, CompareOptions co=eWithCase) const
Definition String.cpp:1059
nonvirtual String StripAll(bool(*removeCharIf)(Character)) const
Definition String.cpp:1664
nonvirtual String AssureEndsWith(const Character &c, CompareOptions co=eWithCase) const
Return *this if it ends with argument character, or append 'c' so that it ends with a 'c'.
Definition String.cpp:1123
nonvirtual Containers::Sequence< String > AsLines() const
break the String into a series of lines;
Definition String.cpp:1306
nonvirtual String LTrim(bool(*shouldBeTrimmed)(Character)=Character::IsWhitespace) const
Definition String.cpp:1443
nonvirtual Containers::Sequence< String > Grep(const String &fgrepArg) const
Breaks this string into Lines, with AsLines (), and applies the argument filter (as if with ....
Definition String.cpp:1341
nonvirtual Containers::Sequence< String > FindEachString(const RegularExpression &regEx) const
Definition String.cpp:1001
nonvirtual optional< size_t > RFind(Character c) const noexcept
Definition String.cpp:1011
static span< const CHAR_TYPE > GetData(const PeekSpanData &pds, Memory::StackBuffer< CHAR_TYPE, STACK_BUFFER_SZ > *possiblyUsedBuffer)
return the constant character data inside the string (rep) in the form of a span, possibly quickly an...
Definition String.inl:961
nonvirtual Containers::Sequence< String > Tokenize() const
Definition String.cpp:1234
nonvirtual String RemoveAt(size_t charAt) const
Definition String.inl:604
nonvirtual String RTrim(bool(*shouldBeTrimmed)(Character)=Character::IsWhitespace) const
Definition String.cpp:1508
nonvirtual optional< size_t > Find(Character c, CompareOptions co=eWithCase) const
Definition String.inl:681
nonvirtual String substr(size_t from, size_t count=npos) const
Definition String.inl:1086
static const UTFConvert kThe
Nearly always use this default UTFConvert.
Definition UTFConvert.h:369
static constexpr bool AllFitsInTwoByteEncoding(span< const CHAR_T > s) noexcept
Sequence_stdvector<T> is an std::vector-based concrete implementation of the Sequence<T> container pa...
A generalization of a vector: a container whose elements are keyed by the natural numbers.
Definition Sequence.h:187
nonvirtual void push_back(ArgByValueType< value_type > item)
Definition Sequence.inl:436
nonvirtual void Append(ArgByValueType< value_type > item)
Definition Sequence.inl:330
Set<T> is a container of T, where once an item is added, additional adds () do nothing.
Definition Set.h:105
Logically halfway between std::array and std::vector; Smart 'direct memory array' - which when needed...
nonvirtual size_t size() const noexcept
Iterable<T> is a base class for containers which easily produce an Iterator<T> to traverse them.
Definition Iterable.h:237
nonvirtual RESULT_T Join(const CONVERT_TO_RESULT &convertToResult=kDefaultToStringConverter<>, const COMBINER &combiner=Characters::kDefaultStringCombiner) const
ape the JavaScript/python 'join' function - take the parts of 'this' iterable and combine them into a...
nonvirtual size_t size() const
Returns the number of items contained.
Definition Iterable.inl:300
nonvirtual Iterator< Character > MakeIterator() const
Create an iterator object which can be used to traverse the 'Iterable'.
Definition Iterable.inl:294
An Iterator<T> is a copyable object which allows traversing the contents of some container....
Definition Iterator.h:225
concept - trivial shorthand for variadic same_as A or same_as B, or ...
Definition Concepts.h:175
char ASCII
Stroika's string/character classes treat 'char' as being an ASCII character.
Definition Character.h:59
wostream & operator<<(wostream &out, const String &s)
Definition String.cpp:2035
conditional_t<(sizeof(CHECK_T)<=2 *sizeof(void *)) and is_trivially_copyable_v< CHECK_T >, CHECK_T, const CHECK_T & > ArgByValueType
This is an alias for 'T' - but how we want to pass it on stack as formal parameter.
Definition TypeHints.h:32
SequencePolicy
equivalent which of 4 types being used std::execution::sequenced_policy, parallel_policy,...
void Throw(T &&e2Throw)
identical to builtin C++ 'throw' except that it does helpful, type dependent DbgTrace() messages firs...
Definition Throw.inl:43
Summary data for raw contents of rep - each rep will support at least one of these span forms.
Definition String.h:1261
StringCombiner is a simple function object used to combine two strings visually - used in Iterable<>:...
Definition String.h:1912