Stroika Library 3.0d23x
 
Loading...
Searching...
No Matches
String.cpp
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2026. All rights reserved
3 */
4#include "Stroika/Foundation/StroikaPreComp.h"
5
6#include <algorithm>
7#include <climits>
8#include <cstdarg>
9#include <istream>
10#include <regex>
11#include <string>
12
15#include "Stroika/Foundation/Characters/SDKString.h"
19#include "Stroika/Foundation/Containers/Set.h"
20#include "Stroika/Foundation/Containers/Support/ReserveTweaks.h"
23#include "Stroika/Foundation/Execution/Exceptions.h"
24#include "Stroika/Foundation/Execution/Throw.h"
25#include "Stroika/Foundation/Math/Common.h"
27#include "Stroika/Foundation/Memory/Common.h"
29
30#include "String.h"
31
32using namespace Stroika::Foundation;
35using namespace Stroika::Foundation::Common;
36
37using Memory::MakeSharedPtr;
40
41// see Satisfies Concepts:
42static_assert (regular<String>);
43
44#if qStroika_Foundation_Characters_AsPathAutoMapMSYSAndCygwin
45#include <filesystem>
46#endif
47
48namespace {
49
50 /**
51 * Helper for sharing implementation code on string reps
52 * This REP is templated on CHAR_T. The key is that ALL characters for that string fit inside
53 * CHAR_T, so that the implementation can store them as an array, and index.
54 * So mixed 1,2,3 byte characters all get stored in a char32_t array, and a string with all ascii
55 * characters get stored in a char (1byte stride) array.
56 *
57 * \note - the KEY design choice in StringRepHelperAllFitInSize_::Rep<CHAR_T> is that it contains no
58 * multi-code-point characters. This is what allows the simple calculation of array index
59 * to character offset. So use
60 * StringRepHelperAllFitInSize_::Rep<ASCII> for ascii text
61 * StringRepHelperAllFitInSize_::Rep<LATIN1> for ISOLatin1 text
62 * StringRepHelperAllFitInSize_::Rep<char16_t> for ISOLatin1/anything which is a 2-byte unicode char (not surrogates)
63 * StringRepHelperAllFitInSize_::Rep<char32_t> for anything else - this always works
64 */
65 struct StringRepHelperAllFitInSize_ : String {
66 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
67 struct Rep : public _IRep {
68 private:
69 using inherited = _IRep;
70
71 protected:
72 span<const CHAR_T> _fData;
73
74#if qStroika_Foundation_Debug_AssertionsChecked
75 private:
76 mutable unsigned int fOutstandingIterators_{};
77#endif
78
79 protected:
80 Rep () = default;
81 Rep (span<const CHAR_T> s)
82 requires (not same_as<CHAR_T, char8_t>) // char8 ironically involves 2-byte characters, cuz only ascii encoded as 1 byte
83 : _fData{s}
84 {
85 if constexpr (same_as<CHAR_T, char> or same_as<CHAR_T, char8_t>) {
86 Require (Character::IsASCII (s));
87 }
88 // Any 8-bit sequence valid for Latin1
89 if constexpr (same_as<CHAR_T, char16_t>) {
91 }
92 }
93 Rep& operator= (span<const CHAR_T> s)
94 {
95#if qStroika_Foundation_Debug_AssertionsChecked
96 Require (fOutstandingIterators_ == 0);
97#endif
98 if constexpr (same_as<CHAR_T, char> or same_as<CHAR_T, char8_t>) {
99 Require (Character::IsASCII (s));
100 }
101 if constexpr (same_as<CHAR_T, char16_t>) {
103 }
104 _fData = s;
105 return *this;
106 }
107
108 public:
109 // String::_IRep OVERRIDES
110 virtual Character GetAt (size_t index) const noexcept override
111 {
112 Require (index < _fData.size ());
113 // NOTE - this is safe because we never construct this type with surrogates
114 return Character{static_cast<char32_t> (_fData[index])};
115 }
116 virtual PeekSpanData PeekData (optional<PeekSpanData::StorageCodePointType> /*preferred*/) const noexcept override
117 {
118 // IGNORE preferred, cuz we return what is in our REP - since returning a direct pointer to that data - no conversion possible
119 if constexpr (same_as<CHAR_T, ASCII>) {
120 return PeekSpanData{PeekSpanData::StorageCodePointType::eAscii, {.fAscii = _fData}};
121 }
122 if constexpr (same_as<CHAR_T, Latin1>) {
123 return PeekSpanData{PeekSpanData::StorageCodePointType::eSingleByteLatin1, {.fSingleByteLatin1 = _fData}};
124 }
125 else if constexpr (sizeof (CHAR_T) == 2) {
126 // reinterpret_cast needed cuz of wchar_t case
127 return PeekSpanData{PeekSpanData::StorageCodePointType::eChar16,
128 {.fChar16 = span<const char16_t>{reinterpret_cast<const char16_t*> (_fData.data ()), _fData.size ()}}};
129 }
130 else if constexpr (sizeof (CHAR_T) == 4) {
131 // reinterpret_cast needed cuz of wchar_t case
132 return PeekSpanData{PeekSpanData::StorageCodePointType::eChar32,
133 {.fChar32 = span<const char32_t>{reinterpret_cast<const char32_t*> (_fData.data ()), _fData.size ()}}};
134 }
135 }
136
137 // Overrides for Iterable<Character>
138 // @todo - MAYBE override Apply/Find and a few others to not use default 'iterator object' implementation that has lots of indirect virtual calls
139 public:
140 virtual shared_ptr<Iterable<Character>::_IRep> Clone () const override
141 {
142 AssertNotReached (); // Since String reps now immutable, this should never be called
143 return nullptr;
144 }
145 virtual Traversal::Iterator<value_type> MakeIterator () const override
146 {
147 // NOTE - UNDETECTED CALLER ERROR - if iterator constructed and used after string rep destroyed (never changed) -- LGP 2023-07-07
148 struct MyIterRep_ final : Iterator<Character>::IRep, public Memory::UseBlockAllocationIfAppropriate<MyIterRep_> {
149 span<const CHAR_T> fData_; // clone span (not underlying data)
150 size_t fIdx_{0};
151#if qStroika_Foundation_Debug_AssertionsChecked
152 const Rep* fOwningRep_;
153#endif
154 MyIterRep_ (span<const CHAR_T> data
156 ,
157 const Rep* dbgRep
158#endif
159 )
160 : fData_{data}
162 , fOwningRep_{dbgRep}
163#endif
164 {
165#if qStroika_Foundation_Debug_AssertionsChecked
166 ++fOwningRep_->fOutstandingIterators_;
167#endif
168 }
169#if qStroika_Foundation_Debug_AssertionsChecked
170 virtual ~MyIterRep_ () override
171 {
172 Require (fOwningRep_->fOutstandingIterators_ > 0); // if this fails, probably cuz fOwningRep_ destroyed
173 --fOwningRep_->fOutstandingIterators_;
174 }
175#endif
176
177 virtual unique_ptr<Iterator<Character>::IRep> Clone () const override
178 {
179 return make_unique<MyIterRep_> (fData_.subspan (fIdx_)
181 ,
182 fOwningRep_
183#endif
184 );
185 }
186 virtual void More (optional<Character>* result, bool advance) override
187 {
188 RequireNotNull (result);
189 if (advance) [[likely]] {
190 Require (fIdx_ < fData_.size ());
191 ++fIdx_;
192 }
193 if (fIdx_ < fData_.size ()) [[likely]] {
194 // NOTE - this is safe because we never construct this type with surrogates
195 *result = Character{static_cast<char32_t> (fData_[fIdx_])};
196 }
197 else {
198 *result = nullopt;
199 }
200 }
201 virtual bool Equals (const IRep* rhs) const override
202 {
203 RequireNotNull (rhs);
204 RequireMember (rhs, MyIterRep_);
205 const MyIterRep_* rrhs = Debug::UncheckedDynamicCast<const MyIterRep_*> (rhs);
206 return fData_.data () == rrhs->fData_.data () and fIdx_ == rrhs->fIdx_;
207 }
208 };
209 return Iterator<Character>{make_unique<MyIterRep_> (this->_fData
210
212 ,
213 this
214#endif
215
216 )};
217 }
218 virtual size_t size () const override
219 {
220 return _fData.size ();
221 }
222 virtual bool empty () const override
223 {
224 return _fData.empty ();
225 }
226 virtual Traversal::Iterator<value_type> Find (const function<bool (ArgByValueType<value_type> item)>& that,
227 Execution::SequencePolicy seq) const override
228 {
229 return inherited::Find (that, seq); // @todo rewrite FOR PERFORMANCE to operate on fData_
230 }
231 };
232 };
233
234 /**
235 * Simple string rep, which dynamically allocates its storage on the heap, through an indirect pointer reference.
236 * \note This class may assure nul-terminated (kAddNullTerminator_), and so 'capacity' always at least one greater than length.
237 */
238 struct DynamicallyAllocatedString : StringRepHelperAllFitInSize_ {
239 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
240 struct Rep final : public StringRepHelperAllFitInSize_::Rep<CHAR_T>, public Memory::UseBlockAllocationIfAppropriate<Rep<CHAR_T>> {
241 private:
242 using inherited = StringRepHelperAllFitInSize_::Rep<CHAR_T>;
243
244 public:
245 Rep (span<const CHAR_T> t1)
246 : inherited{mkBuf_ (t1)}
247 {
248 }
249 Rep () = delete;
250 Rep (const Rep&) = delete;
251
252 public:
253 nonvirtual Rep& operator= (const Rep&) = delete;
254
255 public:
256 virtual ~Rep () override
257 {
258 delete[] this->_fData.data ();
259 }
260
261 private:
262 static span<CHAR_T> mkBuf_ (size_t length)
263 {
264 size_t capacity = AdjustCapacity_ (length);
265 Assert (length <= capacity);
266 if constexpr (kAddNullTerminator_) {
267 Assert (length + 1 <= capacity);
268 }
269 CHAR_T* newBuf = new CHAR_T[capacity];
270 return span{newBuf, capacity};
271 }
272 static span<CHAR_T> mkBuf_ (span<const CHAR_T> t1)
273 {
274 size_t len = t1.size ();
275 span<CHAR_T> buf = mkBuf_ (len); // note buf span is over capacity, not size
276 Assert (buf.size () >= len);
277 auto result = Memory::CopyBytes (t1, buf);
278 if constexpr (kAddNullTerminator_) {
279 Assert (len + 1 <= buf.size ());
280 *(buf.data () + len) = '\0';
281 }
282 return result; // return span of just characters, even if we have extra NUL-byte (outside span)
283 }
284
285 public:
286 // String::_IRep OVERRIDES
287 virtual const wchar_t* c_str_peek () const noexcept override
288 {
289 // @todo NOTE DEPRECATED SINCE STROIKA v3.0d13, and same for kAddNullTerminator_
290 if constexpr (kAddNullTerminator_) {
291 Assert (*(this->_fData.data () + this->_fData.size ()) == '\0'); // dont index into buf cuz we cheat and go one past end on purpose
292 return reinterpret_cast<const wchar_t*> (this->_fData.data ());
293 }
294 else {
295 return nullptr;
296 }
297 }
298
299 private:
300 // Stick nul-terminator byte just past the end of the span
301 static constexpr bool kAddNullTerminator_ = sizeof (CHAR_T) == sizeof (wchar_t); // costs nothing to nul-terminate in this case
302
303 private:
304 static size_t AdjustCapacity_ (size_t initialCapacity)
305 {
306 size_t result = initialCapacity;
307 if constexpr (kAddNullTerminator_) {
308 ++result;
309 }
310 return result;
311 }
312 };
313 };
314
315 /**
316 * Most Stroika strings use this 'rep': FixedCapacityInlineStorageString_
317 *
318 * This String rep is like BufferedString_, except that the storage is inline in one struct/allocation
319 * for better memory allocation performance, and more importantly, better locality of data (more cpu cache friendly)
320 */
321 struct FixedCapacityInlineStorageString_ : StringRepHelperAllFitInSize_ {
322 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T, size_t CAPACITY>
323 struct Rep final : public StringRepHelperAllFitInSize_::Rep<CHAR_T>,
324 public Memory::UseBlockAllocationIfAppropriate<Rep<CHAR_T, CAPACITY>> {
325 private:
326 using inherited = StringRepHelperAllFitInSize_::Rep<CHAR_T>;
327
328 private:
329 bool IncludesNullTerminator_ () const
330 {
331 if constexpr (sizeof (CHAR_T) == sizeof (wchar_t)) {
332 return this->_fData.size () < CAPACITY; // else no room
333 }
334 else {
335 return false;
336 }
337 }
338
339 private:
340 CHAR_T fBuf_[CAPACITY];
341
342 public:
343 Rep (span<const CHAR_T> t1)
344 : inherited{}
345 {
346 // must do this logic after base construction since references data member which doesn't exist
347 // til after base class construction. SHOULDNT really matter (since uninitialized data), but on
348 // g++-11, and other compilers, detected as vptr UB violation if we access first
349 Require (t1.size () <= CAPACITY);
350 inherited::operator= (Memory::CopyBytes (t1, span<CHAR_T>{fBuf_}));
351 if (IncludesNullTerminator_ ()) {
352 Assert (t1.size () + 1 <= CAPACITY);
353 fBuf_[t1.size ()] = CHAR_T{'\0'};
354 }
355 }
356 Rep () = delete;
357 Rep (const Rep&) = delete;
358
359 public:
360 nonvirtual Rep& operator= (const Rep&) = delete;
361
362 public:
363 // String::_IRep OVERRIDES
364 virtual const wchar_t* c_str_peek () const noexcept override
365 {
366 if (IncludesNullTerminator_ ()) {
367 Assert (*(this->_fData.data () + this->_fData.size ()) == '\0'); // dont index into buf cuz we cheat and go one past end on purpose
368 return reinterpret_cast<const wchar_t*> (this->_fData.data ());
369 }
370 else {
371 return nullptr;
372 }
373 }
374 };
375 };
376
377 /**
378 * For static full app lifetime string constants...
379 */
380 struct StringConstant_ : public StringRepHelperAllFitInSize_ {
381 using inherited = String;
382
383 template <IUNICODECanUnambiguouslyConvertFrom CHAR_T>
384 class DirectIndexRep final : public StringRepHelperAllFitInSize_::Rep<CHAR_T>,
385 public Memory::UseBlockAllocationIfAppropriate<Rep<CHAR_T>> {
386 private:
387 using inherited = StringRepHelperAllFitInSize_::Rep<CHAR_T>;
388
389 public:
390 DirectIndexRep (span<const CHAR_T> s)
391 : inherited{s} // don't copy memory - but copy raw pointers! So they MUST BE (externally promised) 'externally owned for the application lifetime and constant' - like c++ string constants
392 {
393 }
394
395 public:
396 // String::_IRep OVERRIDES
397 virtual const wchar_t* c_str_peek () const noexcept override
398 {
399 return nullptr;
400 }
401 };
402 };
403
404 /*
405 * Used for String{move(some_string)}
406 */
407 struct StdStringDelegator_ : public StringRepHelperAllFitInSize_ {
408 using inherited = String;
409
410 template <IStdBasicStringCompatibleCharacter CHAR_T>
411 class Rep final : public StringRepHelperAllFitInSize_::Rep<CHAR_T>, public Memory::UseBlockAllocationIfAppropriate<Rep<CHAR_T>> {
412 private:
413 using inherited = StringRepHelperAllFitInSize_::Rep<CHAR_T>;
414
415 public:
416 Rep (basic_string<CHAR_T>&& s)
417 : inherited{span<const CHAR_T>{}}
418 , fMovedData_{move (s)}
419 {
420 inherited::operator= (span{fMovedData_.data (), fMovedData_.size ()}); // must grab after move
421 }
422
423 public:
424 // String::_IRep OVERRIDES
425 virtual const wchar_t* c_str_peek () const noexcept override
426 {
427 if constexpr (same_as<CHAR_T, wchar_t>) {
428 return fMovedData_.c_str ();
429 }
430 else {
431 return nullptr;
432 }
433 }
434
435 private:
436 basic_string<CHAR_T> fMovedData_;
437 };
438 };
439
440 /**
441 * Delegate to original String::Rep, and add in support for c_str ()
442 */
443 struct StringWithCStr_ : public String {
444 public:
445 class Rep final : public _IRep, public Memory::UseBlockAllocationIfAppropriate<Rep> {
446 private:
447 shared_ptr<_IRep> fUnderlyingRep_;
448 wstring fCString_;
449
450 public:
451 // Caller MUST ASSURE generates right size of Rep based on size in underlyingRepPDS
452 Rep (const shared_ptr<_IRep>& underlyingRep)
453 : fUnderlyingRep_{underlyingRep}
454 , fCString_{}
455 {
456 Memory::StackBuffer<wchar_t> possibleUsedBuf;
457 auto wideSpan = String::GetData<wchar_t> (underlyingRep->PeekData (nullopt), &possibleUsedBuf);
458 fCString_.assign (wideSpan.begin (), wideSpan.end ());
459 }
460
461 // Overrides for Iterable<Character>
462 public:
463 virtual shared_ptr<Iterable<Character>::_IRep> Clone () const override
464 {
465 return fUnderlyingRep_->Clone ();
466 }
467 virtual Traversal::Iterator<value_type> MakeIterator () const override
468 {
469 return fUnderlyingRep_->MakeIterator ();
470 }
471 virtual size_t size () const override
472 {
473 return fUnderlyingRep_->size ();
474 }
475 virtual bool empty () const override
476 {
477 return fUnderlyingRep_->empty ();
478 }
479 virtual Traversal::Iterator<value_type> Find (const function<bool (ArgByValueType<value_type> item)>& that,
480 [[maybe_unused]] Execution::SequencePolicy seq) const override
481 {
482 return fUnderlyingRep_->Find (that, seq);
483 }
484
485 // String::_IRep overrides - delegate
486 public:
487 virtual Character GetAt (size_t index) const noexcept override
488 {
489 return fUnderlyingRep_->GetAt (index);
490 }
491 virtual PeekSpanData PeekData ([[maybe_unused]] optional<PeekSpanData::StorageCodePointType> preferred) const noexcept override
492 {
493 return fUnderlyingRep_->PeekData (preferred);
494 }
495 virtual const wchar_t* c_str_peek () const noexcept override
496 {
497 return fCString_.c_str ();
498 }
499 };
500 };
501}
502
namespace {
    /**
     *  Adapter deriving from a locale facet type and declaring a public destructor, so the facet
     *  can be created and destroyed as an ordinary object.
     *
     *  All constructor arguments are forwarded unchanged to FACET.
     */
    template <typename FACET>
    struct deletable_facet_ final : FACET {
        template <typename... CTOR_ARGS>
        deletable_facet_ (CTOR_ARGS&&... ctorArgs)
            : FACET{std::forward<CTOR_ARGS> (ctorArgs)...}
        {
        }
        ~deletable_facet_ () = default;
    };
}
514
515/*
516 ********************************************************************************
517 ******* Characters::Private_::RegularExpression_GetCompiled ********************
518 ********************************************************************************
519 */
520const wregex& Characters::Private_::RegularExpression_GetCompiled (const RegularExpression& regExp)
521{
522 return regExp.GetCompiled ();
523}
524
525/*
526 ********************************************************************************
527 ************************************* String ***********************************
528 ********************************************************************************
529 */
530shared_ptr<String::_IRep> String::CTORFromBasicStringView_ (const basic_string_view<ASCII>& str)
531{
532 RequireExpression (Character::IsASCII (span{str.data (), str.size ()}));
533 return MakeSharedPtr<StringConstant_::DirectIndexRep<ASCII>> (span{str.data (), str.size ()});
534}
535
536shared_ptr<String::_IRep> String::CTORFromBasicStringView_ (const basic_string_view<char8_t>& str)
537{
538 if (Character::IsASCII (span{str.data (), str.size ()})) {
539 return MakeSharedPtr<StringConstant_::DirectIndexRep<ASCII>> (Memory::SpanBytesCast<span<const ASCII>> (span{str.data (), str.size ()}));
540 }
541 else {
542 return mk_ (span<const char8_t>{str.data (), str.size ()}); // copies data
543 }
544}
545
546shared_ptr<String::_IRep> String::CTORFromBasicStringView_ (const basic_string_view<char16_t>& str)
547{
548 if (UTFConvert::AllFitsInTwoByteEncoding (span{str})) {
549 return MakeSharedPtr<StringConstant_::DirectIndexRep<char16_t>> (span{str.data (), str.size ()});
550 }
551 else {
552 return mk_ (span<const char16_t>{str.data (), str.size ()}); // copies data
553 }
554}
555
556shared_ptr<String::_IRep> String::CTORFromBasicStringView_ (const basic_string_view<char32_t>& str)
557{
558 return MakeSharedPtr<StringConstant_::DirectIndexRep<char32_t>> (span{str.data (), str.size ()});
559}
560
561shared_ptr<String::_IRep> String::CTORFromBasicStringView_ (const basic_string_view<wchar_t>& str)
562{
563 return MakeSharedPtr<StringConstant_::DirectIndexRep<wchar_t>> (span{str.data (), str.size ()});
564}
565
566String String::FromStringConstant (span<const ASCII> s)
567{
568 Require (Character::IsASCII (s));
569 return String{MakeSharedPtr<StringConstant_::DirectIndexRep<ASCII>> (s)};
570}
571
572String String::FromStringConstant (span<const char16_t> s)
573{
575 return String{MakeSharedPtr<StringConstant_::DirectIndexRep<char16_t>> (s)};
576 }
577 else {
578 return String{s};
579 }
580}
581
582String String::FromStringConstant (span<const char32_t> s)
583{
584 return String{MakeSharedPtr<StringConstant_::DirectIndexRep<char32_t>> (s)};
585}
586
587String String::FromNarrowString (span<const char> s, const locale& l)
588{
589 // Note: this could use CodeCvt, but directly using std::codecvt in this case pretty simple, and
590 // more efficient this way --LGP 2023-02-14
591
592 // See http://en.cppreference.com/w/cpp/locale/codecvt/~codecvt
593 using Destructible_codecvt_byname = deletable_facet_<codecvt_byname<wchar_t, char, mbstate_t>>;
594 Destructible_codecvt_byname cvt{l.name ()};
595
596 // http://en.cppreference.com/w/cpp/locale/codecvt/in
597 mbstate_t mbstate{};
598 Memory::StackBuffer<wchar_t> targetBuf{s.size ()};
599 const char* from_next;
600 wchar_t* to_next;
601 codecvt_base::result result =
602 cvt.in (mbstate, s.data (), s.data () + s.size (), from_next, targetBuf.data (), targetBuf.data () + targetBuf.size (), to_next);
603 if (result != codecvt_base::ok) [[unlikely]] {
604 static const auto kException_ = Execution::RuntimeErrorException{"Error converting locale multibyte string to UNICODE"sv};
605 Execution::Throw (kException_);
606 }
607 return String{span<const wchar_t>{targetBuf.data (), static_cast<size_t> (to_next - targetBuf.data ())}};
608}
609
610shared_ptr<String::_IRep> String::mkEmpty_ ()
611{
612 static constexpr wchar_t kEmptyCStr_[] = L"";
613 static const shared_ptr<_IRep> s_ = MakeSharedPtr<StringConstant_::DirectIndexRep<wchar_t>> (span{std::begin (kEmptyCStr_), 0});
614 return s_;
615}
616
617template <typename CHAR_T>
618inline auto String::mk_nocheck_ (span<const CHAR_T> s) -> shared_ptr<_IRep>
619 requires (same_as<CHAR_T, ASCII> or same_as<CHAR_T, Latin1> or same_as<CHAR_T, char16_t> or same_as<CHAR_T, char32_t>)
620{
621 // No check means needed checking done before, so these assertions just help enforce that
622 if constexpr (same_as<CHAR_T, ASCII>) {
623 Require (Character::IsASCII (s)); // avoid later assertion error
624 }
625 else if constexpr (same_as<CHAR_T, Latin1>) {
626 // nothing to check
627 }
628 else if constexpr (sizeof (CHAR_T) == 2) {
629 Require (UTFConvert::AllFitsInTwoByteEncoding (s)); // avoid later assertion error
630 }
631 else {
632 // again - if larger, nothing to check
633 }
634
635 /**
636 * We want to TARGET using block-allocator of 64 bytes. This works well for typical (x86) machine
637 * caches, and divides up nicely, and leaves enuf room for a decent number of characters typically.
638 *
639 * So compute/guestimate a few sizes, and add static_asserts to check where we can. Often if these fail
640 * you can just get rid/or fix them. Not truly counted on, just trying ot generate vaguely reasonable
641 * number of characters to use.
642 */
643 constexpr size_t kBaseOfFixedBufSize_ = sizeof (StringRepHelperAllFitInSize_::Rep<CHAR_T>);
644 static_assert (kBaseOfFixedBufSize_ < 64); // this code below assumes, so must re-tune if this ever fails
645 if constexpr (qStroika_Foundation_Common_Platform_Windows and not qStroika_Foundation_Debug_AssertionsChecked) {
646 static_assert (kBaseOfFixedBufSize_ == 3 * sizeof (void*));
647 if constexpr (sizeof (void*) == 4) {
648 static_assert (kBaseOfFixedBufSize_ == 12);
649 }
650 else if constexpr (sizeof (void*) == 8) {
651 static_assert (kBaseOfFixedBufSize_ == 24);
652 }
653 }
654 constexpr size_t kOverheadSizeForMakeShared_ =
655 qStroika_Foundation_Common_Platform_Windows ? (sizeof (void*) == 4 ? 12 : 16) : sizeof (unsigned long) * 2;
656#if qStroika_Foundation_Common_Platform_Windows
657 static_assert (kOverheadSizeForMakeShared_ == sizeof (_Ref_count_base)); // not critically counted on, just to debug/fix sizes
658#endif
659 static constexpr size_t kNElts1_ = (64 - kBaseOfFixedBufSize_ - kOverheadSizeForMakeShared_) / sizeof (CHAR_T);
660 static constexpr size_t kNElts2_ = (96 - kBaseOfFixedBufSize_ - kOverheadSizeForMakeShared_) / sizeof (CHAR_T);
661 static constexpr size_t kNElts3_ = (128 - kBaseOfFixedBufSize_ - kOverheadSizeForMakeShared_) / sizeof (CHAR_T);
662
663 // These checks are NOT important, just for documentation/reference
664 if constexpr (qStroika_Foundation_Common_Platform_Windows and sizeof (CHAR_T) == 1 and not qStroika_Foundation_Debug_AssertionsChecked) {
665 if constexpr (sizeof (void*) == 4) {
666 static_assert (kNElts1_ == 40);
667 static_assert (kNElts2_ == 72);
668 static_assert (kNElts3_ == 104);
669 }
670 if constexpr (sizeof (void*) == 8) {
671 static_assert (kNElts1_ == 24);
672 static_assert (kNElts2_ == 56);
673 static_assert (kNElts3_ == 88);
674 }
675 }
676
677 static_assert (qStroika_Foundation_Debug_AssertionsChecked or kNElts1_ >= 6); // crazy otherwise
678 static_assert (kNElts2_ > kNElts1_); // ""
679 static_assert (kNElts3_ > kNElts2_); // ""
680
681 static_assert (sizeof (FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts1_>) == 64 - kOverheadSizeForMakeShared_); // not quite guaranteed but close
682 static_assert (sizeof (FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts2_>) == 96 - kOverheadSizeForMakeShared_); // ""
683 static_assert (sizeof (FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts3_>) == 128 - kOverheadSizeForMakeShared_); // ""
684
685 size_t sz = s.size ();
686 if (sz <= kNElts1_) {
687 return MakeSharedPtr<FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts1_>> (s);
688 }
689 else if (sz <= kNElts2_) {
690 return MakeSharedPtr<FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts2_>> (s);
691 }
692 else if (sz <= kNElts3_) {
693 return MakeSharedPtr<FixedCapacityInlineStorageString_::Rep<CHAR_T, kNElts3_>> (s);
694 }
695 return MakeSharedPtr<DynamicallyAllocatedString::Rep<CHAR_T>> (s);
696}
697
698template <>
699auto String::mk_ (basic_string<char>&& s) -> shared_ptr<_IRep>
700{
701 Character::CheckASCII (span{s.data (), s.size ()});
702 return MakeSharedPtr<StdStringDelegator_::Rep<ASCII>> (move (s));
703}
704
705template <>
706auto String::mk_ (basic_string<char16_t>&& s) -> shared_ptr<_IRep>
707{
708 if (UTFConvert::AllFitsInTwoByteEncoding (Memory::ConstSpan (span{s.data (), s.size ()}))) {
709 return MakeSharedPtr<StdStringDelegator_::Rep<char16_t>> (move (s));
710 }
711 // copy the data if any surrogates
712 Memory::StackBuffer<char32_t> wideUnicodeBuf{Memory::eUninitialized, UTFConvert::ComputeTargetBufferSize<char32_t> (span{s.data (), s.size ()})};
713 return mk_nocheck_ (Memory::ConstSpan (UTFConvert::kThe.ConvertSpan (span{s.data (), s.size ()}, span{wideUnicodeBuf})));
714}
715
716template <>
717auto String::mk_ (basic_string<char32_t>&& s) -> shared_ptr<_IRep>
718{
719 return MakeSharedPtr<StdStringDelegator_::Rep<char32_t>> (move (s));
720}
721
722template <>
723auto String::mk_ (basic_string<wchar_t>&& s) -> shared_ptr<_IRep>
724{
725 if constexpr (sizeof (wchar_t) == 2) {
726 if (UTFConvert::AllFitsInTwoByteEncoding (Memory::ConstSpan (span{s.data (), s.size ()}))) {
727 return MakeSharedPtr<StdStringDelegator_::Rep<wchar_t>> (move (s));
728 }
729 // copy the data if any surrogates
730 Memory::StackBuffer<char32_t> wideUnicodeBuf{Memory::eUninitialized,
731 UTFConvert::ComputeTargetBufferSize<char32_t> (span{s.data (), s.size ()})};
732 return mk_nocheck_ (Memory::ConstSpan (UTFConvert::kThe.ConvertSpan (span{s.data (), s.size ()}, span{wideUnicodeBuf})));
733 }
734 else {
735 return MakeSharedPtr<StdStringDelegator_::Rep<wchar_t>> (move (s));
736 }
737}
738
739String String::Concatenate_ (const String& rhs) const
740{
741 // KISS, simple default 'fall-thru' case
743 span leftSpan = GetData (&ignoredA);
745 span rightSpan = rhs.GetData (&ignoredB);
746 Memory::StackBuffer<char32_t> buf{Memory::eUninitialized, leftSpan.size () + rightSpan.size ()};
747 copy (leftSpan.begin (), leftSpan.end (), buf.data ());
748 copy (rightSpan.begin (), rightSpan.end (), buf.data () + leftSpan.size ());
749 return mk_ (span{buf});
750}
751
752void String::SetCharAt (Character c, size_t i)
753{
754 // @Todo - redo with check if char is actually changing and if so use
755 // mk/4 4 arg string maker instead.??? Or some such...
756 Require (i >= 0);
757 Require (i < size ());
758 // Expensive, but you can use StringBuilder directly to avoid the performance costs
759 StringBuilder sb{*this};
760 Require (i < size ());
761 sb.SetAt (c, i);
762 *this = sb;
763}
764
765String String::InsertAt (span<const Character> s, size_t at) const
766{
767 Require (at >= 0);
768 Require (at <= size ());
769 if (s.empty ()) {
770 return *this;
771 }
773 span<const Character> thisStrData = GetData (&ignored1);
774 StringBuilder sb{thisStrData.subspan (0, at)};
775 sb.Append (s);
776 sb.Append (thisStrData.subspan (at));
777 return sb;
778}
779
780String String::RemoveAt (size_t from, size_t to) const
781{
782 Require (from <= to);
783 Require (to <= size ());
784 if (from == to) {
785 return *this;
786 }
787 if (from == 0) {
788 return SubString (to);
789 }
790 _SafeReadRepAccessor accessor{this};
791 size_t length = accessor._ConstGetRep ().size ();
792 if (to == length) {
793 return SubString (0, from);
794 }
795 else {
797 span d = GetData (&ignored1);
798 Memory::StackBuffer<char32_t> buf{Memory::eUninitialized, d.size () - (to - from)};
799 span<char32_t> bufSpan{buf.data (), buf.size ()};
800 span s1 = d.subspan (0, from);
801 span s2 = d.subspan (to);
802 Memory::CopyBytes (s1, bufSpan);
803 Memory::CopyBytes (s2, bufSpan.subspan (s1.size ()));
804 return String{mk_ (bufSpan)};
805 }
806}
807
809{
810 String tmp = {*this};
811 if (auto o = tmp.Find (c, eWithCase)) {
812 return tmp.RemoveAt (*o);
813 }
814 return tmp;
815}
816String String::RemoveFirstIf (const String& subString) const
817{
818 if (auto o = this->Find (subString, eWithCase)) {
819 return this->SubString (0, *o) + this->SubString (*o + subString.length ());
820 }
821 return *this;
822}
823
825{
826 // @todo REIMPL WITH STRINGBUILDER
827 // quick and dirty inefficient implementation
828 String tmp = {*this};
829 while (auto o = tmp.Find (c, eWithCase)) {
830 tmp = tmp.RemoveAt (*o);
831 }
832 return tmp;
833}
834String String::RemoveAll (const String& subString) const
835{
836 // @todo REIMPL WITH STRINGBUILDER
837 // quick and dirty inefficient implementation
838 String tmp = {*this};
839 while (auto o = tmp.Find (subString, eWithCase)) {
840 tmp = tmp.SubString (0, *o) + tmp.SubString (*o + subString.length ());
841 }
842 return tmp;
843}
844
845optional<size_t> String::Find (Character c, size_t startAt, CompareOptions co) const
846{
847 PeekSpanData pds = GetPeekSpanData<ASCII> ();
848 // OPTIMIZED PATHS: Common case(s) and should be fast
850 if (c.IsASCII ()) {
851 span<const char> examineSpan = pds.fAscii.subspan (startAt);
852 if (co == eWithCase) {
853 if (auto i = std::find (examineSpan.begin (), examineSpan.end (), c.GetAsciiCode ()); i != examineSpan.end ()) {
854 return i - examineSpan.begin () + startAt;
855 }
856 }
857 else {
858 char lc = c.ToLowerCase ().GetAsciiCode ();
859 size_t reportIdx = startAt;
860 for (auto ci : examineSpan) {
861 if (tolower (ci) == lc) {
862 return reportIdx;
863 }
864 ++reportIdx;
865 }
866 }
867 return nullopt; // not found, possibly cuz not ascii
868 }
869 }
870 // fallback on more generic algorithm - and copy to full character objects
871 //
872 // performance notes
873 // Could iterate using CharAt() and that would perform better in the case where you find c early
874 // in a string, and the string is short. The problem with the current code is that it converts the
875 // entire string (could be long) and then might not look at much of the converted data.
876 // on the other hand, if our reps are either 'ascii or char32_t wide' - which we may end up with - then
877 // this isn't too bad - cuz no copying for char32_ case either...
878 Memory::StackBuffer<Character> maybeIgnoreBuf;
879 span<const Character> charSpan = GetData (pds, &maybeIgnoreBuf);
880 Require (startAt <= charSpan.size ());
881 span<const Character> examineSpan = charSpan.subspan (startAt);
882 switch (co) {
883 case eCaseInsensitive: {
884 Character lcc = c.ToLowerCase ();
885 for (auto i = examineSpan.begin (); i != examineSpan.end (); ++i) {
886 if (i->ToLowerCase () == lcc) {
887 return startAt + (i - examineSpan.begin ());
888 }
889 }
890 } break;
891 case eWithCase: {
892 if (auto i = std::find (examineSpan.begin (), examineSpan.end (), c); i != examineSpan.end ()) {
893 return startAt + i - examineSpan.begin ();
894 }
895 } break;
896 }
897 return nullopt; // not found any which way
898}
899
900optional<size_t> String::Find (const String& subString, size_t startAt, CompareOptions co) const
901{
902 //@todo: FIX HORRIBLE PERFORMANCE!!!
903 _SafeReadRepAccessor accessor{this};
904 Require (startAt <= accessor._ConstGetRep ().size ());
905
906 size_t subStrLen = subString.size ();
907 if (subStrLen == 0) {
908 return (accessor._ConstGetRep ().size () == 0) ? optional<size_t>{} : 0;
909 }
910 if (accessor._ConstGetRep ().size () < subStrLen) {
911 return {}; // important test cuz size_t is unsigned
912 }
913
914 size_t limit = accessor._ConstGetRep ().size () - subStrLen;
915 switch (co) {
916 case eCaseInsensitive: {
917 for (size_t i = startAt; i <= limit; ++i) {
918 for (size_t j = 0; j < subStrLen; ++j) {
919 if (accessor._ConstGetRep ().GetAt (i + j).ToLowerCase () != subString[j].ToLowerCase ()) {
920 goto nogood1;
921 }
922 }
923 return i;
924 nogood1:;
925 }
926 } break;
927 case eWithCase: {
928 for (size_t i = startAt; i <= limit; ++i) {
929 for (size_t j = 0; j < subStrLen; ++j) {
930 if (accessor._ConstGetRep ().GetAt (i + j) != subString[j]) {
931 goto nogood2;
932 }
933 }
934 return i;
935 nogood2:;
936 }
937 } break;
938 }
939 return {};
940}
941
942optional<pair<size_t, size_t>> String::Find (const RegularExpression& regEx, size_t startAt) const
943{
944 Require (startAt <= size ());
945 wstring tmp = As<wstring> ();
946 Require (startAt < tmp.size ());
947 tmp = tmp.substr (startAt);
948 wsmatch res;
949 regex_search (tmp, res, regEx.GetCompiled ());
950 if (res.size () >= 1) {
951 size_t startOfMatch = startAt + res.position ();
952 return pair<size_t, size_t>{startOfMatch, startOfMatch + res.length ()};
953 }
954 return {};
955}
956
957Containers::Sequence<size_t> String::FindEach (const String& string2SearchFor, CompareOptions co) const
958{
959 vector<size_t> result;
960 for (optional<size_t> i = Find (string2SearchFor, 0, co); i; i = Find (string2SearchFor, *i, co)) {
961 result.push_back (*i);
962 *i += string2SearchFor.length (); // this cannot point past end of this string because we FOUND string2SearchFor
963 }
964 return Containers::Concrete::Sequence_stdvector{move (result)};
965}
966
968{
969 vector<pair<size_t, size_t>> result;
970 //@TODO - FIX - IF we get back zero length match
971 wstring tmp{As<wstring> ()};
972 wsmatch res;
973 regex_search (tmp, res, regEx.GetCompiled ());
974 size_t nMatches = res.size ();
975 result.reserve (nMatches);
976 for (size_t mi = 0; mi < nMatches; ++mi) {
977 size_t matchLen = res.length (mi); // avoid populating with lots of empty matches - special case of empty search
978 if (matchLen != 0) {
979 result.push_back (pair<size_t, size_t>{res.position (mi), matchLen});
980 }
981 }
982 return Containers::Concrete::Sequence_stdvector{move (result)};
983}
984
986{
987 vector<RegularExpressionMatch> result;
988 wstring tmp{As<wstring> ()};
989 for (wsregex_iterator i = wsregex_iterator{tmp.begin (), tmp.end (), regEx.GetCompiled ()}; i != wsregex_iterator (); ++i) {
990 wsmatch match{*i};
991 Assert (match.size () != 0);
992 size_t n = match.size ();
994 for (size_t j = 1; j < n; ++j) {
995 s.Append (match.str (j));
996 }
997 result.push_back (RegularExpressionMatch{match.str (0), s});
998 }
999 return Containers::Concrete::Sequence_stdvector{move (result)};
1000}
1001
1003{
1004 vector<String> result;
1005 wstring tmp{As<wstring> ()};
1006 for (wsregex_iterator i = wsregex_iterator{tmp.begin (), tmp.end (), regEx.GetCompiled ()}; i != wsregex_iterator (); ++i) {
1007 result.push_back (String{i->str ()});
1008 }
1009 return Containers::Concrete::Sequence_stdvector{move (result)};
1010}
1011
1012optional<size_t> String::RFind (Character c) const noexcept
1013{
1014 //@todo: FIX HORRIBLE PERFORMANCE!!!
1015 _SafeReadRepAccessor accessor{this};
1016 const _IRep& useRep = accessor._ConstGetRep ();
1017 size_t length = useRep.size ();
1018 for (size_t i = length; i > 0; --i) {
1019 if (useRep.GetAt (i - 1) == c) {
1020 return i - 1;
1021 }
1022 }
1023 return nullopt;
1024}
1025
1026optional<size_t> String::RFind (const String& subString) const
1027{
1028 //@todo: FIX HORRIBLE PERFORMANCE!!!
1029 /*
1030 * Do quickie implementation, and don't worry about efficiency...
1031 */
1032 size_t subStrLen = subString.size ();
1033 if (subStrLen == 0) {
1034 return ((size () == 0) ? optional<size_t>{} : size () - 1);
1035 }
1036
1037 size_t limit = size () - subStrLen + 1;
1038 for (size_t i = limit; i > 0; --i) {
1039 if (SubString (i - 1, i - 1 + subStrLen) == subString) {
1040 return i - 1;
1041 }
1042 }
1043 return nullopt;
1044}
1045
1046String String::Replace (size_t from, size_t to, const String& replacement) const
1047{
1049 span<const wchar_t> thisSpan = GetData (&ignored);
1050 Require (from <= to);
1051 Require (to <= this->size ());
1052 Assert (to < thisSpan.size ());
1053 StringBuilder sb{thisSpan.subspan (0, from)};
1054 sb.Append (replacement);
1055 sb.Append (thisSpan.subspan (to));
1056 Ensure (sb == SubString (0, from) + replacement + SubString (to));
1057 return sb;
1058}
1059
1060bool String::StartsWith (const Character& c, CompareOptions co) const
1061{
1062 _SafeReadRepAccessor accessor{this};
1063 if (accessor._ConstGetRep ().size () == 0) {
1064 return false;
1065 }
1066 return Character::EqualsComparer{co}(accessor._ConstGetRep ().GetAt (0), c);
1067}
1068
1069bool String::StartsWith (const String& subString, CompareOptions co) const
1070{
1071 Require (not subString.empty ());
1072 if (subString.size () > size ()) {
1073 return false;
1074 }
1075#if qStroika_Foundation_Debug_AssertionsChecked
1076 bool referenceResult = ThreeWayComparer{co}(SubString (0, subString.size ()), subString) == 0;
1077#endif
1078 Memory::StackBuffer<Character> maybeIgnoreBuf1;
1079 Memory::StackBuffer<Character> maybeIgnoreBuf2;
1080 span<const Character> subStrData = subString.GetData (&maybeIgnoreBuf1);
1081 span<const Character> thisData = GetData (&maybeIgnoreBuf2);
1082 bool result = Character::Compare (thisData.subspan (0, subStrData.size ()), subStrData, co) == 0;
1083#if qStroika_Foundation_Debug_AssertionsChecked
1084 Ensure (result == referenceResult);
1085#endif
1086 return result;
1087}
1088
1089bool String::EndsWith (const Character& c, CompareOptions co) const
1090{
1091 _SafeReadRepAccessor accessor{this};
1092 const _IRep& useRep = accessor._ConstGetRep ();
1093 size_t thisStrLen = useRep.size ();
1094 if (thisStrLen == 0) {
1095 return false;
1096 }
1097 return Character::EqualsComparer{co}(useRep.GetAt (thisStrLen - 1), c);
1098}
1099
1100bool String::EndsWith (const String& subString, CompareOptions co) const
1101{
1102 Require (not subString.empty ());
1103 _SafeReadRepAccessor subStrAccessor{&subString};
1104 _SafeReadRepAccessor accessor{this};
1105 size_t thisStrLen = accessor._ConstGetRep ().size ();
1106 size_t subStrLen = subString.size ();
1107 if (subStrLen > thisStrLen) {
1108 return false;
1109 }
1110#if qStroika_Foundation_Debug_AssertionsChecked
1111 bool referenceResult = String::EqualsComparer{co}(SubString (thisStrLen - subStrLen, thisStrLen), subString);
1112#endif
1113 Memory::StackBuffer<Character> maybeIgnoreBuf1;
1114 Memory::StackBuffer<Character> maybeIgnoreBuf2;
1115 span<const Character> subStrData = subString.GetData (&maybeIgnoreBuf1);
1116 span<const Character> thisData = GetData (&maybeIgnoreBuf2);
1117 bool result = Character::Compare (thisData.subspan (thisStrLen - subStrLen), subStrData, co) == 0;
1118#if qStroika_Foundation_Debug_AssertionsChecked
1119 Ensure (result == referenceResult);
1120#endif
1121 return result;
1122}
1123
1124String String::AssureEndsWith (const Character& c, CompareOptions co) const
1125{
1126 if (EndsWith (c, co)) {
1127 return *this;
1128 }
1129 StringBuilder sb = *this;
1130 sb.Append (c);
1131 return sb;
1132}
1133
1134bool String::Matches (const RegularExpression& regEx) const
1135{
1136 wstring tmp{As<wstring> ()};
1137 return regex_match (tmp.begin (), tmp.end (), regEx.GetCompiled ());
1138}
1139
1140bool String::Matches (const RegularExpression& regEx, Sequence<String>* matches) const
1141{
1142 RequireNotNull (matches);
1143 //tmphack
1144 wstring tmp{As<wstring> ()};
1145 wsmatch base_match;
1146 if (regex_match (tmp, base_match, regEx.GetCompiled ())) {
1147 matches->clear ();
1148 for (size_t i = 1; i < base_match.size (); ++i) {
1149 matches->Append (base_match[i].str ());
1150 }
1151 return true;
1152 }
1153 return false;
1154}
1155
1156String String::ReplaceAll (const RegularExpression& regEx, const String& with) const
1157{
1158 return String{regex_replace (As<wstring> (), regEx.GetCompiled (), with.As<wstring> ())};
1159}
1160
1161String String::ReplaceAll (const String& string2SearchFor, const String& with, CompareOptions co) const
1162{
1163 Require (not string2SearchFor.empty ());
1164 // simplistic quickie impl...
1165 String result{*this};
1166 optional<size_t> i{0};
1167 while ((i = result.Find (string2SearchFor, *i, co))) {
1168 result = result.SubString (0, *i) + with + result.SubString (*i + string2SearchFor.length ());
1169 *i += with.length ();
1170 }
1171 return result;
1172}
1173
1174String String::ReplaceAll (const function<bool (Character)>& replaceCharP, const String& with) const
1175{
1176 StringBuilder sb;
1177 for (Character i : *this) {
1178 if (replaceCharP (i)) {
1179 sb << with;
1180 }
1181 else {
1182 sb << i;
1183 }
1184 }
1185 return sb;
1186}
1187
1188String String::ReplaceAll (const Set<Character>& charSet, const String& with) const
1189{
1190 StringBuilder sb;
1191 for (Character i : *this) {
1192 if (charSet.Contains (i)) {
1193 sb << with;
1194 }
1195 else {
1196 sb << i;
1197 }
1198 }
1199 return sb;
1200}
1201
1203{
1204 PeekSpanData pds = GetPeekSpanData<ASCII> ();
1205 Memory::StackBuffer<Character> maybeIgnoreBuf;
1206 span<const Character> charSpan = GetData (pds, &maybeIgnoreBuf);
1207 StringBuilder sb;
1208 bool everChanged{false};
1209 for (auto ci = charSpan.begin (); ci != charSpan.end (); ++ci) {
1210 Character c = *ci;
1211 if (c == '\r') {
1212 // peek at next character - and if we have a CRLF sequence - then advance pointer
1213 // (so we skip next NL) and pretend this was an NL..
1214 if (ci + 1 != charSpan.end () and *(ci + 1) == '\n') {
1215 ++ci;
1216 }
1217 everChanged = true;
1218 c = '\n';
1219 }
1220 sb << c;
1221 }
1222 if (everChanged) {
1223 return sb;
1224 }
1225 else {
1226 return *this;
1227 }
1228}
1229
1230String String::NormalizeSpace (Character useSpaceCharacter) const
1231{
1232 return ReplaceAll ("\\s+"_RegEx, String{useSpaceCharacter});
1233}
1234
1239Sequence<String> String::Tokenize (const function<bool (Character)>& isTokenSeparator) const
1240{
1242 bool inToken = false;
1243 StringBuilder curToken;
1244 size_t len = size ();
1245 for (size_t i = 0; i != len; ++i) {
1246 Character c = GetCharAt (i);
1247 bool newInToken = not isTokenSeparator (c);
1248 if (inToken != newInToken) {
1249 if (inToken) {
1250 String s{curToken.str ()};
1251 r += s;
1252 curToken.clear ();
1253 inToken = false;
1254 }
1255 else {
1256 inToken = true;
1257 }
1258 }
1259 if (inToken) {
1260 curToken << c;
1261 }
1262 }
1263 if (inToken) {
1264 String s{curToken.str ()};
1265 r += s;
1266 }
1267 return r;
1268}
1269
1270Sequence<String> String::Tokenize (const RegularExpression& isSeparator) const
1271{
1273 size_t len = this->length ();
1274 for (size_t startAt = 0; startAt < len;) {
1275 if (optional<pair<size_t, size_t>> ofi = Find (isSeparator, startAt)) {
1276 Assert (ofi->first >= startAt);
1277 Assert (ofi->first <= ofi->second);
1278 if (ofi->first == ofi->second) [[unlikely]] {
1279 static const auto kException_ =
1280 Execution::RuntimeErrorException{"separator regular expression argument to Tokenize must be non-empty or not match"sv};
1281 Execution::Throw (kException_);
1282 }
1283 if (ofi->first > startAt) {
1284 r += SubString (startAt, ofi->first);
1285 }
1286 else {
1287 Assert (startAt == 0); // special case - start of string
1288 }
1289 startAt = ofi->second;
1290 Assert (startAt <= len);
1291 }
1292 else {
1293 r += SubString (startAt); // if no match, the rest of the string is a non-separator
1294 break;
1295 }
1296 }
1297 return r;
1298}
1299Sequence<String> String::Tokenize (const Set<Character>& delimiters) const
1300{
1301 /*
1302 * @todo Inefficient impl, to encourage code saving. Do more efficiently.
1303 */
1304 return Tokenize ([delimiters] (Character c) -> bool { return delimiters.Contains (c); });
1305}
1306
1308{
1310 StringBuilder curLineSB;
1311 for (auto i = this->MakeIterator (); i; ++i) {
1312 Character c = *i;
1313 // look for \r, \r\n, or \n
1314 switch (c.GetCharacterCode ()) {
1315 case '\r': {
1316 auto ii = i;
1317 ++ii;
1318 if (ii and *ii == '\n') {
1319 i = ii;
1320 }
1321 r += curLineSB.str ();
1322 curLineSB.clear ();
1323 break;
1324 }
1325 case '\n': {
1326 r += curLineSB.str ();
1327 curLineSB.clear ();
1328 break;
1329 }
1330 default: {
1331 curLineSB.push_back (c);
1332 break;
1333 }
1334 }
1335 }
1336 if (not curLineSB.empty ()) { // non-terminated lines included
1337 r += curLineSB.str ();
1338 }
1339 return r;
1340}
1341
1342Sequence<String> String::Grep (const String& fgrepArg) const
1343{
1345 for (auto i : AsLines ()) {
1346 if (i.Contains (fgrepArg)) {
1347 r += i;
1348 }
1349 }
1350 return r;
1351}
1352Sequence<String> String::Grep (const RegularExpression& egrepArg) const
1353{
1355 for (auto i : AsLines ()) {
1356 if (i.Matches (egrepArg)) {
1357 r += i;
1358 }
1359 }
1360 return r;
1361}
1362
1363optional<String> String::Col (size_t i) const
1364{
1365 static const RegularExpression kWS_ = "\\s+"_RegEx;
1366 return Col (i, kWS_);
1367}
1368
1369optional<String> String::Col (size_t i, const RegularExpression& separator) const
1370{
1371 return Tokenize (separator).Nth (i);
1372}
1373
1374String String::SubString_ (const _SafeReadRepAccessor& thisAccessor, size_t from, size_t to) const
1375{
1376 constexpr bool kWholeStringOptionization_ =
1377 false; // empirically, this costs about 1%. My WAG is that 1% cost not a good tradeoff cuz I dont think this gets triggered that often - LGP 2023-09-26
1378 Require (from <= to);
1379 Require (to <= this->size ());
1380
1381 // Could do this more simply, but since this function is a bottleneck, handle each representation case separately
1382 if (from == to) [[unlikely]] {
1383 return mkEmpty_ ();
1384 }
1385 PeekSpanData psd = thisAccessor._ConstGetRep ().PeekData (nullopt);
1386 switch (psd.fInCP) {
1387 case PeekSpanData::eAscii: {
1388 if constexpr (kWholeStringOptionization_) {
1389 if (from == 0 and to == psd.fAscii.size ()) [[unlikely]] {
1390 return *this; // unclear if this optimization is worthwhile
1391 }
1392 }
1393 return mk_nocheck_ (psd.fAscii.subspan (from, to - from)); // no check cuz we already know its all ASCII and nothing smaller
1394 }
1396 if constexpr (kWholeStringOptionization_) {
1397 if (from == 0 and to == psd.fSingleByteLatin1.size ()) [[unlikely]] {
1398 return *this; // unclear if this optimization is worthwhile
1399 }
1400 }
1401 return mk_ (psd.fSingleByteLatin1.subspan (from, to - from)); // note still needs to re-examine text, cuz subset maybe pure ascii (etc)
1402 }
1403 case PeekSpanData::eChar16: {
1404 if constexpr (kWholeStringOptionization_) {
1405 if (from == 0 and to == psd.fChar16.size ()) [[unlikely]] {
1406 return *this; // unclear if this optimization is worthwhile
1407 }
1408 }
1409 return mk_ (psd.fChar16.subspan (from, to - from)); // note still needs to re-examine text, cuz subset maybe pure ascii (etc)
1410 }
1411 case PeekSpanData::eChar32: {
1412 if constexpr (kWholeStringOptionization_) {
1413 if (from == 0 and to == psd.fChar32.size ()) [[unlikely]] {
1414 return *this; // unclear if this optimization is worthwhile
1415 }
1416 }
1417 return mk_ (psd.fChar32.subspan (from, to - from)); // note still needs to re-examine text, cuz subset maybe pure ascii (etc)
1418 }
1419 default:
1421 return String{};
1422 }
1423}
1424
1425String String::Repeat (unsigned int count) const
1426{
1427 switch (count) {
1428 case 0:
1429 return String{};
1430 case 1:
1431 return *this;
1432 case 2:
1433 return *this + *this;
1434 default: {
1435 StringBuilder result;
1436 for (unsigned int i = 0; i < count; ++i) {
1437 result << *this;
1438 }
1439 return result;
1440 }
1441 }
1442}
1443
1444String String::LTrim (bool (*shouldBeTrimmed) (Character)) const
1445{
1446 RequireNotNull (shouldBeTrimmed);
1447 auto referenceImpl = [&] () {
1448 _SafeReadRepAccessor accessor{this};
1449 size_t length = accessor._ConstGetRep ().size ();
1450 for (size_t i = 0; i < length; ++i) {
1451 if (not(*shouldBeTrimmed) (accessor._ConstGetRep ().GetAt (i))) {
1452 if (i == 0) {
1453 return *this; // no change in string
1454 }
1455 else {
1456 return SubString (i, length);
1457 }
1458 }
1459 }
1460 return String{}; // all trimmed
1461 };
1462 auto commonAlgorithm = [&]<typename T> (span<const T> lowLevelCharSpan) -> String {
1463 size_t length = lowLevelCharSpan.size ();
1464 for (size_t i = 0; i < length; ++i) {
1465 static_assert (Common::IAnyOf<T, ASCII, Latin1, char32_t>); // this works for ASCII, Latin1, char32_t, but for char16_t - not so much - trickier
1466 Character c{lowLevelCharSpan[i]};
1467 // drop not-so-subtle hint to optimizer this is likely the function, and can be called, and hopefully hoisted outside the loop, and inlined
1468 bool thisCharacterTrimmed = [&] () {
1469 if (shouldBeTrimmed == (bool (*) (Character))Character::IsWhitespace) [[likely]] {
1470 return Character::IsWhitespace (c);
1471 }
1472 else {
1473 return shouldBeTrimmed (c);
1474 }
1475 }();
1476 if (not thisCharacterTrimmed) {
1477 if (i == 0) {
1478#if qStroika_Foundation_Debug_AssertionsChecked
1479 Assert (*this == referenceImpl ());
1480#endif
1481 return *this; // no change in string
1482 }
1483 else {
1484#if qStroika_Foundation_Debug_AssertionsChecked
1485 Assert (mk_ (lowLevelCharSpan.subspan (i)) == referenceImpl ());
1486#endif
1487 return mk_ (lowLevelCharSpan.subspan (i));
1488 }
1489 }
1490 }
1491 return String{}; // all trimmed
1492 };
1493 _SafeReadRepAccessor accessor{this};
1494 PeekSpanData psd = accessor._ConstGetRep ().PeekData (nullopt);
1495 switch (psd.fInCP) {
1496 case PeekSpanData::eAscii: {
1497 return commonAlgorithm (psd.fAscii);
1498 }
1500 return commonAlgorithm (psd.fSingleByteLatin1);
1501 }
1502 case PeekSpanData::eChar32: {
1503 return commonAlgorithm (psd.fChar32);
1504 }
1505 }
1506 return referenceImpl (); // due to tricks with surrogates, and rarity, not worth worrying about char16_t case
1507}
1508
1509String String::RTrim (bool (*shouldBeTrimmed) (Character)) const
1510{
1511 RequireNotNull (shouldBeTrimmed);
1512 auto referenceImpl = [&] () {
1513 _SafeReadRepAccessor accessor{this};
1514 ptrdiff_t length = accessor._ConstGetRep ().size ();
1515 ptrdiff_t endOfFirstTrim = length;
1516 for (; endOfFirstTrim != 0; --endOfFirstTrim) {
1517 if ((*shouldBeTrimmed) (accessor._ConstGetRep ().GetAt (endOfFirstTrim - 1))) {
1518 // keep going backwards
1519 }
1520 else {
1521 break;
1522 }
1523 }
1524 if (endOfFirstTrim == 0) {
1525 return String{}; // all trimmed
1526 }
1527 else if (endOfFirstTrim == length) {
1528 return *this; // nothing trimmed
1529 }
1530 else {
1531 return SubString (0, endOfFirstTrim);
1532 }
1533 };
1534
1535 auto commonAlgorithm = [&]<typename T> (span<const T> lowLevelCharSpan) -> String {
1536 size_t length = lowLevelCharSpan.size ();
1537 ptrdiff_t endOfFirstTrim = length;
1538 for (; endOfFirstTrim != 0; --endOfFirstTrim) {
1539 static_assert (Common::IAnyOf<T, ASCII, Latin1, char32_t>); // this works for ASCII, Latin1, char32_t, but for char16_t - not so much - trickier
1540 Character c{lowLevelCharSpan[endOfFirstTrim - 1]};
1541 // drop not-so-subtle hint to optimizer this is likely the function, and can be called, and hopefully hoisted outside the loop, and inlined
1542 bool thisCharacterTrimmed = [&] () {
1543 if (shouldBeTrimmed == (bool (*) (Character))Character::IsWhitespace) [[likely]] {
1544 return Character::IsWhitespace (c);
1545 }
1546 else {
1547 return shouldBeTrimmed (c);
1548 }
1549 }();
1550 if (thisCharacterTrimmed) {
1551 // keep going backwards
1552 }
1553 else {
1554 break;
1555 }
1556 }
1557 if (endOfFirstTrim == 0) {
1558#if qStroika_Foundation_Debug_AssertionsChecked
1559 Assert (String{} == referenceImpl ());
1560#endif
1561 return String{}; // all trimmed
1562 }
1563 else if (static_cast<size_t> (endOfFirstTrim) == length) {
1564#if qStroika_Foundation_Debug_AssertionsChecked
1565 Assert (*this == referenceImpl ());
1566#endif
1567 return *this; // nothing trimmed
1568 }
1569 else {
1570#if qStroika_Foundation_Debug_AssertionsChecked
1571 Assert (mk_ (lowLevelCharSpan.subspan (0, endOfFirstTrim)) == referenceImpl ());
1572#endif
1573 return mk_ (lowLevelCharSpan.subspan (0, endOfFirstTrim)); //return SubString (0, endOfFirstTrim);
1574 }
1575 };
1576
1577 _SafeReadRepAccessor accessor{this};
1578 PeekSpanData psd = accessor._ConstGetRep ().PeekData (nullopt);
1579 switch (psd.fInCP) {
1580 case PeekSpanData::eAscii: {
1581 return commonAlgorithm (psd.fAscii);
1582 }
1584 return commonAlgorithm (psd.fSingleByteLatin1);
1585 }
1586 case PeekSpanData::eChar32: {
1587 return commonAlgorithm (psd.fChar32);
1588 }
1589 }
1590 return referenceImpl (); // due to tricks with surrogates, and rarity, not worth worrying about char16_t case
1591}
1592
1593String String::Trim (bool (*shouldBeTrimmed) (Character)) const
1594{
1595 RequireNotNull (shouldBeTrimmed);
1596
1597 auto referenceImpl = [&] () { return LTrim (shouldBeTrimmed).RTrim (shouldBeTrimmed); };
1598
1599 // declared here to encourage inlining the common case of Character::IsWhitespace
1600 auto useCharTrimmedFunc = [&] (Character c) {
1601 if (shouldBeTrimmed == (bool (*) (Character))Character::IsWhitespace) [[likely]] {
1602 return Character::IsWhitespace (c);
1603 }
1604 else {
1605 return shouldBeTrimmed (c);
1606 }
1607 };
1608
1609 auto commonAlgorithm = [&]<typename T> (span<const T> lowLevelCharSpan) -> String {
1610 size_t length = lowLevelCharSpan.size ();
1611 size_t firstKeptIdx = 0;
1612 for (; firstKeptIdx < length; ++firstKeptIdx) {
1613 static_assert (Common::IAnyOf<T, ASCII, Latin1, char32_t>); // this works for ASCII, Latin1, char32_t, but for char16_t - not so much - trickier
1614 Character c{lowLevelCharSpan[firstKeptIdx]};
1615 if (not useCharTrimmedFunc (c)) {
1616 break;
1617 }
1618 }
1619 ptrdiff_t endOfFirstTrim = length;
1620 for (; static_cast<size_t> (endOfFirstTrim) != firstKeptIdx; --endOfFirstTrim) {
1621 static_assert (Common::IAnyOf<T, ASCII, Latin1, char32_t>); // this works for ASCII, Latin1, char32_t, but for char16_t - not so much - trickier
1622 Character c{lowLevelCharSpan[endOfFirstTrim - 1]};
1623 if (useCharTrimmedFunc (c)) {
1624 // keep going backwards
1625 }
1626 else {
1627 break;
1628 }
1629 }
1630 if (firstKeptIdx == 0 and static_cast<size_t> (endOfFirstTrim) == length) {
1631#if qStroika_Foundation_Debug_AssertionsChecked
1632 Assert (*this == referenceImpl ());
1633#endif
1634 return *this; // nothing changed, just bump reference count on shared_ptr
1635 }
1636 if (firstKeptIdx == length) {
1637#if qStroika_Foundation_Debug_AssertionsChecked
1638 Assert (String{} == referenceImpl ());
1639#endif
1640 return String{}; // trimmed everything way
1641 }
1642 Assert (static_cast<ptrdiff_t> (firstKeptIdx) < endOfFirstTrim);
1643#if qStroika_Foundation_Debug_AssertionsChecked
1644 Assert (mk_ (lowLevelCharSpan.subspan (firstKeptIdx, endOfFirstTrim - firstKeptIdx)) == referenceImpl ());
1645#endif
1646 return mk_ (lowLevelCharSpan.subspan (firstKeptIdx, endOfFirstTrim - firstKeptIdx));
1647 };
1648
1649 _SafeReadRepAccessor accessor{this};
1650 PeekSpanData psd = accessor._ConstGetRep ().PeekData (nullopt);
1651 switch (psd.fInCP) {
1652 case PeekSpanData::eAscii: {
1653 return commonAlgorithm (psd.fAscii);
1654 }
1656 return commonAlgorithm (psd.fSingleByteLatin1);
1657 }
1658 case PeekSpanData::eChar32: {
1659 return commonAlgorithm (psd.fChar32);
1660 }
1661 }
1662 return referenceImpl (); // due to tricks with surrogates, and rarity, not worth worrying about char16_t case
1663}
1664
1665String String::StripAll (bool (*removeCharIf) (Character)) const
1666{
1667 RequireNotNull (removeCharIf);
1668
1669 // NB: optimize special case where removeCharIf is always false
1670 //
1671 // Walk string and find first character we need to remove
1672 StringBuilder<StringBuilder_Options<char32_t>> result{*this}; // StringBuilder_Options<char32_t> so operator[] is fast
1673 size_t n = result.size ();
1674 for (size_t i = 0; i < n; ++i) {
1675 Character c = result[i];
1676 if (removeCharIf (c)) {
1677 // on first removal, clone part of string done so far, and start appending
1678 StringBuilder tmp = result.As<String> ().SubString (0, i);
1679 // Now keep iterating IN THIS LOOP appending characters and return at the end of this loop
1680 ++i;
1681 for (; i < n; ++i) {
1682 c = result[i];
1683 if (not removeCharIf (c)) {
1684 tmp += c;
1685 }
1686 }
1687 return tmp;
1688 }
1689 }
1690 return *this; // if we NEVER get removeCharIf return false, just clone this
1691}
1692
1693String String::Join (const Iterable<String>& list, const String& separator)
1694{
1695 StringBuilder result;
1696 for (const String& i : list) {
1697 result << i << separator;
1698 }
1699 if (result.empty ()) {
1700 return result.str ();
1701 }
1702 else {
1703 return result.str ().SubString (0, -static_cast<int> (separator.size ()));
1704 }
1705}
1706
1708{
1709 StringBuilder result;
1710 bool changed{false}; // if no change, no need to allocate new object
1711 _SafeReadRepAccessor accessor{this};
1712 PeekSpanData psd = accessor._ConstGetRep ().PeekData (nullopt);
1713 if (psd.fInCP == PeekSpanData::eAscii) [[likely]] {
1714 // optimization but other case would work no matter what
1715 for (auto c : psd.fAscii) {
1716 if (isupper (c)) {
1717 changed = true;
1718 result.push_back (static_cast<ASCII> (tolower (c)));
1719 }
1720 else {
1721 result.push_back (c);
1722 }
1723 }
1724 }
1725 else {
1726 Memory::StackBuffer<Character> maybeIgnoreBuf1;
1727 for (Character c : GetData (psd, &maybeIgnoreBuf1)) {
1728 if (c.IsUpperCase ()) {
1729 changed = true;
1730 result.push_back (c.ToLowerCase ());
1731 }
1732 else {
1733 result.push_back (c);
1734 }
1735 }
1736 }
1737 if (changed) {
1738 return result.str ();
1739 }
1740 else {
1741 return *this;
1742 }
1743}
1744
1746{
1747 StringBuilder result;
1748 bool changed{false}; // if no change, no need to allocate new object
1749 _SafeReadRepAccessor accessor{this};
1750 PeekSpanData psd = accessor._ConstGetRep ().PeekData (nullopt);
1751 if (psd.fInCP == PeekSpanData::eAscii) [[likely]] {
1752 // optimization but other case would work no matter what
1753 for (auto c : psd.fAscii) {
1754 if (islower (c)) {
1755 changed = true;
1756 result.push_back (static_cast<ASCII> (toupper (c)));
1757 }
1758 else {
1759 result.push_back (c);
1760 }
1761 }
1762 }
1763 else {
1764 Memory::StackBuffer<Character> maybeIgnoreBuf1;
1765 for (Character c : GetData (psd, &maybeIgnoreBuf1)) {
1766 if (c.IsLowerCase ()) {
1767 changed = true;
1768 result.push_back (c.ToUpperCase ());
1769 }
1770 else {
1771 result.push_back (c);
1772 }
1773 }
1774 }
1775 if (changed) {
1776 return result.str ();
1777 }
1778 else {
1779 return *this;
1780 }
1781}
1782
1784{
1785 // It is all whitespace if the first non-whitespace character is 'EOF'
1786 return not Find ([] (Character c) -> bool { return not c.IsWhitespace (); });
1787}
1788
1789String String::LimitLength (size_t maxLen, StringShorteningPreference keepPref, const String& ellipsis) const
1790{
1791 // @todo Consider making this the 'REFERENCE' impl, and doing a specific one with a specific StringBuilder, and doing
1792 // the trim/split directly, if I see this show up in a profile, for performance sake --LGP 2023-12-11
1793 if (length () < maxLen) [[likely]] {
1794 return *this; // frequent optimization
1795 }
1796 String operateOn = [&] () {
1797 switch (keepPref) {
1798 case StringShorteningPreference::ePreferKeepLeft:
1799 return LTrim ();
1800 case StringShorteningPreference::ePreferKeepRight:
1801 return RTrim ();
1802 case StringShorteningPreference::ePreferKeepMid:
1803 return Trim (); // not sure we need to trim - but probably best
1804 default:
1806 return *this;
1807 }
1808 }();
1809 if (operateOn.length () <= maxLen) {
1810 return operateOn;
1811 }
1812 size_t useLen = [&] () {
1813 size_t useLen = maxLen;
1814 size_t ellipsisTotalLen = ellipsis.length ();
1815 if (keepPref == StringShorteningPreference::ePreferKeepMid) {
1816 ellipsisTotalLen *= 2;
1817 }
1818 if (useLen > ellipsisTotalLen) {
1819 useLen -= ellipsisTotalLen;
1820 }
1821 else {
1822 useLen = 0;
1823 }
1824 return useLen;
1825 }();
1826 switch (keepPref) {
1827 case StringShorteningPreference::ePreferKeepLeft:
1828 return operateOn.substr (0, useLen) + ellipsis;
1829 case StringShorteningPreference::ePreferKeepRight:
1830 return ellipsis + operateOn.substr (operateOn.length () - useLen);
1831 case StringShorteningPreference::ePreferKeepMid:
1832 return ellipsis + operateOn.substr (operateOn.length () / 2 - useLen / 2, useLen) + ellipsis;
1833 default:
1835 return *this;
1836 }
1837}
1838
1839string String::AsNarrowString (const locale& l) const
1840{
1841 // Note: this could use CodeCvt, but directly using std::codecvt in this case pretty simple, and
1842 // more efficient this way --LGP 2023-02-14
1843
1844 // See http://en.cppreference.com/w/cpp/locale/codecvt/~codecvt
1845 using Destructible_codecvt_byname = deletable_facet_<codecvt_byname<wchar_t, char, mbstate_t>>;
1846 Destructible_codecvt_byname cvt{l.name ()};
1847
1848 Memory::StackBuffer<wchar_t> maybeIgnoreBuf1;
1849 span<const wchar_t> thisData = GetData (&maybeIgnoreBuf1);
1850 // http://en.cppreference.com/w/cpp/locale/codecvt/out
1851 mbstate_t mbstate{};
1852 const wchar_t* from_next;
1853 char* to_next;
1854 Memory::StackBuffer<char> into{Memory::eUninitialized, thisData.size () * 5}; // not sure what size is always big enuf
1855 codecvt_base::result result =
1856 cvt.out (mbstate, thisData.data (), thisData.data () + thisData.size (), from_next, into.data (), into.end (), to_next);
1857 if (result != codecvt_base::ok) [[unlikely]] {
1858 static const auto kException_ = Execution::RuntimeErrorException{"Error converting locale multibyte string to UNICODE"sv};
1859 Execution::Throw (kException_);
1860 }
1861 return string{into.data (), to_next};
1862}
1863
1864string String::AsNarrowString (const locale& l, AllowMissingCharacterErrorsFlag) const
1865{
1866 // Note: this could use CodeCvt, but directly using std::codecvt in this case pretty simple, and
1867 // more efficient this way --LGP 2023-02-14
1868
1869 // See http://en.cppreference.com/w/cpp/locale/codecvt/~codecvt
1870 using Destructible_codecvt_byname = deletable_facet_<codecvt_byname<wchar_t, char, mbstate_t>>;
1871 Destructible_codecvt_byname cvt{l.name ()};
1872
1873 Memory::StackBuffer<wchar_t> maybeIgnoreBuf1;
1874 span<const wchar_t> thisData = GetData (&maybeIgnoreBuf1);
1875 // http://en.cppreference.com/w/cpp/locale/codecvt/out
1876 mbstate_t mbstate{};
1877 Memory::StackBuffer<char> into{Memory::eUninitialized, thisData.size () * 5}; // not sure what size is always big enuf
1878 const wchar_t* readFrom = thisData.data ();
1879 char* intoIndex = into.data ();
1880Again:
1881 const wchar_t* from_next{nullptr};
1882 char* to_next{nullptr};
1883 codecvt_base::result result = cvt.out (mbstate, readFrom, thisData.data () + thisData.size (), from_next, intoIndex, into.end (), to_next);
1884 if (result != codecvt_base::ok) [[unlikely]] {
1885 if (from_next != thisData.data () + thisData.size ()) {
1886 readFrom = from_next + 1; // unclear how much to skip (due to surrogates) - but likely this is a good guess
1887 *to_next = '?'; // write 'bad' character
1888 intoIndex = to_next + 1;
1889 goto Again;
1890 }
1891 }
1892 return string{into.data (), to_next};
1893}
1894
1895void String::erase (size_t from)
1896{
1897 *this = RemoveAt (from, size ());
1898}
1899
1900void String::erase (size_t from, size_t count)
1901{
1902 // http://stroika-bugs.sophists.com/browse/STK-445
1903 // @todo - NOT ENVELOPE THREADSAFE
1904 // MUST ACQUIRE ACCESSOR HERE - not just that RemoteAt threadsafe - but must SYNC at this point - need AssureExternallySycnonized stuff here!!!
1905 //
1906 // TODO: Double check STL definition - but I think they allow for count to be 'too much' - and silently trim to end...
1907 size_t max2Erase = static_cast<size_t> (max (static_cast<ptrdiff_t> (0), static_cast<ptrdiff_t> (size ()) - static_cast<ptrdiff_t> (from)));
1908 *this = RemoveAt (from, from + min (count, max2Erase));
1909}
1910
1911const wchar_t* String::c_str () const noexcept
1912{
1913 // UNSAFE - DEPRECATED - lose before v3 actually released -- LGP 2023-06-28
1915 DISABLE_COMPILER_GCC_WARNING_START ("GCC diagnostic ignored \"-Wdeprecated-declarations\"");
1916 DISABLE_COMPILER_CLANG_WARNING_START ("clang diagnostic ignored \"-Wdeprecated-declarations\"");
1917 return const_cast<String*> (this)->c_str ();
1918 DISABLE_COMPILER_MSC_WARNING_END (4996);
1919 DISABLE_COMPILER_GCC_WARNING_END ("GCC diagnostic ignored \"-Wdeprecated-declarations\"");
1920 DISABLE_COMPILER_CLANG_WARNING_END ("clang diagnostic ignored \"-Wdeprecated-declarations\"");
1921}
1922const wchar_t* String::c_str ()
1923{
1924 // DEPRECATED SINCE STROIKA v3.0d13
1925 // Rarely used mechanism, of replacing the underlying rep, for the iterable, as needed
1926 _SafeReadRepAccessor accessor{this};
1927 const wchar_t* result = accessor._ConstGetRep ().c_str_peek ();
1928 if (result == nullptr) {
1929 _fRep = MakeSharedPtr<StringWithCStr_::Rep> (accessor._ConstGetRepSharedPtr ());
1930 result = _SafeReadRepAccessor{this}._ConstGetRep ().c_str_peek ();
1931 AssertNotNull (result);
1932 }
1933 EnsureNotNull (result);
1934 Ensure (result[size ()] == '\0' or (::wcslen (result) > size () and sizeof (wchar_t) == 2)); // if there are surrogates, wcslen () might be larger than size
1935 return result;
1936}
1937
1938[[noreturn]] void String::ThrowInvalidAsciiException_ ()
1939{
1940 static const auto kException_ = Execution::RuntimeErrorException{"Error converting non-ascii text to string"sv};
1941 Execution::Throw (kException_);
1942}
1943
1944#if qStroika_Foundation_Characters_AsPathAutoMapMSYSAndCygwin
1945template <>
1946std::filesystem::path String::As<std::filesystem::path> () const
1947{
1948 // CYGWIN creates paths like /cygdrive/c/folder for c:/folder
1949 // MSYS creates paths like /c/folder for c:/folder
1950 static const String kMSYSDrivePrefix_ = "/"sv;
1951 static const String kCygrivePrefix_ = "/cygdrive/"sv;
1952 if (StartsWith (kCygrivePrefix_)) {
1953 String ss = SubString (kCygrivePrefix_.length ());
1954 if (ss.length () > 1 and ss[0].IsASCII () and ss[0].IsAlphabetic () and ss[1] == '/') {
1955 wstring w = ss.As<wstring> (); // now map c/folder to c:/folder
1956 w.insert (w.begin () + 1, ':');
1957 return filesystem::path{w};
1958 }
1959 }
1960 if (StartsWith (kMSYSDrivePrefix_)) {
1961 String ss = SubString (kMSYSDrivePrefix_.length ());
1962 if (ss.length () > 1 and ss[0].IsASCII () and ss[0].IsAlphabetic () and ss[1] == '/') {
1963 wstring w = ss.As<wstring> (); // now map c/folder to c:/folder
1964 w.insert (w.begin () + 1, ':');
1965 return filesystem::path{w};
1966 }
1967 }
1968 return filesystem::path{As<wstring> ()};
1969}
1970#endif
1971
1972/*
1973 ********************************************************************************
1974 ****************************** StringCombiner **********************************
1975 ********************************************************************************
1976 */
1977template <>
1978String StringCombiner<String>::operator() (const String& lhs, const String& rhs, bool isLast) const
1979{
1980 StringBuilder sb{lhs};
1981 if (isLast and fSpecialSeparatorForLastPair) [[unlikely]] {
1982 sb << *fSpecialSeparatorForLastPair;
1983 }
1984 else {
1985 sb << fSeparator;
1986 }
1987 sb << rhs;
1988 return sb;
1989}
1990
1991/*
1992 ********************************************************************************
1993 ******************* Iterable<Characters::String>::Join *************************
1994 ********************************************************************************
1995 */
namespace Stroika::Foundation::Traversal {
    // specialized as performance optimization
    //
    // Concatenates the elements of this iterable into one String: 'separator' is inserted between
    // consecutive elements, except that when 'finalSeparator' is provided it is used (instead of
    // 'separator') immediately before the last element.
    template <>
    Characters::String Iterable<Characters::String>::Join (const Characters::String& separator, const optional<Characters::String>& finalSeparator) const
    {
        using namespace Characters;
#if qStroika_Foundation_Debug_AssertionsChecked
        // Reference answer, compared against the StringBuilder result in the Ensure () below.
        // NOTE(review): the head of this initializer expression (what the StringCombiner is passed to)
        // is not visible in this view of the file - confirm against the real source.
        String referenceResult =
            Characters::StringCombiner<String>{.fSeparator = separator, .fSpecialSeparatorForLastPair = finalSeparator});
#endif
        StringBuilder sb;
        size_t        cnt = this->size (); // element count, needed to recognize the last element inside the lambda
        // idx is lambda-local mutable state counting the elements visited so far
        this->Apply ([&, idx = 0u] (const String& i) mutable {
            if (idx == 0) {
                sb = i; // first element: no separator in front of it
            }
            else {
                if (finalSeparator and idx + 1 == cnt) [[unlikely]] {
                    sb << *finalSeparator; // about to append the last element
                }
                else {
                    sb << separator;
                }
                sb << i;
            }
            ++idx;
        });
#if qStroika_Foundation_Debug_AssertionsChecked
        Ensure (sb == referenceResult);
#endif
        return sb;
    }
}
2030
2031/*
2032 ********************************************************************************
2033 ********************************** operator<< **********************************
2034 ********************************************************************************
2035 */
2036wostream& Characters::operator<< (wostream& out, const String& s)
2037{
2038 Memory::StackBuffer<wchar_t> maybeIgnoreBuf1;
2039 span<const wchar_t> sData = s.GetData (&maybeIgnoreBuf1);
2040 out.write (sData.data (), sData.size ());
2041 return out;
2042}
2043ostream& Characters::operator<< (ostream& out, const String& s)
2044{
2045 return out << s.AsNarrowSDKString (eIgnoreErrors);
2046}
2047
2048/*
2049 ********************************************************************************
2050 *********** hash<Stroika::Foundation::Characters::String> **********************
2051 ********************************************************************************
2052 */
2053size_t std::hash<String>::operator() (const String& arg) const
2054{
2055 using namespace Cryptography::Digest;
2056 using DIGESTER = Digester<Algorithm::SuperFastHash>; // pick arbitrarily which algorithm to use for now -- err on the side of quick and dirty
2057 static constexpr DIGESTER kDigester_{};
2058 // Note this could easily use char8_t, wchar_t, char32_t, or whatever. Choose char8_t on the theory that
2059 // this will most often avoid a copy, and making the most often case faster is probably a win. Also, even close, it
2060 // will have less 'empty space' and be more compact, so will digest faster.
2061 Memory::StackBuffer<char8_t> maybeIgnoreBuf1;
2062 span<const char8_t> s = arg.GetData (&maybeIgnoreBuf1);
2063 if (s.empty ()) {
2064 static const size_t kZeroDigest_ = kDigester_ (nullptr, nullptr);
2065 return kZeroDigest_;
2066 }
2067 else {
2068 return kDigester_ (as_bytes (s));
2069 }
2070}
2071
2072/*
2073 ********************************************************************************
2074 ******************** DataExchange::DefaultSerializer<String> *******************
2075 ********************************************************************************
2076 */
{
    //
    // Returns a Memory::BLOB built from the bytes of the char8_t data of 'arg' (as returned by GetData).
    //
    // Could have used char8_t, char16_t, or char32_t here quite plausibly. Chose char8_t for several reasons:
    //      >   Nearly always smallest representation (assuming most data is ascii)
    //      >   It is cross-platform/portable - not byte order dependent (NOT a promise going forward, so maybe
    //          not a good thing - but a thing)
    //      >   Since we expect most data reps to be ascii, this will involve the least copying, most likely, in
    //          the GetData call
    //
    Memory::StackBuffer<char8_t> maybeIgnoreBuf1;
    return Memory::BLOB{as_bytes (arg.GetData (&maybeIgnoreBuf1))};
}
#define AssertNotNull(p)
Definition Assertions.h:333
#define EnsureNotNull(p)
Definition Assertions.h:340
#define RequireMember(p, c)
Definition Assertions.h:326
#define RequireNotReached()
Definition Assertions.h:385
#define qStroika_Foundation_Debug_AssertionsChecked
The qStroika_Foundation_Debug_AssertionsChecked flag determines if assertions are checked and validat...
Definition Assertions.h:48
#define RequireNotNull(p)
Definition Assertions.h:347
#define RequireExpression(c)
Definition Assertions.h:267
#define AssertNotReached()
Definition Assertions.h:355
conditional_t< qStroika_Foundation_Memory_PreferBlockAllocation and andTrueCheck, BlockAllocationUseHelper< T >, Common::Empty > UseBlockAllocationIfAppropriate
Use this to enable block allocation for a particular class. Beware of subclassing.
bool Equals(const T *lhs, const T *rhs)
strcmp or wsccmp() as appropriate == 0
constexpr bool IsASCII() const noexcept
Return true iff the given character (or all in span) is (are) in the ascii range [0....
static constexpr void CheckASCII(span< const CHAR_T > s)
if not IsASCII (arg) throw RuntimeException...
nonvirtual Character ToLowerCase() const noexcept
nonvirtual ASCII GetAsciiCode() const noexcept
static constexpr strong_ordering Compare(span< const CHAR_T, E1 > lhs, span< const CHAR_T, E2 > rhs, CompareOptions co) noexcept
nonvirtual bool IsLowerCase() const noexcept
constexpr char32_t GetCharacterCode() const noexcept
Return the char32_t UNICODE code-point associated with this character.
nonvirtual Character ToUpperCase() const noexcept
constexpr bool IsWhitespace() const noexcept
nonvirtual bool IsUpperCase() const noexcept
RegularExpression is a compiled regular expression which can be used to match on a String class.
virtual Character GetAt(size_t index) const noexcept=0
Similar to String, but intended to more efficiently construct a String. Mutable type (String is large...
nonvirtual size_t size() const noexcept
nonvirtual void Append(span< const CHAR_T > s)
String is like std::u32string, except it is much easier to use, often much more space efficient,...
Definition String.h:201
nonvirtual size_t length() const noexcept
Definition String.inl:1051
nonvirtual String ToUpperCase() const
Definition String.cpp:1745
static String FromNarrowString(const char *from, const locale &l)
Definition String.inl:340
nonvirtual bool Matches(const RegularExpression &regEx) const
Definition String.cpp:1134
nonvirtual bool IsWhitespace() const
Definition String.cpp:1783
nonvirtual String NormalizeTextToNL() const
Definition String.cpp:1202
static String Join(const Iterable< String > &list, const String &separator=", "sv)
Definition String.cpp:1693
static String FromStringConstant(const CHAR_T(&cString)[SIZE])
Take the given argument data (constant span) - which must remain unchanged - constant - for the appli...
Definition String.inl:386
nonvirtual String NormalizeSpace(Character useSpaceCharacter=' ') const
Replace sequences of whitespace characters (space, tab, newline etc) with a single space (or argument...
Definition String.cpp:1230
nonvirtual Containers::Sequence< pair< size_t, size_t > > FindEach(const RegularExpression &regEx) const
Definition String.cpp:967
nonvirtual String Repeat(unsigned int count) const
Definition String.cpp:1425
nonvirtual String LimitLength(size_t maxLen, StringShorteningPreference keepPref=StringShorteningPreference::ePreferKeepLeft) const
return the first maxLen (or fewer if string shorter) characters of this string (adding ellipsis if tr...
Definition String.inl:745
nonvirtual String RemoveAll(Character c) const
Definition String.cpp:824
nonvirtual Containers::Sequence< RegularExpressionMatch > FindEachMatch(const RegularExpression &regEx) const
Definition String.cpp:985
nonvirtual String RemoveFirstIf(Character c) const
Definition String.cpp:808
nonvirtual string AsNarrowSDKString() const
Definition String.inl:834
nonvirtual optional< String > Col(size_t i) const
Useful to replace 'awk print $3' - replace with Col(2) - zero based.
Definition String.cpp:1363
nonvirtual String InsertAt(Character c, size_t at) const
Definition String.inl:719
nonvirtual string AsNarrowString(const locale &l) const
Definition String.cpp:1839
nonvirtual size_t size() const noexcept
Definition String.inl:534
nonvirtual bool EndsWith(const Character &c, CompareOptions co=eWithCase) const
Definition String.cpp:1089
nonvirtual String ToLowerCase() const
Definition String.cpp:1707
nonvirtual String ReplaceAll(const RegularExpression &regEx, const String &with) const
Definition String.cpp:1156
nonvirtual String Replace(size_t from, size_t to, const String &replacement) const
Definition String.cpp:1046
nonvirtual String SubString(SZ from) const
nonvirtual String Trim(bool(*shouldBeTrimmed)(Character)=Character::IsWhitespace) const
Definition String.cpp:1593
nonvirtual bool StartsWith(const Character &c, CompareOptions co=eWithCase) const
Definition String.cpp:1060
nonvirtual String StripAll(bool(*removeCharIf)(Character)) const
Definition String.cpp:1665
nonvirtual String AssureEndsWith(const Character &c, CompareOptions co=eWithCase) const
Return *this if it ends with argument character, or append 'c' so that it ends with a 'c'.
Definition String.cpp:1124
nonvirtual Containers::Sequence< String > AsLines() const
break the String into a series of lines;
Definition String.cpp:1307
nonvirtual String LTrim(bool(*shouldBeTrimmed)(Character)=Character::IsWhitespace) const
Definition String.cpp:1444
nonvirtual Containers::Sequence< String > Grep(const String &fgrepArg) const
Breaks this string into Lines, with AsLines (), and applies the argument filter (as if with ....
Definition String.cpp:1342
nonvirtual Containers::Sequence< String > FindEachString(const RegularExpression &regEx) const
Definition String.cpp:1002
nonvirtual optional< size_t > RFind(Character c) const noexcept
Definition String.cpp:1012
static span< const CHAR_TYPE > GetData(const PeekSpanData &pds, Memory::StackBuffer< CHAR_TYPE, STACK_BUFFER_SZ > *possiblyUsedBuffer)
return the constant character data inside the string (rep) in the form of a span, possibly quickly an...
Definition String.inl:967
nonvirtual Containers::Sequence< String > Tokenize() const
Definition String.cpp:1235
nonvirtual String RemoveAt(size_t charAt) const
Definition String.inl:608
nonvirtual String RTrim(bool(*shouldBeTrimmed)(Character)=Character::IsWhitespace) const
Definition String.cpp:1509
nonvirtual optional< size_t > Find(Character c, CompareOptions co=eWithCase) const
Definition String.inl:685
nonvirtual String substr(size_t from, size_t count=npos) const
Definition String.inl:1092
static const UTFConvert kThe
Nearly always use this default UTFConvert.
Definition UTFConvert.h:369
static constexpr bool AllFitsInTwoByteEncoding(span< const CHAR_T > s) noexcept
Sequence_stdvector<T> is an std::vector-based concrete implementation of the Sequence<T> container pa...
A generalization of a vector: a container whose elements are keyed by the natural numbers.
nonvirtual void push_back(ArgByValueType< value_type > item)
Definition Sequence.inl:436
nonvirtual void Append(ArgByValueType< value_type > item)
Definition Sequence.inl:330
Set<T> is a container of T, where once an item is added, additionally adds () do nothing.
Logically halfway between std::array and std::vector; Smart 'direct memory array' - which when needed...
nonvirtual size_t size() const noexcept
Iterable<T> is a base class for containers which easily produce an Iterator<T> to traverse them.
Definition Iterable.h:237
nonvirtual RESULT_T Join(const CONVERT_TO_RESULT &convertToResult=kDefaultToStringConverter<>, const COMBINER &combiner=Characters::kDefaultStringCombiner) const
ape the JavaScript/python 'join' function - take the parts of 'this' iterable and combine them into a...
nonvirtual size_t size() const
Returns the number of items contained.
Definition Iterable.inl:303
nonvirtual Iterator< Character > MakeIterator() const
Create an iterator object which can be used to traverse the 'Iterable'.
Definition Iterable.inl:297
An Iterator<T> is a copyable object which allows traversing the contents of some container....
Definition Iterator.h:225
concept - trivial shorthand for variadic same_as A or same_as B, or ...
Definition Concepts.h:195
char ASCII
Stroika's string/character classes treat 'char' as being an ASCII character.
Definition Character.h:59
wostream & operator<<(wostream &out, const String &s)
Definition String.cpp:2036
conditional_t<(sizeof(CHECK_T)<=2 *sizeof(void *)) and is_trivially_copyable_v< CHECK_T >, CHECK_T, const CHECK_T & > ArgByValueType
This is an alias for 'T' - but how we want to pass it on stack as formal parameter.
Definition TypeHints.h:32
SequencePolicy
equivalent which of 4 types being used std::execution::sequenced_policy, parallel_policy,...
void Throw(T &&e2Throw)
identical to builtin C++ 'throw' except that it does helpful, type dependent DbgTrace() messages firs...
Definition Throw.inl:43
Summary data for raw contents of rep - each rep will support at least one of these span forms.
Definition String.h:1280
StringCombiner is a simple function object used to combine two strings visually - used in Iterable<>:...
Definition String.h:1933