Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
UniformResourceIdentification.cpp
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include "Stroika/Foundation/StroikaPreComp.h"
5
9#include "Stroika/Foundation/Characters/String2Int.h"
11#include "Stroika/Foundation/Containers/Support/ReserveTweaks.h"
12#include "Stroika/Foundation/Execution/Exceptions.h"
13#include "Stroika/Foundation/Execution/Throw.h"
14
16
17// Comment this in to turn on aggressive noisy DbgTrace in this module
18//#define USE_NOISY_TRACE_IN_THIS_MODULE_ 1
19
20using namespace Stroika::Foundation;
23using namespace Stroika::Foundation::IO;
25using namespace Stroika::Foundation::IO::Network::UniformResourceIdentification;
26
27namespace {
28 inline uint8_t ConvertReadSingleHexDigit_ (char digit)
29 {
30 static const auto kException_ = Execution::RuntimeErrorException{"illegal hex digit"sv};
31 if (isupper (digit)) {
32 digit = static_cast<char> (tolower (digit));
33 }
34 if (isdigit (digit)) {
35 return static_cast<uint8_t> (digit - '0');
36 }
37 else if (islower (digit)) {
38 if (digit > 'f') {
39 Execution::Throw (kException_);
40 }
41 return static_cast<uint8_t> (10 + (digit - 'a'));
42 }
43 else {
44 Execution::Throw (kException_);
45 }
46 }
47}
48
49/*
50 ********************************************************************************
51 *************** UniformResourceIdentification::SchemeType **********************
52 ********************************************************************************
53 */
54SchemeType SchemeType::Normalize () const
55{
56 // replace all uppercase with lowercase - don't validate here
57 return ToLowerCase ();
58}
59
60void SchemeType::Validate () const
61{
62 // https://tools.ietf.org/html/rfc3986#appendix-A -- scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
63 for (Characters::Character c : *this) {
64 if (not c.IsASCII () or not(c.IsAlphabetic () or c.IsDigit () or c == '-' or c == '.' or c == '+')) [[unlikely]] {
65 static const auto kException_ = Execution::RuntimeErrorException{"bad character in URI scheme"sv};
66 Execution::Throw (kException_);
67 }
68 }
69}
70
// NOTE(review): the line carrying this function's signature (listing line 71) is absent from this extract;
// only the body is visible. The body reports whether the scheme is one of a fixed list of schemes.
72{
// Normalize () lowercases the scheme, so the comparisons below are effectively case-insensitive
73 SchemeType ns = Normalize ();
// true only for https, ftps, ldaps and ssh; every other scheme yields false
74 return ns == "https"sv or ns == "ftps"sv or ns == "ldaps"sv or ns == "ssh"sv;
75}
76
77optional<PortType> SchemeType::GetDefaultPort () const
78{
79 // From http://www.iana.org/assignments/port-numbers
80 static const Mapping<String, PortType> kPredefined_{String::EqualsComparer{eCaseInsensitive},
81 {
82 {"http"sv, static_cast<PortType> (80)},
83 {"https"sv, static_cast<PortType> (443)},
84 {"ldap"sv, static_cast<PortType> (389)},
85 {"ldaps"sv, static_cast<PortType> (636)},
86 {"ftp"sv, static_cast<PortType> (21)},
87 {"ftps"sv, static_cast<PortType> (990)},
88 }};
89 return kPredefined_.Lookup (*this);
90}
91
92strong_ordering SchemeType::TWC_ (const SchemeType& lhs, const SchemeType& rhs)
93{
94 using namespace Characters;
95 return String::ThreeWayComparer{eCaseInsensitive}(lhs, rhs);
96}
97
98/*
99 ********************************************************************************
100 ************************************ Host **************************************
101 ********************************************************************************
102 */
103pair<optional<String>, optional<InternetAddress>> Host::ParseRaw_ (const String& raw)
104{
105 Require (not raw.empty ());
106
107 // See https://tools.ietf.org/html/rfc3986#section-3.2.2 for details of this algorithm
108 if (raw[0].IsDigit ()) {
109 // must be ipv4 address
110 return pair<optional<String>, optional<InternetAddress>>{nullopt, InternetAddress{raw, InternetAddress::AddressFamily::V4}};
111 }
112 else if (raw[0] == '[') {
113 // must be ipv6 address
114 // must be surrounded with []
115 if (raw.Last () != ']') {
116 static const auto kException_ = Execution::RuntimeErrorException{"IPV6 hostname in URL must be surrounded with []"sv};
117 Execution::Throw (kException_);
118 }
119 return pair<optional<String>, optional<InternetAddress>>{nullopt, InternetAddress{raw.SubString (1, -1), InternetAddress::AddressFamily::V6}};
120 }
121 else {
122 return pair<optional<String>, optional<InternetAddress>>{PCTDecode2String (raw.AsUTF8<string> ()), nullopt};
123 }
124}
125
// NOTE(review): the line carrying this function's signature (listing line 126) is absent from this extract;
// only the body is visible. The body builds a Host from whichever of the two representations is present.
127{
// a registered name is rebuilt from its lowercased form
128 if (fRegisteredName_) {
129 return Host{fRegisteredName_->ToLowerCase ()};
130 }
// no registered name: the address is asserted to be present and is rebuilt unchanged
131 Assert (fInternetAddress_);
132 return Host{*fInternetAddress_};
133}
134
135String Host::EncodeAsRawURL_ (const String& registeredName)
136{
137 // https://tools.ietf.org/html/rfc3986#appendix-A
138 //reg-name = *( unreserved / pct-encoded / sub-delims )
139 static constexpr UniformResourceIdentification::PCTEncodeOptions kHostEncodeOptions_{true};
140 return UniformResourceIdentification::PCTEncode2String (registeredName, kHostEncodeOptions_);
141}
142
// Render an InternetAddress in the form used inside a URL's host component.
// V4 addresses are emitted as-is, V6 addresses are wrapped in '[' ... ']', and any other address family
// falls back to the plain string form.
// NOTE(review): listing line 154 (first statement of the 'default' case) is missing from this extract, so the
// default branch shown below may be incomplete.
143String Host::EncodeAsRawURL_ (const InternetAddress& ipAddr)
144{
145 // See https://tools.ietf.org/html/rfc3986#section-3.2.2 for details of this algorithm
146 switch (ipAddr.GetAddressFamily ()) {
147 case InternetAddress::AddressFamily::V4: {
148 return ipAddr.As<String> ();
149 } break;
150 case InternetAddress::AddressFamily::V6: {
// IPv6 text is bracketed here so it appears as an IP-literal in the URL
151 return "["sv + ipAddr.As<String> () + "]"sv;
152 } break;
153 default: {
155 // Probably need to use the V??? format - but this maybe the best we can do for now...
156 return ipAddr.As<String> ();
157 } break;
158 }
159}
160
// NOTE(review): the line carrying this function's signature (listing line 161) is absent from this extract.
// The body formats the decoded (not percent-encoded) string form via Characters::ToString.
162{
163 return Characters::ToString (As<String> (StringPCTEncodedFlag::eDecoded));
164}
165
166/*
167 ********************************************************************************
168 ************************************ UserInfo **********************************
169 ********************************************************************************
170 */
171String UserInfo::ParseRaw_ (const String& raw)
172{
173 Require (not raw.empty ());
174 // See https://tools.ietf.org/html/rfc3986#section-3.2.1 for details of this algorithm
175 return PCTDecode2String (raw);
176}
177
178String UserInfo::EncodeAsRawURL_ (const String& decodedName)
179{
180 Require (not decodedName.empty ());
181 // https://tools.ietf.org/html/rfc3986#appendix-A
182 //userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
183 static constexpr UniformResourceIdentification::PCTEncodeOptions kUserInfoEncodeOptions_{true};
184 return UniformResourceIdentification::PCTEncode2String (decodedName, kUserInfoEncodeOptions_);
185}
186
// NOTE(review): the line carrying this function's signature (listing line 187) is absent from this extract.
// The body formats the decoded (not percent-encoded) string form via Characters::ToString.
188{
189 return Characters::ToString (As<String> (eDecoded));
190}
191
192/*
193 ********************************************************************************
194 ********************************* Authority ************************************
195 ********************************************************************************
196 */
197optional<Authority> Authority::Parse (const String& rawURLAuthorityText)
198{
199 if (rawURLAuthorityText.empty ()) {
200 return nullopt;
201 }
202 optional<UserInfo> userInfo;
203 // From https://tools.ietf.org/html/rfc3986#appendix-A
204 // authority = [ userinfo "@" ] host [ ":" port ]
205 // host = IP-literal / IPv4address / reg-name
206 // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
207 // IPv6address ...
208 // IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
209 // IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
210 // reg-name = *( unreserved / pct-encoded / sub-delims )
211 String remainingString2Parse = rawURLAuthorityText;
212 {
213 if (auto oat = remainingString2Parse.Find ('@')) {
214 optional<String> encodedUserInfo = remainingString2Parse.SubString (0, *oat);
215 if (encodedUserInfo) {
216 userInfo = UserInfo::Parse (*encodedUserInfo);
217 }
218 remainingString2Parse = remainingString2Parse.SubString (*oat + 1);
219 }
220 }
221 optional<Host> host;
222 {
223 String hostString;
224 // here we are looking for characters that are [] (IP-literal) or IPv4address or reg-name characters
225 // There are no 'colons' in reg-name, nor IPv4Address. The only possible : in hostname is inside an IP-Literal
226 // so check for []
227 if (remainingString2Parse.size () >= 2 and remainingString2Parse[0] == '[') {
228 auto closeBracket = remainingString2Parse.Find (']'); // a close bracket cannot be in a legal IP-literal except at the end
229 if (closeBracket) {
230 hostString = remainingString2Parse.SubString (0, *closeBracket + 1);
231 remainingString2Parse = remainingString2Parse.SubString (*closeBracket + 1);
232 }
233 else {
234 ;
235 static const Execution::RuntimeErrorException kException_{"no closing bracket in host part of authority of URI"sv};
236 Execution::Throw (kException_);
237 }
238 }
239 else {
240 // since not IP-literal, any colons would be introducing a port#
241 if (auto oPortColon = remainingString2Parse.Find (':')) {
242 hostString = remainingString2Parse.SubString (0, *oPortColon);
243 remainingString2Parse = remainingString2Parse.SubString (*oPortColon);
244 }
245 else {
246 hostString = remainingString2Parse;
247 remainingString2Parse = String{};
248 }
249 }
250 host = hostString.empty () ? optional<Host>{} : Host::Parse (hostString);
251 }
252 optional<uint16_t> port;
253 if (auto oPortColon = remainingString2Parse.Find (':')) {
254 port = Characters::String2Int<uint16_t> (remainingString2Parse.SubString (*oPortColon + 1));
255 }
256 return Authority{host, port, userInfo};
257}
258
// NOTE(review): the line carrying this function's signature (listing line 259) is absent from this extract.
// The body rebuilds an Authority with the host normalized (when a host is present); port and userinfo are
// passed through unchanged.
260{
261 return Authority{fHost_ ? fHost_->Normalize () : optional<Host>{}, fPort_, fUserInfo_};
262}
263
264template <>
265String Authority::As (optional<StringPCTEncodedFlag> pctEncode) const
266{
267 StringBuilder sb;
268 if (fUserInfo_) {
269 sb << fUserInfo_->As<String> (pctEncode) << "@"sv;
270 }
271 if (fHost_) {
272 sb << fHost_->As<String> (pctEncode);
273 }
274 if (fPort_) {
275 sb << ":"sv << static_cast<unsigned int> (*fPort_);
276 }
277 return sb;
278}
279
// NOTE(review): the line carrying this function's signature (listing line 280) is absent from this extract.
// The body formats the default As<String> () form via Characters::ToString.
281{
282 return Characters::ToString (As<String> ());
283}
284
285/*
286 ********************************************************************************
287 *********************************** Query **************************************
288 ********************************************************************************
289 */
290namespace {
291 // According to http://tools.ietf.org/html/rfc3986 - URLs need to be treated as UTF-8 before
292 // doing % etc substitution
293 // Note - not quite the same as PCTDecode (because of + expansion), and because of looking for = and building a map (and cuz = can be pct encoded)
294 void InitURLQueryDecoder_ (Mapping<String, String>* m, const u8string& utf8Query)
295 {
296 size_t utfqLen = utf8Query.length ();
297 for (size_t i = 0; i < utfqLen;) {
298 size_t e = utf8Query.find ('&', i);
299 u8string elt = utf8Query.substr (i, e - i);
300 size_t brk = elt.find ('=');
301 if (brk != string::npos) {
302 u8string val = elt.substr (brk + 1);
303 for (auto p = val.begin (); p != val.end (); ++p) {
304 switch (*p) {
305 case '+':
306 *p = ' ';
307 break;
308 case '%': {
309 if (p + 2 < val.end ()) {
310 unsigned char newC = (ConvertReadSingleHexDigit_ (*(p + 1)) << 4) + ConvertReadSingleHexDigit_ (*(p + 2));
311 p = val.erase (p, p + 2);
312 *p = static_cast<char> (newC);
313 }
314 break;
315 }
316 }
317 }
318 m->Add (String::FromUTF8 (elt.substr (0, brk)), String::FromUTF8 (val));
319 }
320 if (e == String::npos) {
321 break;
322 }
323 i = e + 1;
324 }
325 }
326}
327Query::Query (const String& query)
328{
329 InitURLQueryDecoder_ (&fMap_, query.AsASCII<u8string> ());
330}
331
332Query::Query (const u8string& query)
333{
334 InitURLQueryDecoder_ (&fMap_, query);
335}
336
337void Query::RemoveFieldIfAny (const String& idx)
338{
339 fMap_.Remove (idx);
340}
341
342String Query::ComputeQueryString () const
343{
344 u8string result;
345 for (auto i = fMap_.begin (); i != fMap_.end (); ++i) {
346 Containers::Support::ReserveTweaks::Reserve4Add1 (result);
347 if (not result.empty ()) {
348 result += u8"&";
349 }
350 //careful - need to encode first/second
351 result += EncodeURLQueryStringField (i->fKey) + u8"=" + EncodeURLQueryStringField (i->fValue);
352 }
353 return String{result};
354}
355
// NOTE(review): the line carrying this function's signature (listing line 356) is absent from this extract.
// The body formats the name/value mapping itself rather than the encoded query string.
357{
358 // could use ComputeQueryString, Characters::ToString (ComputeQueryString), or Characters::ToString (fMap_)
359 // Chose this representation because it shows most characters 'decoded' (%xy)
360 return Characters::ToString (fMap_);
361}
362
363strong_ordering Query::TWC_ (const Query& lhs, const Query& rhs)
364{
365 // Nothing in https://tools.ietf.org/html/rfc3986#section-3.4 appears to indicate case insensative so treat as case sensitive
366
367 // comparing for equals makes full sense. But comparing < really doesn't, because there is no obvious preferred order for query strings
368 // So pick a preferred ordering (alphabetical) - and compare one after the other
369 for (String i : (Set<String>{lhs.GetMap ().Keys ()} + Set<String>{rhs.GetMap ().Keys ()}).OrderBy (less<String>{})) {
370 optional<String> lhsVal = lhs.GetMap ().Lookup (i);
371 optional<String> rhsVal = rhs.GetMap ().Lookup (i);
372 strong_ordering cmp = Common::StdCompat::compare_three_way{}(lhsVal, rhsVal);
373 if (cmp != strong_ordering::equal) {
374 return cmp;
375 }
376 }
377 return strong_ordering::equal;
378}
379
380/*
381 ********************************************************************************
382 ********** UniformResourceIdentification::EncodeURLQueryStringField ************
383 ********************************************************************************
384 */
385u8string UniformResourceIdentification::EncodeURLQueryStringField (const String& s)
386{
387 //
388 // According to http://tools.ietf.org/html/rfc3986 - URLs need to be treated as UTF-8 before
389 // doing % etc substitution
390 //
391 // From http://tools.ietf.org/html/rfc3986#section-2.3
392 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
393 u8string utf8Query = s.AsUTF8 ();
394 u8string result;
395 size_t sLength = utf8Query.length ();
396 result.reserve (sLength);
397 for (size_t i = 0; i < sLength; ++i) {
398 Containers::Support::ReserveTweaks::Reserve4Add1 (result);
399 switch (utf8Query[i]) {
400 case ' ':
401 result += u8"+"sv;
402 break;
403 default: {
404 char8_t ccode = utf8Query[i];
405 if (isascii (ccode) and (isalnum (ccode) or (ccode == '-') or (ccode == '.') or (ccode == '_') or (ccode == '~'))) {
406 result += static_cast<char> (utf8Query[i]);
407 }
408 else {
409 result += CString::Format (u8"%%%.2x", ccode);
410 }
411 }
412 }
413 }
414 return result;
415}
416
417/*
418 ********************************************************************************
419 ****************** UniformResourceIdentification::PCTEncode ********************
420 ********************************************************************************
421 */
422u8string UniformResourceIdentification::PCTEncode (const u8string& s, const PCTEncodeOptions& options)
423{
424 u8string result;
425 size_t sLength = s.length ();
426 result.reserve (sLength);
427
428 PCTEncodeOptions useOptions = options;
429 if (useOptions.allowFragOrQueryChars) {
430 useOptions.allowPChar = true;
431 }
432 if (useOptions.allowPChar) {
433 useOptions.allowSubDelims = true;
434 }
435 if (useOptions.allowPathCharacters) {
436 useOptions.allowSubDelims = true;
437 }
438
439 for (char c : s) {
440 bool encode{true};
441
442 // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
443 if (isalpha (c) or isdigit (c)) {
444 encode = false;
445 }
446 switch (c) {
447 case '-':
448 case '.':
449 case '_':
450 case '~':
451 encode = false;
452 }
453
454 if (useOptions.allowFragOrQueryChars) {
455 switch (c) {
456 case '/':
457 case '?':
458 encode = false;
459 }
460 }
461 if (useOptions.allowPChar) {
462 switch (c) {
463 case ':':
464 case '@':
465 encode = false;
466 }
467 }
468 if (useOptions.allowPathCharacters) {
469 switch (c) {
470 case '/':
471 encode = false;
472 }
473 }
474 if (useOptions.allowGenDelims) {
475 switch (c) {
476 case ':':
477 case '/':
478 case '?':
479 case '[':
480 case ']':
481 case '@':
482 encode = false;
483 }
484 }
485 if (useOptions.allowSubDelims) {
486 switch (c) {
487 case '!':
488 case '$':
489 case '&':
490 case '\'':
491 case '(':
492 case ')':
493 case '*':
494 case '+':
495 case ',':
496 case ';':
497 case '=':
498 encode = false;
499 }
500 }
501 if (encode) {
502 Containers::Support::ReserveTweaks::Reserve4AddN (result, 3);
503 result += CString::Format (u8"%%%.2x", c);
504 }
505 else {
506 Containers::Support::ReserveTweaks::Reserve4Add1 (result);
507 result += c;
508 }
509 }
510 return result;
511}
512
513u8string UniformResourceIdentification::PCTEncode (const String& s, const PCTEncodeOptions& options)
514{
515 return PCTEncode (s.AsUTF8<u8string> (), options);
516}
517
518String UniformResourceIdentification::PCTEncode2String (const String& s, const PCTEncodeOptions& options)
519{
520 return String::FromUTF8 (PCTEncode (s, options));
521}
522
523/*
524 ********************************************************************************
525 ************** UniformResourceIdentification::PCTDecode ************************
526 ********************************************************************************
527 */
528u8string UniformResourceIdentification::PCTDecode (const u8string& s)
529{
530 u8string result;
531 result.reserve (s.length ());
532 for (auto p = s.begin (); p != s.end (); ++p) {
533 switch (*p) {
534 case '%': {
535 if (p + 2 < s.end ()) {
536 unsigned char newC = (ConvertReadSingleHexDigit_ (*(p + 1)) << 4) + ConvertReadSingleHexDigit_ (*(p + 2));
537 p += 2;
538 result += (newC);
539 }
540 else {
541 static const auto kException_ = Execution::RuntimeErrorException{"incomplete % encoded character in URI"sv};
542 Execution::Throw (kException_);
543 }
544 } break;
545 default: {
546 result += *p;
547 } break;
548 }
549 }
550 return result;
551}
552
553/*
554 ********************************************************************************
555 ************** UniformResourceIdentification::PCTDecode2String *****************
556 ********************************************************************************
557 */
558String UniformResourceIdentification::PCTDecode2String (const u8string& s)
559{
560 return String::FromUTF8 (PCTDecode (s));
561}
562
563String UniformResourceIdentification::PCTDecode2String (const String& s)
564{
565 return String::FromUTF8 (PCTDecode (s.AsASCII<u8string> ()));
566}
567
568/*
569 ********************************************************************************
570 * hash<Stroika::Foundation::IO::Network::UniformResourceIdentification::Host> **
571 ********************************************************************************
572 */
// Hash of a Host: the hash of its decoded (not percent-encoded) String form.
// NOTE(review): the line carrying the parameter list (listing line 574) is absent from this extract;
// 'arg' below is presumably the Host being hashed - confirm against the original source.
573size_t std::hash<Stroika::Foundation::IO::Network::UniformResourceIdentification::Host>::operator() (
575{
576 return hash<Characters::String>{}(arg.As<Characters::String> (UniformResourceIdentification::Host::eDecoded));
577}
#define WeakAssertNotImplemented()
Definition Assertions.h:483
Similar to String, but intended to more efficiently construct a String. Mutable type (String is large...
String is like std::u32string, except it is much easier to use, often much more space efficient,...
Definition String.h:201
nonvirtual size_t size() const noexcept
Definition String.inl:534
nonvirtual String ToLowerCase() const
Definition String.cpp:1706
static constexpr size_t npos
Definition String.h:1390
nonvirtual String SubString(SZ from) const
static String FromUTF8(span< CHAR_T > from)
Definition String.inl:420
nonvirtual optional< size_t > Find(Character c, CompareOptions co=eWithCase) const
Definition String.inl:681
nonvirtual bool Add(ArgByValueType< key_type > key, ArgByValueType< mapped_type > newElt, AddReplaceMode addReplaceMode=AddReplaceMode::eAddReplaces)
Definition Mapping.inl:190
Set<T> is a container of T, where once an item is added, additional Add () calls for that item do nothing.
Definition Set.h:105
nonvirtual constexpr AddressFamily GetAddressFamily() const
Authority is roughly the part of a URL where you say the hostname (and portnumber etc) - part just af...
nonvirtual T As(optional< StringPCTEncodedFlag > pctEncode={}) const
static optional< Authority > Parse(const String &rawURLAuthorityText)
nonvirtual RESULT_TYPE As(optional< StringPCTEncodedFlag > pctEncode={}) const
Returns the hostname, either encoded or decoded (PCT encoding) as some form of printed derivative str...
nonvirtual optional< T > Last() const
return last element in iterable, or if 'that' specified, last where 'that' is true,...
Definition Iterable.inl:888
String ToString(T &&t, ARGS... args)
Return a debug-friendly, display version of the argument: not guaranteed parsable or usable except fo...
Definition ToString.inl:465
void Throw(T &&e2Throw)
identical to builtin C++ 'throw' except that it does helpful, type dependent DbgTrace() messages firs...
Definition Throw.inl:43