Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
URI.h
Go to the documentation of this file.
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#ifndef _Stroika_Foundation_IO_Network_URI_h_
5#define _Stroika_Foundation_IO_Network_URI_h_ 1
6
7#include "Stroika/Foundation/StroikaPreComp.h"
8
9#include <compare>
10#include <string>
11
13#include "Stroika/Foundation/Common/Common.h"
15#include "Stroika/Foundation/Containers/Mapping.h"
19
20/**
21 * \file
22 *
23 * \note Code-Status: <a href="Code-Status.md#Beta">Beta</a>
24 */
25
27
28 using Characters::String;
29
30 /**
31 * \par RFC Reference
32 * \note https://tools.ietf.org/html/rfc3986 - Uniform Resource Identifier (URI): Generic Syntax - 2005
33 * Updates: 1738; Obsoletes 2732, 2396, 1808
34 * (so combines relative and absolute)
35 *
36 * A URI is EITHER a URL, or a relative reference to a URL. This definition is VERY MUCH LESS THAN CLEAR,
37 * but the closest I can infer from:
38 * https://tools.ietf.org/html/rfc3986
39 *
40 * A URI can be further classified as a locator, a name, or both. The
41 * term "Uniform Resource Locator" (URL) refers to the subset of URIs
42 * that, in addition to identifying a resource, provide a means of
43 * locating the resource by describing its primary access mechanism
44 * (e.g., its network "location").
45 *
46 * From:
47 * https://tools.ietf.org/html/rfc3986#section-3
48 *
49 * The generic URI syntax consists of a hierarchical sequence of
50 * components referred to as the scheme, authority, path, query, and
51 * fragment
52 * ...
53 * The following are two example URIs and their component parts:
54 *
55 * foo://example.com:8042/over/there?name=ferret#nose
56 * \_/ \______________/\_________/ \_________/ \__/
57 * | | | | |
58 * scheme authority path query fragment
59 * | _____________________|__
60 * / \ / \
61 * urn:example:animal:ferret:nose
62 *
63 * \note This code does not currently (as of v2.1d23) address https://tools.ietf.org/html/rfc3986#appendix-C - URI delimiting (finding the boundaries of the URI from
64 * surrounding text).
65 *
66 * \note One subtlety with the URI syntax is that:
67 * https://tools.ietf.org/html/rfc3986#section-3.3
68 * If a URI contains an authority component, then the path component
69 * must either be empty or begin with a slash ("/") character
70 * If you look at the syntax/BNF, this makes sense. But logically, it makes no sense. The specification of an authority
71 * then PROHIBITS the specification of a relative path.
72 *
73 * But - I don't get to write the specs ;-).
74 *
75 * So this class triggers throws if you ever attempt to specify a non-empty path that doesn't start with /
76 * on a URI that has an authority.
77 *
78 * This poses some difficulties for code that wants to update BOTH the authority and the path of a URI (which do you do first - tricky).
79 * But it's easy enough to avoid by re-constructing the URI from scratch using the URI (individual components) constructor.
80 *
81 * \note <a href="Design-Overview.md#Comparisons">Comparisons</a>:
82 * o static_assert (totally_ordered<URI>);
83 */
84 class [[nodiscard]] URI {
85 public:
87
88 public:
90
91 public:
93
94 public:
96
97 public:
99 static constexpr auto eDecoded = StringPCTEncodedFlag::eDecoded;
100 static constexpr auto ePCTEncoded = StringPCTEncodedFlag::ePCTEncoded;
101
102 public:
103 /**
104 * This checks and throws if arguments are invalid or out of range (e.g. a scheme with a colon in it will trigger a throw).
105 *
106 * These will raise exceptions if anything illegal in the URL specification.
107 *
108 * Constructor from String(or string) and no other arguments, is equivalent to calling URI::Parse ()
109 *
110 * Copy and Move constructors/assignment operators are noexcept because the underlying objects are, but
111 * the default constructor is NOT noexcept because the instance always allocates at least a string (could be remedied, but not as of now).
112 *
113 * \note URI/0 (the default constructor) is treated as having an empty path and nullopt for authority, query, scheme, fragment.
114 *
115 * \todo http://stroika-bugs.sophists.com/browse/STK-750
116 * noexcept - unclear why I cannot declare copy constructor and copy assignment operators as noexcept
117 * on GCC. THIS compiles fine, but then later bits of code that use it fail to compile (g++ 9 at least).
118 */
119 URI () = default;
120 URI (const optional<SchemeType>& scheme, const optional<Authority>& authority, const String& path = String{},
121 const optional<String>& query = nullopt, const optional<String>& fragment = nullopt);
122 //URI (const string& encodedURI);
123 template <Characters::IConvertibleToString STRISH_TYPE>
124 URI (STRISH_TYPE&& encodedURI);
125 URI (const URI&) = default;
126 // clang-format off
127 URI (URI&&) noexcept = default;
128 // clang-format on
129
130 public:
131 nonvirtual URI& operator= (const URI&) = default;
132 nonvirtual URI& operator= (URI&&) noexcept = default;
133
134 public:
135 /**
136 * This takes argument string url, with possibly % encoded characters, according to https://tools.ietf.org/html/rfc3986
137 * The input character set is always ASCII (but may encode UCS after %PCT substitutions).
138 * If not handed ASCII text, an exception will be thrown.
139 */
140 static URI Parse (const String& rawURL);
141
142 public:
143 /**
144 * Same as URI::Parse () - except that it starts looking at the Path part - and will not interpret the leading bits as part of a scheme or authority
145 */
146 static URI ParseRelative (const String& rawRelativeURL);
147
148 public:
149 /**
150 * This returns true if this is a relative URI (either network relative or host-relative), and false if
151 * it contains a scheme.
152 *
153 * From https://tools.ietf.org/html/rfc3986#section-4.1
154 *
155 * URI-reference = URI / relative-ref
156 *
157 * A URI-reference is either a URI or a relative reference. If the
158 * URI-reference's prefix does not match the syntax of a scheme followed
159 * by its colon separator, then the URI-reference is a relative
160 * reference.
161 * ...
162 * A relative reference that begins with two slash characters is termed
163 * a network-path reference; such references are rarely used. A
164 * relative reference that begins with a single slash character is
165 * termed an absolute-path reference. A relative reference that does
166 * not begin with a slash character is termed a relative-path reference
167 *
168 * \note Even if something is NOT a relative reference, it may not have a host/authority:
169 * e.g.
170 * mailto:John.Doe@example.com
171 */
172 nonvirtual bool IsRelativeReference () const;
173
174 public:
175 /**
176 * Always returns a valid (or empty) protocol/URL scheme - according to http://www.ietf.org/rfc/rfc1738.txt
177 */
178 nonvirtual optional<SchemeType> GetScheme () const;
179
180 public:
181 /**
182 * \par Example Usage
183 * \code
184 * URI u;
185 * u.SetScheme (URI::SchemeType{"http"});
186 * \endcode
187 */
188 nonvirtual void SetScheme (const optional<SchemeType>& scheme);
189 nonvirtual void SetScheme (const SchemeType& scheme);
190
191 public:
192 /**
193 * The authority of a URI is basically the hostname (+ optional port and user info)
194 */
195 nonvirtual optional<Authority> GetAuthority () const;
196
197 public:
198 /**
    * Replace the authority component of this URI with the argument value.
    *
    * \note Per the class-level notes, a URI that has an authority must have a path that is either empty or begins with '/'.
    *       NOTE(review): presumably this call throws if the current path violates that rule, and nullopt clears the
    *       authority - confirm both against the implementation (not visible in this header).
199 */
200 nonvirtual void SetAuthority (const optional<Authority>& authority);
201
202 public:
203 /**
204 * Get the best guess possible for the port#, based on the given port, and given the scheme. Will return a bogus
205 * port number (like 80) if not enough information given.
206 */
207 nonvirtual PortType GetPortValue () const;
208
209 public:
210 /**
211 * The path MAY or MAY NOT start with a /, and it may be empty.
212 *
213 * \note - the path is already decoded (% decoding and character set decoded)
214 */
215 nonvirtual String GetPath () const;
216
217 public:
218 /**
219 * \note - the path is a UNICODE string, and should not be url-encoded.
220 */
221 nonvirtual void SetPath (const String& path);
222
223 public:
224 /**
225 * Return just the scheme and authority part of the URI (as a URI). This is useful for HTTP for example,
226 * as it was what is needed to define/create the connection.
227 */
228 nonvirtual URI GetSchemeAndAuthority () const;
229
230 public:
231 /**
232 * \brief Return the (PCT etc encoded if a string) data AFTER the authority, but not including the fragment
233 *
234 * @aliases GetHostRelativePathPlusQuery
235 *
236 * This returns the path + the query (omitting authority, scheme, and fragment).
237 *
238 * RETURN_TYPE may be:
239 * o String (default)
240 * o string (because the returned text is all ASCII, since it is ENCODED)
241 * o URI (in which case it just copies the path, and query elements)
242 */
243 template <Common::IAnyOf<String, string, URI> RETURN_TYPE = String>
244 nonvirtual RETURN_TYPE GetAuthorityRelativeResource () const;
245
246 public:
247 /**
248 * \brief Return the path component, excluding any text after the final /.
249 *
250 * @aliases GetHostRelPathDir
251 *
252 * This value may be a full UNICODE String, and is NOT PCT encoded.
253 */
254 nonvirtual String GetAuthorityRelativeResourceDir () const;
255
256 public:
257 /**
258 * \brief Return the GetPath () value, but ensuring it's an absolute path.
259 *
260 * Return type may be:
261 * String
262 * optional<String>
263 *
264 * If return type is optional<String>, it will return nullopt when the path is NOT an absolute path.
265 * If return type is String, it will THROW when the path is not an absolute path.
266 *
267 * In either case, the special case of GetPath ().empty () will be treated as '/'.
268 * So in either case, if a string is returned, its length is always >= 1.
269 */
270 template <Common::IAnyOf<String, optional<String>> RETURN_VALUE = String>
271 nonvirtual RETURN_VALUE GetAbsPath () const;
272
273 public:
274 /**
275 * Return the query part of the URI as the given RETURN_TYPE. Note that this value may be missing.
276 *
277 * Supported RETURN_TYPE values are:
278 * o String
279 * o Query (a parsed query string - much akin to a Mapping<String,String>)
280 *
281 */
282 template <Common::IAnyOf<String, URI::Query> RETURN_TYPE = Query>
283 nonvirtual optional<RETURN_TYPE> GetQuery () const;
284
285 public:
286 /**
    * Replace the query component of this URI. Overloads accept either an optional String or an optional
    * (parsed) Query object.
    *
    * NOTE(review): whether the String overload expects PCT-encoded text, and whether nullopt clears the query,
    * is not visible in this header - confirm against the implementation.
287 */
288 nonvirtual void SetQuery (const optional<String>& query);
289 nonvirtual void SetQuery (const optional<Query>& query);
290
291 public:
292 /**
293 * \brief shortcut for url.GetQuery<Query> ()? url.GetQuery<Query> ()->Lookup (arg): nullopt;
294 */
295 nonvirtual optional<String> LookupQueryArg (const String& arg) const;
296
297 public:
298 /**
    * Return the fragment component of this URI (the part shown after '#' in the RFC 3986 example in the
    * class documentation), as an optional String.
299 */
300 nonvirtual optional<String> GetFragment () const;
301
302 public:
303 /**
    * Replace the fragment component of this URI (the part shown after '#' in the RFC 3986 example in the
    * class documentation) with the argument value.
    *
    * NOTE(review): presumably nullopt clears the fragment - confirm against the implementation.
304 */
305 nonvirtual void SetFragment (const optional<String>& fragment);
306
307 public:
308 /**
309 * RFC 3986 lists a few specific normalizations to perform.
310 * https://en.wikipedia.org/wiki/URI_normalization lists a few more common ones, we also perform.
311 */
313 eRFC3986,
314
315 /**
316 * This adds (from https://en.wikipedia.org/wiki/URI_normalization):
317 * o Removing duplicate slashes
318 */
319 eAggressive,
320
321 eDefault = eRFC3986,
322
323 Stroika_Define_Enum_Bounds (eRFC3986, eAggressive)
324 };
325
326 public:
327 /**
328 * \brief Produce a normalized representation of the URI.
329 *
330 * Since constructing the URI object already does a lot of this, some parts are not needed (like
331 * Percent-Encoding Normalization, and character set conversion). But other parts are still useful/impactful (like tolower).
332 *
333 * @see https://tools.ietf.org/html/rfc3986#section-6
334 */
335 nonvirtual URI Normalize (NormalizationStyle normalization = NormalizationStyle::eDefault) const;
336
337 public:
338 /**
339 * Supported conversion-targets (T):
340 * String - converts to the raw URI format (as it would appear in a web-browser or html link); note raw form is ASCII
341 * string - ditto
342 *
343 * if T==String, pctEncoded defaults to eDecoded
344 * if T==string, pctEncoded defaults to ePCTEncoded
345 */
346 template <Common::IAnyOf<String, string> T>
347 nonvirtual T As (optional<StringPCTEncodedFlag> pctEncoded = {}) const;
348
349 private:
350 nonvirtual String AsString_ (optional<StringPCTEncodedFlag> pctEncoded) const;
351
352 public:
353 /**
354 * Returns true iff one or more sub-elements have a value
355 */
356 nonvirtual explicit operator bool () const;
357
358 public:
359 /**
360 * \brief Combine **overridingURI** possibly relative url with this base url, to produce a new URI
361 *
362 * Combine a full URI with a (possibly) relative URI, to produce a new URI. Note - it's completely legal for the argument uri
363 * to be a full url, in which case this returns its argument (taking no properties from 'this')
364 *
365 * @see https://tools.ietf.org/html/rfc3986#section-5.2
366 *
367 * Note - one special case - if the source URI is empty (if !u) - then just return the argument URI as the result (avoiding
368 * issues with having to always check if the source URI has a scheme). NOTE - if you pass in a BAD src URI as 'this' - one without
369 * a scheme for example - you still get an exception thrown.
370 *
371 * \note *this is the 'baseURL' which the argument 'overrides' bits of (or all of).
372 */
373 nonvirtual URI Combine (const URI& overridingURI) const;
374
375 public:
376 /**
    * Three-way compare this URI against rhs, yielding a strong_ordering (URI satisfies totally_ordered - see
    * the static_assert following the class).
    *
    * NOTE(review): which components are compared, in what order, and whether they are normalized first is not
    * visible in this header - confirm against the implementation.
377 */
378 nonvirtual strong_ordering operator<=> (const URI& rhs) const;
379
380 public:
381 /**
    * Equality comparison of this URI against rhs.
    *
    * NOTE(review): presumably consistent with operator<=> (same comparison key) - confirm against the implementation.
382 */
383 nonvirtual bool operator== (const URI& rhs) const;
384
385 private:
386 static strong_ordering TWC_ (const URI& lhs, const URI& rhs); // utility code shared between c++17 and c++20 versions
387
388 public:
389 /**
390 * For debugging purposes: don't count on the format.
391 */
392 nonvirtual String ToString () const;
393
394 private:
395 static void CheckValidPathForAuthority_ (const optional<Authority>& authority, const String& path);
396
397 private:
398 optional<SchemeType> fScheme_; // aka protocol
399 optional<Authority> fAuthority_; // aka host+port+username
400 String fPath_; // Can be empty string, but documented as always 'present' even as empty so model that way
401 optional<String> fQuery_; // ditto
402 optional<String> fFragment_; // ditto
403 [[no_unique_address]] Debug::AssertExternallySynchronizedMutex fThisAssertExternallySynchronized_;
404 };
405 static_assert (totally_ordered<URI>);
406
407}
408
/**
 *  Specialization of std::hash for Stroika::Foundation::IO::Network::URI. Only the call operator is declared here.
 *
 *  NOTE(review): the definition is presumably in URI.inl (included below); which URI components feed the hash is
 *  not visible in this header - confirm it is consistent with URI::operator==.
 */
409template <>
410class std::hash<Stroika::Foundation::IO::Network::URI> {
411public:
412 size_t operator() (const Stroika::Foundation::IO::Network::URI& arg) const;
413};
414
415/*
416 ********************************************************************************
417 ***************************** Implementation Details ***************************
418 ********************************************************************************
419 */
420#include "URI.inl"
421
422#endif /*_Stroika_Foundation_IO_Network_URI_h_*/
#define Stroika_Define_Enum_Bounds(FIRST_ITEM, LAST_ITEM)
StringPCTEncodedFlag
for some purposes, we may want to render objects PCT-encoded, and sometimes not (plain or decoded)....
String is like std::u32string, except it is much easier to use, often much more space efficient,...
Definition String.h:201
NOT a real mutex - just a debugging infrastructure support tool so in debug builds can be assured thr...
nonvirtual T As(optional< StringPCTEncodedFlag > pctEncoded={}) const
Authority is roughly the part of a URL where you say the hostname (and portnumber etc) - part just af...