Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
CodePage.inl
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include <algorithm>
5#include <cstring>
6
8
9#if qStroika_Foundation_Common_Platform_Windows
10#include "Stroika/Foundation/Characters/Platform/Windows/CodePage.h"
11#endif
12
13#include "UTFConvert.h"
14
16
17 /*
18 ********************************************************************************
19 ************ CodePageConverter::CodePageNotSupportedException ******************
20 ********************************************************************************
21 */
22 inline CodePage CodePageNotSupportedException::GetCodePage () const
23 {
24 return fCodePage_;
25 }
26
27 /*
28 ********************************************************************************
29 ****************************** CodePagesInstalled ******************************
30 ********************************************************************************
31 */
32 inline vector<CodePage> CodePagesInstalled::GetAll ()
33 {
34 return fCodePages_;
35 }
36 inline bool CodePagesInstalled::IsCodePageAvailable (CodePage cp)
37 {
38 return find (fCodePages_.begin (), fCodePages_.end (), cp) == fCodePages_.end ();
39 }
40
41 /// <summary>
42 //////////////////////////// DEPRECATED BELOW.../////////////////////////////
43 /// </summary>
44
45 class [[deprecated ("Since Stroika v3.0d2, use ReadByteOrderMark")]] CodePagesGuesser {
46 public:
47 enum class Confidence : uint8_t {
48 eLow = 0,
49 eMedium = 10,
50 eHigh = 100
51 };
52
53 public:
54 /*
55 @METHOD: CodePagesGuesser::Guess
56 @DESCRIPTION: <p>Guess the code page of the given snippet of text. Return that codepage.
57 Always make some guess, and return the level of quality of the guess in the
58 optional parameter 'confidence' - unless its nullptr (which it is by default),
59 and return the number of bytes of BOM (byte-order-mark) prefix to strip from
60 the source in 'bytesFromFrontToStrip' unless it is nullptr (which it is by
61 default).
62 </p>
63 */
64 nonvirtual CodePage Guess (const void* input, size_t nBytes, Confidence* confidence = nullptr, size_t* bytesFromFrontToStrip = nullptr);
65 };
66
67 /// <summary>
68 /// DEPRECATED
69 /// </summary>
70 enum {
71 kCodePage_INVALID [[deprecated ("Since v3.0d2 - deprecated - use optional")]] = 0xffffffff, // I hope this is always an invalid code page???
72
73 kCodePage_ANSI [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kANSI")]] = WellKnownCodePages::kANSI,
74
75 kCodePage_MAC [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kMAC")]] = WellKnownCodePages::kMAC,
76 kCodePage_PC [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kPC")]] = WellKnownCodePages::kPC,
77 kCodePage_PCA [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kPCA")]] = WellKnownCodePages::kPCA,
78 kCodePage_Thai [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kThai")]] = WellKnownCodePages::kThai,
79 kCodePage_SJIS [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kSJIS")]] = WellKnownCodePages::kSJIS,
80 kCodePage_GB2312 [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kGB2312")]] = WellKnownCodePages::kGB2312,
81 kCodePage_Korean [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kKorean")]] = WellKnownCodePages::kKorean,
82 kCodePage_BIG5 [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kBIG5")]] = WellKnownCodePages::kBIG5,
83 kCodePage_EasternEuropean [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kEasternEuropean")]] =
84 WellKnownCodePages::kEasternEuropean,
85 kCodePage_CYRILIC [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kCyrilic")]] = WellKnownCodePages::kCyrilic,
86 kCodePage_GREEK [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kGreek")]] = WellKnownCodePages::kGreek,
87 kCodePage_Turkish [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kTurkish")]] = WellKnownCodePages::kTurkish,
88 kCodePage_HEBREW [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kHebrew")]] = WellKnownCodePages::kHebrew,
89 kCodePage_ARABIC [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kArabic")]] = WellKnownCodePages::kArabic,
90 kCodePage_Baltic [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kBaltic")]] = WellKnownCodePages::kBaltic,
91 kCodePage_Vietnamese [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kVietnamese")]] = WellKnownCodePages::kVietnamese,
92
93 kCodePage_UNICODE_WIDE [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kUNICODE_WIDE")]] = WellKnownCodePages::kUNICODE_WIDE,
94 kCodePage_UNICODE_WIDE_BIGENDIAN [[deprecated ("Since v3.0d2 - deprecated - use WellKnownCodePages::kUNICODE_WIDE_BIGENDIAN")]] =
95 WellKnownCodePages::kUNICODE_WIDE_BIGENDIAN,
96
97 kCodePage_UTF7 [[deprecated ("Since v3.0d2 - UTF-7 deprecated")]] = 65000,
98 kCodePage_UTF8 [[deprecated ("Since v3.0d2 - WellKnownCodePages::kUTF8")]] = WellKnownCodePages::kUTF8
99 };
100
101 class [[deprecated ("Since Stroika v3.0d2, use CodeCvt - for example - CodeCvt<wchar_t>{locale{}}.String2Bytes<SDKString> (span<const "
102 "wchar_t>{s})")]] CodePageConverter {
103 public:
104 enum class HandleBOMFlag {
105 eHandleBOM
106 };
107 static constexpr HandleBOMFlag eHandleBOM = HandleBOMFlag::eHandleBOM;
108
109 public:
110 /**
111 */
112 CodePageConverter (CodePage codePage);
113 CodePageConverter (CodePage codePage, HandleBOMFlag h);
114
115 public:
116 /**
117 * <p>In UNICODE, files are generally headed by a byte order mark (BOM). This
118 mark is used to indicate if the file is big endian, or little-endian (if the
119 characters are wide-characters). This is true for 2 and 4 byte UNICODE (UCS-2, UCS-4)
120 UNICODE, as well as for UTF-X encodings (such as UTF-7 and UTF-8). It is also used
121 to indicate whether or not the file is in a UTF encoding (as byte order doesn't matter
122 in any (most?) of the UTF encodings.</p>
123 <p>The basic rubrick for BOM's is that they are the character 0xfeff, as it would
124 be encoded in the given UTF or UCS encoding.</p>
125 <p>Because of this type of encoding - if you have a 0xfeff character (after
126 decoding) at the beginning of a buffer, there is no way for this routine to know if
127 that was REALLY there, or if it was byte order mark. And its not always desirable for
128 the routine producing these encodings to produce the byte order mark, but sometimes
129 its highly desirable. So - this class lets you get/set a flag to indicate whether or not
130 to process BOMs on input, and whether or not to generate them on encoded outputs.
131 </p>
132 <p>See also @'CodePageConverter::SetHandleBOM', and note that there is an
133 overloaded CTOR that lets you specify CodePageConverter::eHandleBOM as a final
134 argument to automatically set this BOM converter flag.</p>
135 */
136 nonvirtual bool GetHandleBOM () const;
137 /**
138 * See also @'CodePageConverter::GetHandleBOM'.</p>
139 */
140 nonvirtual void SetHandleBOM (bool handleBOM);
141
142 private:
143 bool fHandleBOM;
144
145 public:
146 /**
147 * Map the given multibyte chars in the fCodePage codepage into wide UNICODE
148 * characters. Pass in a buffer 'outChars' of
149 * size large enough to accomodate those characrters.</p>
150 *
151 * 'outCharCnt' is the size of the output buffer coming in, and it contains the number
152 * of UNICODE chars copied out on return.</p>
153 */
154 nonvirtual void MapToUNICODE (const char* inMBChars, size_t inMBCharCnt, char16_t* outChars, size_t* outCharCnt) const;
155 nonvirtual void MapToUNICODE (const char* inMBChars, size_t inMBCharCnt, char32_t* outChars, size_t* outCharCnt) const;
156 nonvirtual void MapToUNICODE (const char* inMBChars, size_t inMBCharCnt, wchar_t* outChars, size_t* outCharCnt) const;
157
158 /*
159 @METHOD: CodePageConverter::MapToUNICODE_QuickComputeOutBufSize
160 @DESCRIPTION: <p>Call to get an upper bound, reasonable buffer size to use to pass to
161 @'CodePageConverter::MapToUNICODE' calls.</p>
162 */
163 nonvirtual size_t MapToUNICODE_QuickComputeOutBufSize (const char* inMBChars, size_t inMBCharCnt) const;
164
165 nonvirtual void MapFromUNICODE (const char16_t* inChars, size_t inCharCnt, char* outChars, size_t* outCharCnt) const;
166 nonvirtual void MapFromUNICODE (const char32_t* inChars, size_t inCharCnt, char* outChars, size_t* outCharCnt) const;
167 nonvirtual void MapFromUNICODE (const wchar_t* inChars, size_t inCharCnt, char* outChars, size_t* outCharCnt) const;
168
169 /*
170 @METHOD: CodePageConverter::MapFromUNICODE_QuickComputeOutBufSize
171 @DESCRIPTION: <p>Call to get an upper bound, reasonable buffer size to use to pass to MapFromUNICODE calls.</p>
172 */
173 nonvirtual size_t MapFromUNICODE_QuickComputeOutBufSize (const char16_t* inChars, size_t inCharCnt) const;
174 nonvirtual size_t MapFromUNICODE_QuickComputeOutBufSize (const char32_t* inChars, size_t inCharCnt) const;
175 nonvirtual size_t MapFromUNICODE_QuickComputeOutBufSize (const wchar_t* inChars, size_t inCharCnt) const;
176
177 private:
178 CodePage fCodePage;
179 };
180
181 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt")]] inline void MapSBUnicodeTextWithMaybeBOMToUNICODE (const char* inMBChars, size_t inMBCharCnt,
182 wchar_t* outChars, size_t* outCharCnt)
183 {
185 DISABLE_COMPILER_GCC_WARNING_START ("GCC diagnostic ignored \"-Wdeprecated-declarations\"");
186 DISABLE_COMPILER_CLANG_WARNING_START ("clang diagnostic ignored \"-Wdeprecated-declarations\"");
187 RequireNotNull (outChars);
188 RequireNotNull (outCharCnt);
189 [[maybe_unused]] size_t outBufSize = *outCharCnt;
190 CodePagesGuesser::Confidence confidence = CodePagesGuesser::Confidence::eLow;
191 CodePage cp = CodePagesGuesser{}.Guess (inMBChars, inMBCharCnt, &confidence, nullptr);
192 if (confidence <= CodePagesGuesser::Confidence::eLow) {
193 cp = WellKnownCodePages::kUTF8;
194 }
195 CodePageConverter cpCvt (cp, CodePageConverter::eHandleBOM);
196 cpCvt.MapToUNICODE (inMBChars, inMBCharCnt, outChars, outCharCnt);
197 Ensure (*outCharCnt <= outBufSize);
198 DISABLE_COMPILER_MSC_WARNING_END (4996);
199 DISABLE_COMPILER_GCC_WARNING_END ("GCC diagnostic ignored \"-Wdeprecated-declarations\"");
200 DISABLE_COMPILER_CLANG_WARNING_END ("clang diagnostic ignored \"-Wdeprecated-declarations\"");
201 }
202
203 /*
204 ********************************************************************************
205 ****************************** GetDefaultSDKCodePage ***************************
206 ********************************************************************************
207 */
208 [[deprecated ("Since Stroika v3.0d2 - on windows use CP_ACP, and elsewhere probably dont use")]] inline CodePage GetDefaultSDKCodePage ()
209 {
210#if qStroika_Foundation_Common_Platform_Windows
211//http://blogs.msdn.com/b/michkap/archive/2005/01/06/347834.aspx
212#if defined(CP_ACP)
213 Assert (CP_ACP == 0);
214 return CP_ACP; // special windows define which maps to the current OS code page
215#else
216 return 0;
217#endif
218//GetACP() // means essentially the same thing but supposedly (even if we cahced GetACP() - CP_ACP is faster)
219#else
220 // MAYBE should use the LOCALE stuff - and get the current code-page from the locale? If such a thing?
221 return WellKnownCodePages::kUTF8; // So far - this is meaningless on other systems - but this would be the best guess I think
222#endif
223 }
224
225 /*
226 ********************************************************************************
227 ******************************* CodePageConverter ******************************
228 ********************************************************************************
229 */
230 inline CodePageConverter::CodePageConverter (CodePage codePage)
231 : fHandleBOM{false}
232 , fCodePage{codePage}
233 {
234 }
235 inline CodePageConverter::CodePageConverter (CodePage codePage, [[maybe_unused]] HandleBOMFlag h)
236 : fHandleBOM{true}
237 , fCodePage{codePage}
238 {
239 Require (h == eHandleBOM);
240 }
241 inline bool CodePageConverter::GetHandleBOM () const
242 {
243 return fHandleBOM;
244 }
245 inline void CodePageConverter::SetHandleBOM (bool handleBOM)
246 {
247 fHandleBOM = handleBOM;
248 }
249 inline size_t CodePageConverter::MapToUNICODE_QuickComputeOutBufSize (const char* /*inMBChars*/, size_t inMBCharCnt) const
250 {
251 size_t resultSize = inMBCharCnt;
252 return resultSize;
253 }
254 inline void CodePageConverter::MapToUNICODE (const char* inMBChars, size_t inMBCharCnt, wchar_t* outChars, size_t* outCharCnt) const
255 {
256 static_assert ((sizeof (wchar_t) == sizeof (char16_t)) or (sizeof (wchar_t) == sizeof (char32_t)),
257 "(sizeof (wchar_t) == sizeof (char16_t)) or (sizeof (wchar_t) == sizeof (char32_t))");
258 if constexpr (sizeof (wchar_t) == sizeof (char16_t)) {
259 MapToUNICODE (inMBChars, inMBCharCnt, reinterpret_cast<char16_t*> (outChars), outCharCnt);
260 }
261 else if constexpr (sizeof (wchar_t) == sizeof (char32_t)) {
262 MapToUNICODE (inMBChars, inMBCharCnt, reinterpret_cast<char32_t*> (outChars), outCharCnt);
263 }
264 }
265 inline void CodePageConverter::MapFromUNICODE (const wchar_t* inChars, size_t inCharCnt, char* outChars, size_t* outCharCnt) const
266 {
267 static_assert ((sizeof (wchar_t) == sizeof (char16_t)) or (sizeof (wchar_t) == sizeof (char32_t)),
268 "(sizeof (wchar_t) == sizeof (char16_t)) or (sizeof (wchar_t) == sizeof (char32_t))");
269 if constexpr (sizeof (wchar_t) == sizeof (char16_t)) {
270 MapFromUNICODE (reinterpret_cast<const char16_t*> (inChars), inCharCnt, outChars, outCharCnt);
271 }
272 else if constexpr (sizeof (wchar_t) == sizeof (char32_t)) {
273 MapFromUNICODE (reinterpret_cast<const char32_t*> (inChars), inCharCnt, outChars, outCharCnt);
274 }
275 }
276
277 DISABLE_COMPILER_MSC_WARNING_START (4996);
278 DISABLE_COMPILER_GCC_WARNING_START ("GCC diagnostic ignored \"-Wdeprecated-declarations\"");
279 DISABLE_COMPILER_CLANG_WARNING_START ("clang diagnostic ignored \"-Wdeprecated-declarations\"");
280
281 // @todo consider losing MOST of these - at least from this file
282
283 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt or String")]] void NarrowStringToWide (const char* sStart, const char* sEnd,
284 CodePage codePage, wstring* intoResult);
285 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt or String")]] inline void NarrowStringToWide (const string& s, CodePage codePage, wstring* intoResult)
286 {
287 RequireNotNull (intoResult);
288 const char* sp = s.c_str ();
289 NarrowStringToWide (sp, sp + s.length (), codePage, intoResult);
290 }
291 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt or String")]] inline wstring NarrowStringToWide (const string& s, CodePage codePage)
292 {
293 wstring result;
294 NarrowStringToWide (s, codePage, &result);
295 return result;
296 }
297 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt or String::FromUTF8")]] inline void UTF8StringToWide (const char* s, wstring* intoStr)
298 {
299 RequireNotNull (s);
300 NarrowStringToWide (s, s + ::strlen (s), WellKnownCodePages::kUTF8, intoStr);
301 }
302 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt or String")]] inline void UTF8StringToWide (const string& s, wstring* intoStr)
303 {
304 NarrowStringToWide (s, WellKnownCodePages::kUTF8, intoStr);
305 }
306 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt or String")]] inline wstring UTF8StringToWide (const char* s)
307 {
308 RequireNotNull (s);
309 wstring result;
310 NarrowStringToWide (s, s + ::strlen (s), WellKnownCodePages::kUTF8, &result);
311 return result;
312 }
313 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt or String")]] inline wstring UTF8StringToWide (const string& s)
314 {
315 return NarrowStringToWide (s, WellKnownCodePages::kUTF8);
316 }
317
318 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt")]] void WideStringToNarrow (const wchar_t* wsStart, const wchar_t* wsEnd,
319 CodePage codePage, string* intoResult);
320
321 [[deprecated ("Since Stroika v3.0d2 - ")]] inline void WideStringToNarrow (const wstring& ws, CodePage codePage, string* intoResult)
322 {
323 RequireNotNull (intoResult);
324 const wchar_t* wsp = ws.c_str ();
325 WideStringToNarrow (wsp, wsp + ws.length (), codePage, intoResult);
326 }
327 [[deprecated ("Since Stroika v3.0d2 - ")]] inline string WideStringToNarrow (const wstring& ws, CodePage codePage)
328 {
329 string result;
330 WideStringToNarrow (ws, codePage, &result);
331 return result;
332 }
333
334 // @todo DEPRECATE
335
336 [[deprecated ("Since Stroika v3.0d2 - use String::FromSDKString ().AsUTF8 () or CodeCvt<char8_t>{}")]] inline string WideStringToUTF8 (const wstring& ws)
337 {
338 return WideStringToNarrow (ws, WellKnownCodePages::kUTF8);
339 }
340
341 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt")]] vector<byte>
342 MapUNICODETextToSerializedFormat (const wchar_t* start, const wchar_t* end, CodePage useCP = WellKnownCodePages::kUTF8); // suitable for files
343
344 [[deprecated ("Since Stroika v3.0d2 - use CodeCvt")]] wstring MapUNICODETextWithMaybeBOMTowstring (const char* start, const char* end);
345
346 [[deprecated ("Since Stroika v3.0d2 - use wstring{s.begin(), s.end()}")]] inline wstring ASCIIStringToWide (const string& s)
347 {
349 for (string::const_iterator i = s.begin (); i != s.end (); ++i) {
350 Assert (isascii (*i));
351 }
352 }
353 return wstring (s.begin (), s.end ());
354 }
355 [[deprecated ("Since Stroika v3.0d2 - seems unneeded - use String{}.AsASCII() iuf needed}")]] inline string WideStringToASCII (const wstring& s)
356 {
358 for (wstring::const_iterator i = s.begin (); i != s.end (); ++i) {
359 Assert (isascii (*i));
360 }
361 }
362 DISABLE_COMPILER_MSC_WARNING_START (4244) // 'argument': conversion from 'const wchar_t' to 'const _Elem', possible loss of data
363 return string{s.begin (), s.end ()};
364 DISABLE_COMPILER_MSC_WARNING_END (4244)
365 }
366 DISABLE_COMPILER_MSC_WARNING_END (4996);
367 DISABLE_COMPILER_GCC_WARNING_END ("GCC diagnostic ignored \"-Wdeprecated-declarations\"");
368 DISABLE_COMPILER_CLANG_WARNING_END ("clang diagnostic ignored \"-Wdeprecated-declarations\"");
369
370}
#define qStroika_Foundation_Debug_AssertionsChecked
The qStroika_Foundation_Debug_AssertionsChecked flag determines if assertions are checked and validat...
Definition Assertions.h:48
#define RequireNotNull(p)
Definition Assertions.h:347