Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
Platform/Windows/CodePage.inl
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include "Stroika/Foundation/Containers/Common.h"
6
8
9 /*
10 ********************************************************************************
11 ********************************* WideStringToNarrow ***************************
12 ********************************************************************************
13 */
14 inline void WideStringToNarrow (const wchar_t* wsStart, const wchar_t* wsEnd, CodePage codePage, string* intoResult)
15 {
16 RequireNotNull (intoResult);
17 Require (wsStart <= wsEnd);
18 if (wsStart == wsEnd) {
19 intoResult->clear ();
20 return;
21 }
22 Assert ((wsEnd - wsStart) < numeric_limits<int>::max ()); // WideCharToMultiByte API uses int - and this impl assumes fits
23 constexpr bool kUseOptimizedApporach_ = true;
24 if (kUseOptimizedApporach_) {
25 int wsLen = static_cast<int> (wsEnd - wsStart);
26 int guessNewSterlLen = max<int> (wsLen, 2 * 64 - 10); // for ascii, wsLen enuf. For most string, they will fit in 64 chars (apx)
27 // in these cases, we solve in one pass/one call to OS/charmapper
28 intoResult->resize (guessNewSterlLen); // maybe over-allocates a bit but will pare back
29 int nCharsWritten =
30 ::WideCharToMultiByte (codePage, 0, wsStart, wsLen, Containers::Start (*intoResult), guessNewSterlLen, nullptr, nullptr);
31 if ((nCharsWritten == 0) and (::GetLastError () == ERROR_INSUFFICIENT_BUFFER)) {
32 guessNewSterlLen = ::WideCharToMultiByte (codePage, 0, wsStart, static_cast<int> (wsLen), nullptr, 0, nullptr, nullptr);
33 intoResult->resize (guessNewSterlLen);
34 nCharsWritten =
35 ::WideCharToMultiByte (codePage, 0, wsStart, wsLen, Containers::Start (*intoResult), guessNewSterlLen, nullptr, nullptr);
36 }
37 Assert (nCharsWritten != 0);
38 Verify (nCharsWritten != 0);
39 Verify (nCharsWritten > 0);
40 Verify (nCharsWritten <= guessNewSterlLen);
41 if (nCharsWritten != guessNewSterlLen) {
42 intoResult->resize (nCharsWritten); // shrink to fit
43 }
44 }
45 else {
46 int wsLen = static_cast<int> (wsEnd - wsStart);
47 int stringLength = ::WideCharToMultiByte (codePage, 0, wsStart, wsLen, nullptr, 0, nullptr, nullptr);
48 intoResult->resize (stringLength);
49 if (stringLength != 0) {
50 Verify (::WideCharToMultiByte (codePage, 0, wsStart, wsLen, Containers::Start (*intoResult), stringLength, nullptr, nullptr) == stringLength);
51 }
52 }
53 }
54
55 /*
56 ********************************************************************************
57 ********************************* NarrowStringToWide ***************************
58 ********************************************************************************
59 */
60 inline void NarrowStringToWide (const char* sStart, const char* sEnd, CodePage codePage, wstring* intoResult)
61 {
62 RequireNotNull (intoResult);
63 Require (sStart <= sEnd);
64 if (sStart == sEnd) {
65 intoResult->clear ();
66 return;
67 }
68 Assert ((sEnd - sStart) < numeric_limits<int>::max ()); // MultiByteToWideChar API uses int - and this impl assumes fits
69 int sLen = static_cast<int> (sEnd - sStart);
70 Assert (sLen > 0);
71 const bool kUseOptimizedApporach_ = true;
72 if (kUseOptimizedApporach_) {
73 int guessNewSterlLen = sLen;
74 Assert (guessNewSterlLen >= ::MultiByteToWideChar (codePage, 0, sStart, static_cast<int> (sLen), nullptr, 0));
75 intoResult->resize (guessNewSterlLen); // maybe overallocates a little sometimes...
76 int nCharsWritten = ::MultiByteToWideChar (codePage, 0, sStart, sLen, Containers::Start (*intoResult), guessNewSterlLen);
77 Verify (nCharsWritten != 0);
78 Verify (nCharsWritten > 0);
79 Verify (nCharsWritten <= guessNewSterlLen);
80 if (nCharsWritten != guessNewSterlLen) {
81 intoResult->resize (nCharsWritten); // shrink to fit
82 }
83 }
84 else {
85 int newStrLen = ::MultiByteToWideChar (codePage, 0, sStart, sLen, nullptr, 0);
86 intoResult->resize (newStrLen);
87 if (newStrLen != 0) {
88 Verify (::MultiByteToWideChar (codePage, 0, sStart, sLen, Containers::Start (*intoResult), newStrLen) == newStrLen);
89 }
90 }
91 }
92
93 /*
94 ********************************************************************************
95 ********************************* BSTR2wstring *********************************
96 ********************************************************************************
97 */
98 inline wstring BSTR2wstring (BSTR b)
99 {
100 if (b == nullptr) {
101 return wstring{};
102 }
103 else {
104 return wstring{b};
105 }
106 }
107
108 /*
109 ********************************************************************************
110 ****************************** Win32CharSetToCodePage **************************
111 ********************************************************************************
112 */
113 inline CodePage Win32CharSetToCodePage (uint8_t lfCharSet)
114 {
115// See MSFT info article Q165478. No routine to map from these charset values - just this table... Sigh...
116// LGP 2001-04-30
117
118// In the file http://msdn.microsoft.com/library/specs/rtfspec_6.htm - there are many more of these
119// magic#s documented. But not how to get the numbers back from a font. May want to support more
120// of these? But I think not - I think they are now redudundantly specified via better mecahnims,
121// like UNICODE or \cpg
122// -- LGP 2000/04/29
123//
124// For SPR#1184 I grabbed a few more numbers. The mapping to codepages can be roughly guestimated by looking
125// in the registry around HKEY_LOCAL_MACHINE\SOFTWARE\Classes\MIME\Database\Charset\
126 // -- LGP 2002-11-29
127#ifdef CP_ACP
128 Assert (CP_ACP == 0);
129#else
130 const unsigned char CP_ACP = 0;
131#endif
132#ifdef ANSI_CHARSET
133 Assert (ANSI_CHARSET == 0);
134#else
135 const unsigned char ANSI_CHARSET = 0;
136#endif
137#ifdef DEFAULT_CHARSET
138 Assert (DEFAULT_CHARSET == 1);
139#else
140 const unsigned char DEFAULT_CHARSET = 1;
141#endif
142#ifdef MAC_CHARSET
143 Assert (MAC_CHARSET == 77);
144#else
145 const unsigned char MAC_CHARSET = 77;
146#endif
147#ifdef SHIFTJIS_CHARSET
148 Assert (SHIFTJIS_CHARSET == 128);
149#else
150 const unsigned char SHIFTJIS_CHARSET = 128;
151#endif
152#ifdef HANGEUL_CHARSET
153 Assert (HANGEUL_CHARSET == 129);
154#else
155 const unsigned char HANGEUL_CHARSET = 129;
156#endif
157#ifdef JOHAB_CHARSET
158 Assert (JOHAB_CHARSET == 130);
159#else
160 const unsigned char JOHAB_CHARSET = 130;
161#endif
162#ifdef GB2312_CHARSET
163 Assert (GB2312_CHARSET == 134);
164#else
165 const unsigned char GB2312_CHARSET = 134;
166#endif
167#ifdef CHINESEBIG5_CHARSET
168 Assert (CHINESEBIG5_CHARSET == 136);
169#else
170 const unsigned char CHINESEBIG5_CHARSET = 136;
171#endif
172#ifdef GREEK_CHARSET
173 Assert (GREEK_CHARSET == 161);
174#else
175 const unsigned char GREEK_CHARSET = 161;
176#endif
177#ifdef TURKISH_CHARSET
178 Assert (TURKISH_CHARSET == 162);
179#else
180 const unsigned char TURKISH_CHARSET = 162;
181#endif
182#ifdef VIETNAMESE_CHARSET
183 Assert (VIETNAMESE_CHARSET == 163);
184#else
185 const unsigned char VIETNAMESE_CHARSET = 163;
186#endif
187#ifdef HEBREW_CHARSET
188 Assert (HEBREW_CHARSET == 177);
189#else
190 const unsigned char HEBREW_CHARSET = 177;
191#endif
192#ifdef ARABIC_CHARSET
193 Assert (ARABIC_CHARSET == 178);
194#else
195 const unsigned char ARABIC_CHARSET = 178;
196#endif
197#ifdef BALTIC_CHARSET
198 Assert (BALTIC_CHARSET == 186);
199#else
200 const unsigned char BALTIC_CHARSET = 186;
201#endif
202#ifdef RUSSIAN_CHARSET
203 Assert (RUSSIAN_CHARSET == 204);
204#else
205 const unsigned char RUSSIAN_CHARSET = 204;
206#endif
207#ifdef THAI_CHARSET
208 Assert (THAI_CHARSET == 222);
209#else
210 const unsigned char THAI_CHARSET = 222;
211#endif
212#ifdef EASTEUROPE_CHARSET
213 Assert (EASTEUROPE_CHARSET == 238);
214#else
215 const unsigned char EASTEUROPE_CHARSET = 238;
216#endif
217#ifdef OEM_CHARSET
218 Assert (OEM_CHARSET == 255);
219#else
220 const unsigned char OEM_CHARSET = 255;
221#endif
222
223 switch (lfCharSet) {
224 case ANSI_CHARSET:
225 return WellKnownCodePages::kANSI; // right? Maybe SB? WellKnownCodePages::kANSI (1252)???
226 case MAC_CHARSET:
227 return WellKnownCodePages::kMAC;
228 case SHIFTJIS_CHARSET:
229 return WellKnownCodePages::kSJIS; // Japanese (SJIS)
230 case HANGEUL_CHARSET:
231 return WellKnownCodePages::kKorean; // Hangul
232 case GB2312_CHARSET:
233 return WellKnownCodePages::kGB2312; // Chinese
234 case CHINESEBIG5_CHARSET:
235 return WellKnownCodePages::kBIG5; // Chinese
236 case GREEK_CHARSET:
237 return WellKnownCodePages::kGreek; // Greek
238 case TURKISH_CHARSET:
239 return WellKnownCodePages::kTurkish; // Turkish
240 case VIETNAMESE_CHARSET:
241 return WellKnownCodePages::kVietnamese; // Vietnamese
242 case HEBREW_CHARSET:
243 return WellKnownCodePages::kHebrew; // Hebrew
244 case ARABIC_CHARSET:
245 return WellKnownCodePages::kArabic; // Arabic
246 case 179:
247 return WellKnownCodePages::kArabic; // Arabic Traditional
248 case 180:
249 return WellKnownCodePages::kArabic; // Arabic user
250 case 181:
251 return WellKnownCodePages::kHebrew; // Hebrew user
252 case BALTIC_CHARSET:
253 return WellKnownCodePages::kBaltic; // Baltic
254 case RUSSIAN_CHARSET:
255 return WellKnownCodePages::kCyrilic; // Russian/Cyrilic
256 case THAI_CHARSET:
257 return WellKnownCodePages::kThai; // Thai
258 case EASTEUROPE_CHARSET:
259 return WellKnownCodePages::kEasternEuropean; // aka 'central european'?
260 case 254:
261 return WellKnownCodePages::kPC;
262 break;
263 case OEM_CHARSET:
264 return WellKnownCodePages::kPCA;
265 break;
266 default:
267 return CP_ACP;
268 }
269 }
270
271 /*
272 ********************************************************************************
273 ************************ Win32PrimaryLangIDToCodePage **************************
274 ********************************************************************************
275 */
276 inline CodePage Win32PrimaryLangIDToCodePage (USHORT languageIdenifier)
277 {
278#ifdef CP_ACP
279 Assert (CP_ACP == 0);
280#else
281 const unsigned char CP_ACP = 0;
282#endif
283 /*
284 * I haven't found this stuff documented anyplace. Its hard to imagine that MSFT doesn't provide their own
285 * mapping routines! Anyhow - I got a start on this from some UNISCRIBE sample code, and have since
286 * added on from educated guesswork. -- LGP 2003-01-30
287 */
288 switch (PRIMARYLANGID (languageIdenifier)) {
289 case LANG_ARABIC:
290 return WellKnownCodePages::kArabic;
291 case LANG_ENGLISH:
292 return WellKnownCodePages::kANSI;
293 case LANG_FRENCH:
294 return WellKnownCodePages::kANSI;
295 case LANG_GERMAN:
296 return WellKnownCodePages::kANSI;
297 case LANG_GREEK:
298 return WellKnownCodePages::kGreek;
299 case LANG_HEBREW:
300 return WellKnownCodePages::kHebrew;
301 case LANG_ICELANDIC:
302 return WellKnownCodePages::kANSI; // guess? - 2003-01-30
303 case LANG_ITALIAN:
304 return WellKnownCodePages::kANSI;
305 case LANG_KOREAN:
306 return WellKnownCodePages::kKorean;
307 case LANG_POLISH:
308 return WellKnownCodePages::kEasternEuropean; // a bit of a guess - LGP 2003-01-30
309 case LANG_PORTUGUESE:
310 return WellKnownCodePages::kANSI;
311 case LANG_RUSSIAN:
312 return WellKnownCodePages::kCyrilic;
313 case LANG_SPANISH:
314 return WellKnownCodePages::kANSI;
315 case LANG_SWEDISH:
316 return WellKnownCodePages::kANSI; // guess? - 2003-01-30
317 case LANG_THAI:
318 return WellKnownCodePages::kThai;
319 case LANG_TURKISH:
320 return WellKnownCodePages::kTurkish;
321 case LANG_UKRAINIAN:
322 return WellKnownCodePages::kCyrilic; // guess? - 2003-01-30
323 case LANG_VIETNAMESE:
324 return WellKnownCodePages::kVietnamese;
325 default:
326 return CP_ACP;
327 }
328 }
329
330 /*
331 ********************************************************************************
332 ************************ PlatformCodePageConverter *****************************
333 ********************************************************************************
334 */
335 inline PlatformCodePageConverter::PlatformCodePageConverter (CodePage codePage)
336 : fCodePage_{codePage}
337 {
338 }
339
340}
#define RequireNotNull(p)
Definition Assertions.h:347
#define Verify(c)
Definition Assertions.h:419
CONTAINER::value_type * Start(CONTAINER &c)
For a contiguous container (such as a vector or basic_string) - find the pointer to the start of the ...