Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
TextBreaks.cpp
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include "Stroika/Frameworks/StroikaPreComp.h"
5
6#include <cctype>
7
11
12#include "Stroika/Frameworks/Led/Config.h"
13
14#include "TextBreaks.h"
15
16using std::byte;
17
18using namespace Stroika::Foundation;
20
21using namespace Stroika::Frameworks;
22using namespace Stroika::Frameworks::Led;
23
24// These SHOULD work for UNICODE, MBYTE and SingleByte case...
25inline bool IsASCIISpace (Led_tChar c)
26{
27 return isascii (c) and isspace (c);
28}
29inline bool IsASCIIAlnum (Led_tChar c)
30{
31 return isascii (c) and isalnum (c);
32}
33inline bool IsASCIIAlpha (Led_tChar c)
34{
35 return isascii (c) and isalpha (c);
36}
37inline bool IsASCIIDigit (Led_tChar c)
38{
39 return isascii (c) and isdigit (c);
40}
41
/*
 *  True iff 'c' falls in one of the two byte ranges 0x81..0x9f or 0xe0..0xfc
 *  (ranges based on code from LEC - mtcdef.h).
 */
static bool SJIS_IsLeadByte (unsigned char c)
{
    // Outside the overall 0x81..0xfc envelope: cannot be in either range
    if (c < 0x81 or c > 0xfc) {
        return false;
    }
    // Inside the envelope: only the gap 0xa0..0xdf is excluded
    return c <= 0x9f or c >= 0xe0;
}
47static bool SJIS_IsBOLChar (const char* mbChar)
48{
49 AssertNotNull (mbChar);
50 unsigned char byte0 = (unsigned char)mbChar[0];
51 unsigned char byte1 = (unsigned char)mbChar[1];
52 // Based on code from LEC - jwrap.c
53 static constexpr unsigned char yBits[8] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
54 static constexpr unsigned char yBOLTable[4][32] = {
55 {0x0, 0x0, 0x0, 0x0, 0x2, 0x52, 0x0, 0xcc, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x20,
56 0x0, 0x0, 0x0, 0x0, 0x98, 0xff, 0x0, 0x0, 0x0, 0x0, 0x0, 0xc0, 0x0, 0x0, 0x0, 0x0},
57 {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xdf, 0x3, 0x3c, 0x1, 0x40, 0x55, 0x55, 0x5,
58 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
59 {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
60 0x0, 0x0, 0x0, 0x80, 0xa2, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2a, 0x10, 0x0, 0x0},
61 {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x55, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
62 0xa8, 0x40, 0x60, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}};
63 unsigned char hi;
64 unsigned char lo;
65 if (SJIS_IsLeadByte (byte0)) {
66 if (byte0 < (unsigned char)0x81 || byte0 > (unsigned char)0x83)
67 return false;
68 else {
69 hi = byte0;
70 hi -= 0x80;
71 lo = byte1;
72 }
73 }
74 else {
75 hi = 0;
76 lo = byte0;
77 }
78 bool isBOLChar = (yBOLTable[hi][lo / 8] & yBits[lo & 7]);
79 return (isBOLChar);
80}
81static bool SJIS_IsEOLChar (const char* mbChar)
82{
83 AssertNotNull (mbChar);
84 unsigned char byte0 = (unsigned char)mbChar[0];
85 unsigned char byte1 = (unsigned char)mbChar[1];
86 // Based on code from LEC - jwrap.c
87 static unsigned char yBits[8] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
88 static unsigned char yEOLTable[2][32] = {{0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x10, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x8,
89 0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
90 {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xa0, 0xaa, 0xaa, 0x2,
91 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}};
92 unsigned char hi;
93 unsigned char lo;
94 if (SJIS_IsLeadByte (byte0)) {
95 if (byte0 != 0x81) {
96 return false;
97 }
98 else {
99 hi = 1;
100 lo = byte1;
101 }
102 }
103 else {
104 hi = 0;
105 lo = byte0;
106 }
107 bool isEOLChar = bool (yEOLTable[hi][lo / 8] & yBits[lo & 7]);
108 return isEOLChar;
109}
110
111// CONSIDER USING CodePageConverter class here!!!
112inline unsigned SJIS_To_Kuten_Row (const char mbChar[2])
113{
114 // See alec@lec.com's 950111 email message/SPEC
115 unsigned char c1 = mbChar[0];
116 unsigned char c2 = mbChar[1];
117 int adjust = (c2 < 159 ? 1 : 0);
118 int rowOffset = (c1 < 160 ? 112 : 176);
119 int result = ((c1 - rowOffset) << 1) - adjust - 32;
120 Assert (result >= 0);
121 return (result);
122}
123inline bool IsJapaneseBOLChar (wchar_t c)
124{
125 char mbyteChars[2];
126 size_t nBytesInThisChar = 2;
127 char16_t useC = static_cast<char16_t> (c); // this code was originally written for wchar_t == char16_t, so that explains unfortunate casts for now
128 nBytesInThisChar = CodeCvt<char16_t>{Characters::WellKnownCodePages::kSJIS}
129 .Characters2Bytes (span{&useC, 1}, Memory::SpanBytesCast<span<byte>> (span{mbyteChars}))
130 .size ();
131 Assert (nBytesInThisChar >= 0 and nBytesInThisChar <= 2);
132 if (nBytesInThisChar == 0) {
133 return 0; // if No SJIS code page, not much we can do!
134 }
135 return SJIS_IsBOLChar (mbyteChars);
136}
137inline bool IsJapaneseEOLChar (wchar_t c)
138{
139 char mbyteChars[2];
140 size_t nBytesInThisChar = 2;
141 char16_t useC = static_cast<char16_t> (c); // this code was originally written for wchar_t == char16_t, so that explains unfortunate casts for now
142 nBytesInThisChar = CodeCvt<char16_t>{Characters::WellKnownCodePages::kSJIS}
143 .Characters2Bytes (span{&useC, 1}, Memory::SpanBytesCast<span<byte>> (span{mbyteChars}))
144 .size ();
145 Assert (nBytesInThisChar >= 0 and nBytesInThisChar <= 2);
146 if (nBytesInThisChar == 0) {
147 return 0; // if No SJIS code page, not much we can do!
148 }
149 return SJIS_IsEOLChar (mbyteChars);
150}
151inline unsigned GetJapaneseKutenRow (wchar_t c)
152{
153 char mbyteChars[2];
154 size_t nBytesInThisChar = 2;
155 char16_t useC = static_cast<char16_t> (c); // this code was originally written for wchar_t == char16_t, so that explains unfortunate casts for now
156 nBytesInThisChar = CodeCvt<char16_t>{Characters::WellKnownCodePages::kSJIS}
157 .Characters2Bytes (span{&useC, 1}, Memory::SpanBytesCast<span<byte>> (span{mbyteChars}))
158 .size ();
159 Assert (nBytesInThisChar >= 0 and nBytesInThisChar <= 2);
160 if (nBytesInThisChar == 0) {
161 return 0; // if No SJIS code page, not much we can do!
162 }
163 if (SJIS_IsLeadByte (mbyteChars[0])) {
164 return SJIS_To_Kuten_Row (mbyteChars);
165 }
166 else {
167 return 0;
168 }
169}
170
171/*
172 ********************************************************************************
173 ******************************** TextBreaks_Basic ******************************
174 ********************************************************************************
175 */
176TextBreaks_Basic::TextBreaks_Basic ()
177{
178#if qStroika_Foundation_Debug_AssertionsChecked
179 // NB: since this is called in this CTOR - it doesn't capture (or pay attention to) subclass overrides of CharToCharacterClass
180 // That fact is important - since subclasses might change its result in a way to voilate this regression test. Thats fine - if its
181 // desired by the subclass. This is just a test to make sure logical changes we make to this code have intended
182 // consequences... LGP 2003-11-24
183 RegressionTest ();
184#endif
185}
186
187void TextBreaks_Basic::FindWordBreaks (const Led_tChar* startOfText, size_t lengthOfText, size_t textOffsetToStartLookingForWord,
188 size_t* wordStartResult, size_t* wordEndResult, bool* wordReal) const
189{
190 AssertNotNull (startOfText);
191 AssertNotNull (wordStartResult);
192 AssertNotNull (wordEndResult);
193 AssertNotNull (wordReal);
194 Assert (textOffsetToStartLookingForWord <= lengthOfText);
195
196 if (textOffsetToStartLookingForWord == lengthOfText) {
197 *wordStartResult = textOffsetToStartLookingForWord;
198 *wordEndResult = textOffsetToStartLookingForWord;
199 *wordReal = false;
200 return;
201 }
202 /*
203 * First figure out the class of characters we are looking at. Then - scan backwards
204 * looking for the same class of characters. Then scan forwards (from our starting point)
205 * looking for the same class of characters. Return a signal if the class of characters is
206 * space or not (treat all other word-classes as the same for this purpose).
207 * Except that there is a special sentinel class which always breaks anything.
208 */
209 Assert (textOffsetToStartLookingForWord < lengthOfText); // cuz we checked at start - and returned if equal
210
211 CharacterClasses charClass = CharToCharacterClass (startOfText, lengthOfText, &startOfText[textOffsetToStartLookingForWord]);
212
213 if (charClass == eSentinelClass) {
214 *wordStartResult = textOffsetToStartLookingForWord;
215 *wordEndResult = textOffsetToStartLookingForWord + 1;
216 *wordReal = true;
217 return;
218 }
219
220 // Scan back - while character of the same class
221 {
222 const Led_tChar* cur = &startOfText[textOffsetToStartLookingForWord];
223 if (charClass != eOtherCharacterClass and textOffsetToStartLookingForWord != 0) {
224 for (const Led_tChar* maybeCur = Led_PreviousChar (startOfText, cur);; maybeCur = Led_PreviousChar (startOfText, maybeCur)) {
225 if (*maybeCur == '\n') { //SPR#0354 - don't cross lines
226 break;
227 }
228 CharacterClasses curCharClass = CharToCharacterClass (startOfText, lengthOfText, maybeCur);
229 if (charClass == curCharClass) {
230 cur = maybeCur;
231 }
232 else {
233 break;
234 }
235
236 if (maybeCur == startOfText) {
237 break;
238 }
239 }
240 }
241 *wordStartResult = cur - startOfText;
242 }
243 // Scan forward - while character of the same class
244 {
245 const Led_tChar* cur = &startOfText[textOffsetToStartLookingForWord];
246 for (; cur < &startOfText[lengthOfText]; cur = Led_NextChar (cur)) {
247 if (*cur == '\n') { //SPR#0354
248 break;
249 }
250 CharacterClasses curCharClass = CharToCharacterClass (startOfText, lengthOfText, cur);
251 /*
252 * On a change of char-class break. Except for the case of the special class OTHER - in which case we break
253 * return a single char as result.
254 */
255 if (charClass != curCharClass or (cur != &startOfText[textOffsetToStartLookingForWord] and charClass == eOtherCharacterClass)) {
256 break;
257 }
258 }
259 *wordEndResult = cur - startOfText;
260 }
261 *wordReal = not(charClass == eSpaceClass) and (*wordStartResult != *wordEndResult);
262}
263
264void TextBreaks_Basic::FindLineBreaks (const Led_tChar* startOfText, size_t lengthOfText, size_t textOffsetToStartLookingForWord,
265 size_t* wordEndResult, bool* wordReal) const
266{
267 AssertNotNull (startOfText);
268 AssertNotNull (wordEndResult);
269 AssertNotNull (wordReal);
270 Assert (textOffsetToStartLookingForWord <= lengthOfText); // Cannot look at characters
271
272 if (textOffsetToStartLookingForWord == lengthOfText) {
273 *wordEndResult = textOffsetToStartLookingForWord;
274 *wordReal = false;
275 return;
276 }
277
278 Led_tChar thisChar = startOfText[textOffsetToStartLookingForWord];
279
280 bool isSpaceChar = IsASCIISpace (thisChar);
281 CharacterClasses startCharClass = CharToCharacterClass (startOfText, lengthOfText, &startOfText[textOffsetToStartLookingForWord]);
282
283 // Scan forward - while character of the same class
284 if (startCharClass == eSentinelClass) {
285 *wordEndResult = textOffsetToStartLookingForWord + 1;
286 Assert (not isspace (thisChar)); // else we need to cleanup the wordReal logic below...
287 }
288 else {
289 Led_tChar prevChar = thisChar; // for Kinsoku rule - need to keep track of previous character...
290 // But since we skip first char at start of loop, initialize with
291 // first char!
292
293 CharacterClasses prevCharWordClass = startCharClass;
294
295 const Led_tChar* end = &startOfText[lengthOfText];
296 const Led_tChar* cur = Led_NextChar (&startOfText[textOffsetToStartLookingForWord]);
297 for (; cur < end; cur = Led_NextChar (cur)) {
298 Led_tChar thisLoopCurChar = *cur;
299
300 CharacterClasses charClass = CharToCharacterClass (startOfText, lengthOfText, cur);
301 if (charClass == eSentinelClass) {
302 break;
303 }
304
305 /*
306 * On a change of char-class break (space-ness) - we return a possible row break.
307 */
308 bool curCharSpaceChar = IsASCIISpace (thisLoopCurChar);
309 if (isSpaceChar != curCharSpaceChar) {
310 break;
311 }
312 // FROM CHARLESVIEW EDITOR - (Basically) ALL I COPIED WAS THE COMMENT!
313 //
314 // Here is the Kinsoku rule:
315 // The following character combinations cannot be broken:
316 // an EOL character followed by any character.
317 // any character followed by an BOL character.
318 // any non-white space english characters.
319 if (not isSpaceChar) {
320 if ((charClass != eWordClass or prevCharWordClass != eWordClass or IsASCIISpace (thisLoopCurChar)) and
321 not IsJapaneseEOLChar (prevChar) and not IsJapaneseBOLChar (thisLoopCurChar)) {
322 break;
323 }
324 }
325 prevChar = thisLoopCurChar;
326 prevCharWordClass = charClass;
327 }
328 *wordEndResult = cur - startOfText;
329 }
330 *wordReal = (not(IsASCIISpace (thisChar))) and (textOffsetToStartLookingForWord != *wordEndResult);
331 Assert (*wordEndResult <= lengthOfText); // LGP added 950208 - in response to Alecs email message of same date - not
332 // sure this assert is right, but might help debugging later...
333}
334
335TextBreaks_Basic::CharacterClasses TextBreaks_Basic::CharToCharacterClass (const Led_tChar* startOfText, size_t lengthOfText,
336 const Led_tChar* charToExamine) const
337{
338 Led_tChar c = *charToExamine;
339
340 if (c == 0) {
341 return eSentinelClass;
342 }
343 /*
344 * Return appropriate class for all characters we are SURE of. Some deepend on
345 * context (like decimal point). For those - we return eOtherCharacterClass, and
346 * let the calling software check those special cases.
347 */
348 if (IsASCIISpace (c)) {
349 return (eSpaceClass);
350 }
351 if (IsASCIIAlnum (c)) {
352 return (eWordClass);
353 }
354 {
355 unsigned kutenRow = GetJapaneseKutenRow (c);
356 switch (kutenRow) {
357 case 3:
358 return (eRomanjiOrDigitClass);
359 case 4:
360 return (eHiraganaClass);
361 case 5:
362 return (eKatakanaClass);
363 default: {
364 if (kutenRow >= 16 and kutenRow <= 84) {
365 return (eKanjiClass);
366 }
367 }
368 }
369 }
370
371 switch (c) {
372 case '.': { // PERIOD before digits
373 if (charToExamine > startOfText and charToExamine < &startOfText[lengthOfText]) {
374 const Led_tChar* nextChar = charToExamine + 1; // cuz we KNOW we are single-byte...
375
376 if (IsASCIIDigit (*nextChar)) {
377 return (eWordClass);
378 }
379 }
380 } break;
381 }
382
383 if (not Character (c).IsPunctuation ()) {
384 return eWordClass;
385 }
386
387 return eOtherCharacterClass;
388}
389
#if qStroika_Foundation_Debug_AssertionsChecked
/*
 *  Sanity checks of FindWordBreaks () on a fixed sample sentence (assertion-checked builds only).
 */
void TextBreaks_Basic::RegressionTest ()
{
    const Led_tChar* const sample    = LED_TCHAR_OF ("This is a good test");
    const size_t           sampleLen = Led_tStrlen (sample);

    size_t wordBegin  = 0;
    size_t wordFinish = 0;
    bool   isRealWord = false;

    // Offset 1 is inside "This": the word must end at offset 4 and be real
    FindWordBreaks (sample, sampleLen, 1, &wordBegin, &wordFinish, &isRealWord);
    Assert (wordFinish == 4);
    Assert (isRealWord == true);

    // Offset 4 is the blank after "This": a one-character, non-real run ending at offset 5
    FindWordBreaks (sample, sampleLen, 4, &wordBegin, &wordFinish, &isRealWord);
    Assert (wordFinish == 5);
    Assert (isRealWord == false);
}
#endif
409
410/*
411 ********************************************************************************
412 ***************************** TextBreaks_Basic_WP ******************************
413 ********************************************************************************
414 */
415TextBreaks_Basic_WP::TextBreaks_Basic_WP ()
416{
417#if qStroika_Foundation_Debug_AssertionsChecked
418 // NB: since this is called in this CTOR - it doesn't capture (or pay attention to) subclass overrides of CharToCharacterClass
419 // That fact is important - since subclasses might change its result in a way to voilate this regression test. Thats fine - if its
420 // desired by the subclass. This is just a test to make sure logical changes we make to this code have intended
421 // consequences... LGP 2003-11-24
422 RegressionTest ();
423#endif
424}
425
426TextBreaks_Basic_WP::CharacterClasses TextBreaks_Basic_WP::CharToCharacterClass (const Led_tChar* startOfText, size_t lengthOfText,
427 const Led_tChar* charToExamine) const
428{
429 switch (*charToExamine) {
430 case ',': { // COMMA between digits
431 if (charToExamine > startOfText and charToExamine < &startOfText[lengthOfText]) {
432 const Led_tChar* prevChar = Led_PreviousChar (startOfText, charToExamine);
433 const Led_tChar* nextChar = charToExamine + 1; // cuz we KNOW we are single-byte...
434
435 if (IsASCIIDigit (*prevChar) and IsASCIIDigit (*nextChar)) {
436 return (eWordClass);
437 }
438 }
439 } break;
440
441 case 0x2019: // curly apostrophe
442 case '\'': {
443 // APOSTROPHE between digits or letters
444 if (charToExamine > startOfText and charToExamine < &startOfText[lengthOfText]) {
445 const Led_tChar* prevChar = Led_PreviousChar (startOfText, charToExamine);
446 const Led_tChar* nextChar = charToExamine + 1; // cuz we KNOW we are single-byte...
447
448 // E.g.: Fred's or Lewis', but not Jim'
449 if ((IsASCIIAlnum (*prevChar) and *nextChar == 's') or (*prevChar == 's' and IsASCIISpace (*nextChar))) {
450 return (eWordClass);
451 }
452 }
453 } break;
454 }
455
456 // Mimic what we did for MacOS (Inside-Mac : Text (Appendix A-6 thru A-15))
457 if (*charToExamine == kNonBreakingSpace or *charToExamine == kPoundSign or *charToExamine == kYenSign or *charToExamine == kCentSign) {
458 return eWordClass;
459 }
460 if (*charToExamine == '$' or *charToExamine == '%' or *charToExamine == '-') {
461 return (eWordClass);
462 }
463
464 return inherited::CharToCharacterClass (startOfText, lengthOfText, charToExamine);
465}
466
#if qStroika_Foundation_Debug_AssertionsChecked
/*
 *  Sanity checks of FindWordBreaks () on fixed sample sentences (assertion-checked builds only).
 */
void TextBreaks_Basic_WP::RegressionTest ()
{
    size_t wordBegin  = 0;
    size_t wordFinish = 0;
    bool   isRealWord = false;

    // Plain sentence
    {
        const Led_tChar* const sample    = LED_TCHAR_OF ("This is a good test");
        const size_t           sampleLen = Led_tStrlen (sample);

        // Offset 1 is inside "This": the word must end at offset 4 and be real
        FindWordBreaks (sample, sampleLen, 1, &wordBegin, &wordFinish, &isRealWord);
        Assert (wordFinish == 4);
        Assert (isRealWord == true);

        // Offset 4 is the blank after "This": a one-character, non-real run ending at offset 5
        FindWordBreaks (sample, sampleLen, 4, &wordBegin, &wordFinish, &isRealWord);
        Assert (wordFinish == 5);
        Assert (isRealWord == false);
    }

    // Possessive apostrophe: offset 25 is inside "Simone's", and the whole of "Simone's" (ending at 31) is the word
    {
        const Led_tChar* const sample    = LED_TCHAR_OF ("This is a good test of Simone's bug with the 'word'.");
        const size_t           sampleLen = Led_tStrlen (sample);

        wordBegin  = 0;
        wordFinish = 0;
        isRealWord = false;
        FindWordBreaks (sample, sampleLen, 25, &wordBegin, &wordFinish, &isRealWord);
        Assert (wordFinish == 31);
        Assert (isRealWord == true);
    }
}
#endif
497
498/*
499 ********************************************************************************
500 ************************* TextBreaks_Basic_TextEditor **************************
501 ********************************************************************************
502 */
503TextBreaks_Basic_TextEditor::TextBreaks_Basic_TextEditor ()
504{
505#if qStroika_Foundation_Debug_AssertionsChecked
506 // NB: since this is called in this CTOR - it doesn't capture (or pay attention to) subclass overrides of CharToCharacterClass
507 // That fact is important - since subclasses might change its result in a way to voilate this regression test. Thats fine - if its
508 // desired by the subclass. This is just a test to make sure logical changes we make to this code have intended
509 // consequences... LGP 2003-11-24
510 RegressionTest ();
511#endif
512}
513
514TextBreaks_Basic_TextEditor::CharacterClasses TextBreaks_Basic_TextEditor::CharToCharacterClass (const Led_tChar* startOfText, size_t lengthOfText,
515 const Led_tChar* charToExamine) const
516{
517 if (*charToExamine == '$' or *charToExamine == '%') {
518 return (eWordClass);
519 }
520 if (*charToExamine == '_') {
521 return (eWordClass); // SPR#1309 - I think this works a little better in text word selection
522 }
523 return inherited::CharToCharacterClass (startOfText, lengthOfText, charToExamine);
524}
525
#if qStroika_Foundation_Debug_AssertionsChecked
/*
 *  Sanity check of FindWordBreaks () on a fixed sample sentence (assertion-checked builds only).
 */
void TextBreaks_Basic_TextEditor::RegressionTest ()
{
    const Led_tChar* const sample    = LED_TCHAR_OF ("This is a good test of Simone's bug with the 'word'.");
    const size_t           sampleLen = Led_tStrlen (sample);

    size_t wordBegin  = 0;
    size_t wordFinish = 0;
    bool   isRealWord = false;

    // Offset 25 is inside "Simone's"; here the word is expected to stop at offset 29, i.e. at the apostrophe
    FindWordBreaks (sample, sampleLen, 25, &wordBegin, &wordFinish, &isRealWord);
    Assert (wordFinish == 29);
    Assert (isRealWord == true);
}
#endif
#define AssertNotNull(p)
Definition Assertions.h:333
CodeCvt unifies byte <-> unicode conversions, vaguely inspired by (and wraps) std::codecvt,...
Definition CodeCvt.h:118