4#include "Stroika/Frameworks/StroikaPreComp.h"
12#include "Stroika/Frameworks/Led/Config.h"
14#include "TextBreaks.h"
21using namespace Stroika::Frameworks;
22using namespace Stroika::Frameworks::Led;
25inline bool IsASCIISpace (Led_tChar c)
27 return isascii (c) and isspace (c);
29inline bool IsASCIIAlnum (Led_tChar c)
31 return isascii (c) and isalnum (c);
33inline bool IsASCIIAlpha (Led_tChar c)
35 return isascii (c) and isalpha (c);
37inline bool IsASCIIDigit (Led_tChar c)
39 return isascii (c) and isdigit (c);
/**
 *  Report whether c lies in one of the two byte ranges this file treats as the first byte of a
 *  two-byte SJIS character: 0x81..0x9f or 0xe0..0xfc (both ranges inclusive).
 *
 *  @param c    the byte to classify
 *  @return     true iff c is in either range
 */
static bool SJIS_IsLeadByte (unsigned char c)
{
    return ((c >= 0x81 and c <= 0x9f) or (c >= 0xe0 and c <= 0xfc));
}
// SJIS_IsBOLChar: tests one multibyte character (read from mbChar[0] and mbChar[1]) against the
// yBOLTable bitmap and stores the tested bit in isBOLChar.
// NOTE(review): "BOL" presumably refers to beginning-of-line handling for line breaking - confirm.
// NOTE(review): this listing has numeric prefixes fused to each line and is missing interior
// lines (braces, the assignments to hi and lo, the return statement), so only the visible
// statements are described here.
47static bool SJIS_IsBOLChar (
const char* mbChar)
// Both bytes are read unconditionally, so mbChar must have at least two readable bytes.
50 unsigned char byte0 = (
unsigned char)mbChar[0];
51 unsigned char byte1 = (
unsigned char)mbChar[1];
// yBits[i] is the mask with only bit i set.
53 static constexpr unsigned char yBits[8] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
// Four rows of 32 bytes (256 bits per row); the lookup below picks row hi and bit lo.
54 static constexpr unsigned char yBOLTable[4][32] = {
55 {0x0, 0x0, 0x0, 0x0, 0x2, 0x52, 0x0, 0xcc, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x20,
56 0x0, 0x0, 0x0, 0x0, 0x98, 0xff, 0x0, 0x0, 0x0, 0x0, 0x0, 0xc0, 0x0, 0x0, 0x0, 0x0},
57 {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xdf, 0x3, 0x3c, 0x1, 0x40, 0x55, 0x55, 0x5,
58 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
59 {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
60 0x0, 0x0, 0x0, 0x80, 0xa2, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2a, 0x10, 0x0, 0x0},
61 {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x55, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
62 0xa8, 0x40, 0x60, 0x0, 0x8, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}};
// Two-byte case: the inner condition is true when byte0 lies outside 0x81..0x83.
// NOTE(review): what happens in each branch is not visible; presumably lead bytes outside
// 0x81..0x83 are rejected because the table has only four rows - confirm.
65 if (SJIS_IsLeadByte (byte0)) {
66 if (byte0 < (
unsigned char)0x81 || byte0 > (
unsigned char)0x83)
// Select byte lo / 8 of row hi, then test bit lo & 7 within that byte.
78 bool isBOLChar = (yBOLTable[hi][lo / 8] & yBits[lo & 7]);
// SJIS_IsEOLChar: tests one multibyte character (read from mbChar[0] and mbChar[1]) against the
// yEOLTable bitmap and stores the tested bit in isEOLChar.
// NOTE(review): "EOL" presumably refers to end-of-line handling for line breaking - confirm.
// NOTE(review): this listing has numeric prefixes fused to each line and is missing interior
// lines (braces, the assignments to hi and lo, the return statement), so only the visible
// statements are described here.
81static bool SJIS_IsEOLChar (
const char* mbChar)
// Both bytes are read unconditionally, so mbChar must have at least two readable bytes.
84 unsigned char byte0 = (
unsigned char)mbChar[0];
85 unsigned char byte1 = (
unsigned char)mbChar[1];
// yBits[i] is the mask with only bit i set.
// NOTE(review): unlike the tables in SJIS_IsBOLChar, these two are not declared constexpr/const.
87 static unsigned char yBits[8] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
// Two rows of 32 bytes (256 bits per row); the lookup below picks row hi and bit lo.
88 static unsigned char yEOLTable[2][32] = {{0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x10, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0, 0x0, 0x8,
89 0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
90 {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xa0, 0xaa, 0xaa, 0x2,
91 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}};
// Two-byte case; the body that derives hi and lo from byte0/byte1 is not visible here.
94 if (SJIS_IsLeadByte (byte0)) {
// Select byte lo / 8 of row hi, then test bit lo & 7 within that byte.
107 bool isEOLChar = bool (yEOLTable[hi][lo / 8] & yBits[lo & 7]);
// SJIS_To_Kuten_Row: computes a row number from the two bytes of a multibyte character.
// NOTE(review): presumably this is the JIS "kuten" row of a two-byte SJIS character - confirm.
// NOTE(review): the return statement is not visible in this listing; presumably result is returned.
112inline unsigned SJIS_To_Kuten_Row (
const char mbChar[2])
// The bytes are reinterpreted as unsigned so the comparisons below see values 0..255.
115 unsigned char c1 = mbChar[0];
116 unsigned char c2 = mbChar[1];
// adjust is 1 when the second byte is below 159, otherwise 0.
117 int adjust = (c2 < 159 ? 1 : 0);
// rowOffset is 112 when the first byte is below 160, otherwise 176.
118 int rowOffset = (c1 < 160 ? 112 : 176);
// Each first-byte value covers two rows: doubled via << 1, then reduced by adjust and by 32.
119 int result = ((c1 - rowOffset) << 1) - adjust - 32;
// The computed row is expected to be non-negative.
120 Assert (result >= 0);
// IsJapaneseBOLChar: converts the wide character c to bytes stored in mbyteChars and returns
// SJIS_IsBOLChar of those bytes.
// NOTE(review): the declaration of mbyteChars, the converter object on which Characters2Bytes is
// invoked, how its result updates nBytesInThisChar, and the body of the nBytesInThisChar == 0
// branch are not visible in this listing.
123inline bool IsJapaneseBOLChar (
wchar_t c)
126 size_t nBytesInThisChar = 2;
// c is cast to a single char16_t before conversion.
127 char16_t useC =
static_cast<char16_t> (c);
// Converts the one-element span holding useC into the byte view of mbyteChars.
129 .Characters2Bytes (span{&useC, 1}, Memory::SpanBytesCast<span<byte>> (span{mbyteChars}))
// nBytesInThisChar is a size_t, so the >= 0 half of this check always holds.
131 Assert (nBytesInThisChar >= 0 and nBytesInThisChar <= 2);
// Zero bytes: presumably the character could not be converted - confirm.
132 if (nBytesInThisChar == 0) {
135 return SJIS_IsBOLChar (mbyteChars);
// IsJapaneseEOLChar: converts the wide character c to bytes stored in mbyteChars and returns
// SJIS_IsEOLChar of those bytes.
// NOTE(review): the declaration of mbyteChars, the converter object on which Characters2Bytes is
// invoked, how its result updates nBytesInThisChar, and the body of the nBytesInThisChar == 0
// branch are not visible in this listing.
137inline bool IsJapaneseEOLChar (
wchar_t c)
140 size_t nBytesInThisChar = 2;
// c is cast to a single char16_t before conversion.
141 char16_t useC =
static_cast<char16_t> (c);
// Converts the one-element span holding useC into the byte view of mbyteChars.
143 .Characters2Bytes (span{&useC, 1}, Memory::SpanBytesCast<span<byte>> (span{mbyteChars}))
// nBytesInThisChar is a size_t, so the >= 0 half of this check always holds.
145 Assert (nBytesInThisChar >= 0 and nBytesInThisChar <= 2);
// Zero bytes: presumably the character could not be converted - confirm.
146 if (nBytesInThisChar == 0) {
149 return SJIS_IsEOLChar (mbyteChars);
// GetJapaneseKutenRow: converts the wide character c to bytes stored in mbyteChars and, when the
// first byte is an SJIS lead byte, returns SJIS_To_Kuten_Row of those bytes.
// NOTE(review): the declaration of mbyteChars, the converter object on which Characters2Bytes is
// invoked, the body of the nBytesInThisChar == 0 branch, and the value returned when the first
// byte is not a lead byte are not visible in this listing.
151inline unsigned GetJapaneseKutenRow (
wchar_t c)
154 size_t nBytesInThisChar = 2;
// c is cast to a single char16_t before conversion.
155 char16_t useC =
static_cast<char16_t> (c);
// Converts the one-element span holding useC into the byte view of mbyteChars.
157 .Characters2Bytes (span{&useC, 1}, Memory::SpanBytesCast<span<byte>> (span{mbyteChars}))
// nBytesInThisChar is a size_t, so the >= 0 half of this check always holds.
159 Assert (nBytesInThisChar >= 0 and nBytesInThisChar <= 2);
// Zero bytes: presumably the character could not be converted - confirm.
160 if (nBytesInThisChar == 0) {
// Only two-byte characters (those starting with a lead byte) are passed to SJIS_To_Kuten_Row.
163 if (SJIS_IsLeadByte (mbyteChars[0])) {
164 return SJIS_To_Kuten_Row (mbyteChars);
// Default constructor. The visible part of the body opens a section that is compiled only when
// qStroika_Foundation_Debug_AssertionsChecked is nonzero; the section's contents are not visible
// in this listing.
176TextBreaks_Basic::TextBreaks_Basic ()
178#if qStroika_Foundation_Debug_AssertionsChecked
// FindWordBreaks: locates the word containing the character at textOffsetToStartLookingForWord.
//   startOfText / lengthOfText        - the text buffer and its length in Led_tChar units
//   textOffsetToStartLookingForWord   - offset to start from; must be <= lengthOfText
//   wordStartResult / wordEndResult   - receive the offsets of the start and end of the word
//   wordReal                          - receives whether the range is a non-empty, non-space word
// NOTE(review): this listing is missing interior lines (braces, branch bodies, returns/breaks),
// so the comments below describe only the visible statements.
187void TextBreaks_Basic::FindWordBreaks (
const Led_tChar* startOfText,
size_t lengthOfText,
size_t textOffsetToStartLookingForWord,
188 size_t* wordStartResult,
size_t* wordEndResult,
bool* wordReal)
const
194 Assert (textOffsetToStartLookingForWord <= lengthOfText);
// Starting at the very end of the text: both results are set to that offset (an empty range).
196 if (textOffsetToStartLookingForWord == lengthOfText) {
197 *wordStartResult = textOffsetToStartLookingForWord;
198 *wordEndResult = textOffsetToStartLookingForWord;
209 Assert (textOffsetToStartLookingForWord < lengthOfText);
// Class of the starting character; the scans below compare other characters against it.
211 CharacterClasses charClass = CharToCharacterClass (startOfText, lengthOfText, &startOfText[textOffsetToStartLookingForWord]);
// A sentinel character forms a one-character range by itself.
213 if (charClass == eSentinelClass) {
214 *wordStartResult = textOffsetToStartLookingForWord;
215 *wordEndResult = textOffsetToStartLookingForWord + 1;
// Backward scan for the word start. It is skipped for eOtherCharacterClass and at offset 0.
222 const Led_tChar* cur = &startOfText[textOffsetToStartLookingForWord];
223 if (charClass != eOtherCharacterClass and textOffsetToStartLookingForWord != 0) {
// The for statement has no loop condition; the loop must be left from inside its body.
224 for (
const Led_tChar* maybeCur = Led_PreviousChar (startOfText, cur);; maybeCur = Led_PreviousChar (startOfText, maybeCur)) {
// NOTE(review): presumably a newline ends the backward scan - the branch body is not visible.
225 if (*maybeCur ==
'\n') {
228 CharacterClasses curCharClass = CharToCharacterClass (startOfText, lengthOfText, maybeCur);
// NOTE(review): presumably cur advances to maybeCur while the class matches - body not visible.
229 if (charClass == curCharClass) {
// Reached the first character of the buffer; there is nothing earlier to examine.
236 if (maybeCur == startOfText) {
241 *wordStartResult = cur - startOfText;
// Forward scan for the word end, beginning at the starting character.
245 const Led_tChar* cur = &startOfText[textOffsetToStartLookingForWord];
246 for (; cur < &startOfText[lengthOfText]; cur = Led_NextChar (cur)) {
250 CharacterClasses curCharClass = CharToCharacterClass (startOfText, lengthOfText, cur);
// True on a class change, or on any character after the first when the starting class is
// eOtherCharacterClass. NOTE(review): presumably this ends the scan - body not visible.
255 if (charClass != curCharClass or (cur != &startOfText[textOffsetToStartLookingForWord] and charClass == eOtherCharacterClass)) {
259 *wordEndResult = cur - startOfText;
// The range is a real word when it did not start on a space-class character and is non-empty.
261 *wordReal = not(charClass == eSpaceClass) and (*wordStartResult != *wordEndResult);
// FindLineBreaks: scans forward from textOffsetToStartLookingForWord for the end of the run of
// text that begins there.
//   startOfText / lengthOfText        - the text buffer and its length in Led_tChar units
//   textOffsetToStartLookingForWord   - offset to start from; must be <= lengthOfText
//   wordEndResult                     - receives the offset at which the scan stopped
//   wordReal                          - receives whether the run is non-empty and did not start
//                                       on an ASCII space
// NOTE(review): this listing is missing interior lines (braces, branch bodies, returns/breaks),
// so the comments below describe only the visible statements.
264void TextBreaks_Basic::FindLineBreaks (
const Led_tChar* startOfText,
size_t lengthOfText,
size_t textOffsetToStartLookingForWord,
265 size_t* wordEndResult,
bool* wordReal)
const
270 Assert (textOffsetToStartLookingForWord <= lengthOfText);
// Starting at the very end of the text: the result is that same offset.
272 if (textOffsetToStartLookingForWord == lengthOfText) {
273 *wordEndResult = textOffsetToStartLookingForWord;
278 Led_tChar thisChar = startOfText[textOffsetToStartLookingForWord];
// Whether the run starts on an ASCII space; the loop compares later characters against this.
280 bool isSpaceChar = IsASCIISpace (thisChar);
281 CharacterClasses startCharClass = CharToCharacterClass (startOfText, lengthOfText, &startOfText[textOffsetToStartLookingForWord]);
// A sentinel character forms a one-character run by itself.
284 if (startCharClass == eSentinelClass) {
285 *wordEndResult = textOffsetToStartLookingForWord + 1;
286 Assert (not isspace (thisChar));
// prevChar / prevCharWordClass track the character preceding cur and its class.
289 Led_tChar prevChar = thisChar;
293 CharacterClasses prevCharWordClass = startCharClass;
295 const Led_tChar* end = &startOfText[lengthOfText];
// The scan begins at the character after the starting one.
296 const Led_tChar* cur = Led_NextChar (&startOfText[textOffsetToStartLookingForWord]);
297 for (; cur < end; cur = Led_NextChar (cur)) {
298 Led_tChar thisLoopCurChar = *cur;
300 CharacterClasses charClass = CharToCharacterClass (startOfText, lengthOfText, cur);
// NOTE(review): presumably a sentinel ends the scan - the branch body is not visible.
301 if (charClass == eSentinelClass) {
// True at a transition between ASCII space and non-space relative to the starting character.
// NOTE(review): presumably this ends the scan - the branch body is not visible.
308 bool curCharSpaceChar = IsASCIISpace (thisLoopCurChar);
309 if (isSpaceChar != curCharSpaceChar) {
// Within a non-space run: the condition holds when the previous and current characters are not
// both eWordClass (or the current one is an ASCII space), and additionally the previous
// character is not a Japanese EOL character and the current one is not a Japanese BOL character.
// NOTE(review): presumably this marks a permitted break position - the body is not visible.
319 if (not isSpaceChar) {
320 if ((charClass != eWordClass or prevCharWordClass != eWordClass or IsASCIISpace (thisLoopCurChar)) and
321 not IsJapaneseEOLChar (prevChar) and not IsJapaneseBOLChar (thisLoopCurChar)) {
325 prevChar = thisLoopCurChar;
326 prevCharWordClass = charClass;
328 *wordEndResult = cur - startOfText;
// The run is real when it did not start on an ASCII space and is non-empty.
330 *wordReal = (not(IsASCIISpace (thisChar))) and (textOffsetToStartLookingForWord != *wordEndResult);
331 Assert (*wordEndResult <= lengthOfText);
// CharToCharacterClass: classifies the character at charToExamine into one of the
// CharacterClasses values, falling back to eOtherCharacterClass.
//   startOfText / lengthOfText - bounds of the buffer containing charToExamine
//   charToExamine              - pointer to the character to classify
// NOTE(review): most of the conditions guarding the return statements below are missing from this
// listing, so the comments describe only what is visible.
335TextBreaks_Basic::CharacterClasses TextBreaks_Basic::CharToCharacterClass (
const Led_tChar* startOfText,
size_t lengthOfText,
336 const Led_tChar* charToExamine)
const
338 Led_tChar c = *charToExamine;
// NOTE(review): the test that selects the sentinel class is not visible.
341 return eSentinelClass;
348 if (IsASCIISpace (c)) {
349 return (eSpaceClass);
// NOTE(review): the class returned for ASCII alphanumerics is not visible.
351 if (IsASCIIAlnum (c)) {
// Japanese text is classified by its kuten row; only the kanji row range (16..84) is visible.
355 unsigned kutenRow = GetJapaneseKutenRow (c);
358 return (eRomanjiOrDigitClass);
360 return (eHiraganaClass);
362 return (eKatakanaClass);
364 if (kutenRow >= 16 and kutenRow <= 84) {
365 return (eKanjiClass);
// Looks at the following character when charToExamine is strictly inside the buffer.
// NOTE(review): the guard admits the last character of the buffer, in which case nextChar equals
// &startOfText[lengthOfText] and *nextChar reads one element past lengthOfText - confirm that the
// buffer is always terminated or that a missing line narrows this.
373 if (charToExamine > startOfText and charToExamine < &startOfText[lengthOfText]) {
374 const Led_tChar* nextChar = charToExamine + 1;
376 if (IsASCIIDigit (*nextChar)) {
// NOTE(review): the result for non-punctuation characters is not visible.
383 if (not
Character (c).IsPunctuation ()) {
387 return eOtherCharacterClass;
// RegressionTest: compiled only when qStroika_Foundation_Debug_AssertionsChecked is nonzero.
// Runs FindWordBreaks on a fixed sentence and asserts the expected word end and wordReal values.
// NOTE(review): the declaration of wordReal and the closing #endif are not visible in this listing.
390#if qStroika_Foundation_Debug_AssertionsChecked
391void TextBreaks_Basic::RegressionTest ()
394 const Led_tChar* kTest = LED_TCHAR_OF (
"This is a good test");
395 size_t wordStartResult = 0;
396 size_t wordEndResult = 0;
// Offset 1 is inside "This": the word must end at offset 4 and be a real word.
399 FindWordBreaks (kTest, Led_tStrlen (kTest), 1, &wordStartResult, &wordEndResult, &wordReal);
400 Assert (wordEndResult == 4);
401 Assert (wordReal ==
true);
// Offset 4 is the space after "This": the range must end at offset 5 and not be a real word.
403 FindWordBreaks (kTest, Led_tStrlen (kTest), 4, &wordStartResult, &wordEndResult, &wordReal);
404 Assert (wordEndResult == 5);
405 Assert (wordReal ==
false);
// Default constructor. The visible part of the body opens a section that is compiled only when
// qStroika_Foundation_Debug_AssertionsChecked is nonzero; the section's contents are not visible
// in this listing.
415TextBreaks_Basic_WP::TextBreaks_Basic_WP ()
417#if qStroika_Foundation_Debug_AssertionsChecked
// CharToCharacterClass: applies extra, context-sensitive classification rules to certain
// characters and defers to inherited::CharToCharacterClass for everything else.
//   startOfText / lengthOfText - bounds of the buffer containing charToExamine
//   charToExamine              - pointer to the character to classify
// NOTE(review): the case labels of the switch and the return statements inside each branch are
// missing from this listing, so the comments describe only the visible conditions.
426TextBreaks_Basic_WP::CharacterClasses TextBreaks_Basic_WP::CharToCharacterClass (
const Led_tChar* startOfText,
size_t lengthOfText,
427 const Led_tChar* charToExamine)
const
429 switch (*charToExamine) {
// First visible case: holds when the character sits between two ASCII digits.
// NOTE(review): the guard admits the last character of the buffer, in which case nextChar equals
// &startOfText[lengthOfText] and *nextChar reads one element past lengthOfText - confirm that the
// buffer is always terminated or that a missing line narrows this.
431 if (charToExamine > startOfText and charToExamine < &startOfText[lengthOfText]) {
432 const Led_tChar* prevChar = Led_PreviousChar (startOfText, charToExamine);
433 const Led_tChar* nextChar = charToExamine + 1;
435 if (IsASCIIDigit (*prevChar) and IsASCIIDigit (*nextChar)) {
// Second visible case: holds when the character follows an ASCII alphanumeric and precedes 's',
// or follows 's' and precedes an ASCII space.
// NOTE(review): presumably this is the apostrophe rule for possessives - the case label is not
// visible. The same one-past-the-end concern applies to *nextChar here.
444 if (charToExamine > startOfText and charToExamine < &startOfText[lengthOfText]) {
445 const Led_tChar* prevChar = Led_PreviousChar (startOfText, charToExamine);
446 const Led_tChar* nextChar = charToExamine + 1;
449 if ((IsASCIIAlnum (*prevChar) and *nextChar ==
's') or (*prevChar ==
's' and IsASCIISpace (*nextChar))) {
// Non-breaking space and the pound, yen and cent signs get special treatment (result not visible).
457 if (*charToExamine == kNonBreakingSpace or *charToExamine == kPoundSign or *charToExamine == kYenSign or *charToExamine == kCentSign) {
// '$', '%' and '-' get special treatment (result not visible).
460 if (*charToExamine ==
'$' or *charToExamine ==
'%' or *charToExamine ==
'-') {
// Everything else uses the base-class classification.
464 return inherited::CharToCharacterClass (startOfText, lengthOfText, charToExamine);
// RegressionTest: compiled only when qStroika_Foundation_Debug_AssertionsChecked is nonzero.
// Runs FindWordBreaks on two fixed sentences and asserts the expected word end and wordReal values.
// NOTE(review): the wordReal declarations, the braces separating the two test scopes, and the
// closing #endif are not visible in this listing.
467#if qStroika_Foundation_Debug_AssertionsChecked
468void TextBreaks_Basic_WP::RegressionTest ()
471 const Led_tChar* kTest = LED_TCHAR_OF (
"This is a good test");
472 size_t wordStartResult = 0;
473 size_t wordEndResult = 0;
// Offset 1 is inside "This": the word must end at offset 4 and be a real word.
476 FindWordBreaks (kTest, Led_tStrlen (kTest), 1, &wordStartResult, &wordEndResult, &wordReal);
477 Assert (wordEndResult == 4);
478 Assert (wordReal ==
true);
// Offset 4 is the space after "This": the range must end at offset 5 and not be a real word.
480 FindWordBreaks (kTest, Led_tStrlen (kTest), 4, &wordStartResult, &wordEndResult, &wordReal);
481 Assert (wordEndResult == 5);
482 Assert (wordReal ==
false);
// Second sentence: offset 25 is inside "Simone's"; the word must end at offset 31, i.e. the
// apostrophe and trailing 's' are part of the word for this class.
486 const Led_tChar* kTest = LED_TCHAR_OF (
"This is a good test of Simone's bug with the 'word'.");
487 size_t wordStartResult = 0;
488 size_t wordEndResult = 0;
491 FindWordBreaks (kTest, Led_tStrlen (kTest), 25, &wordStartResult, &wordEndResult, &wordReal);
492 Assert (wordEndResult == 31);
493 Assert (wordReal ==
true);
// Default constructor. The visible part of the body opens a section that is compiled only when
// qStroika_Foundation_Debug_AssertionsChecked is nonzero; the section's contents are not visible
// in this listing.
503TextBreaks_Basic_TextEditor::TextBreaks_Basic_TextEditor ()
505#if qStroika_Foundation_Debug_AssertionsChecked
// CharToCharacterClass: gives '$', '%' and '_' special classifications and defers to
// inherited::CharToCharacterClass for every other character.
//   startOfText / lengthOfText - bounds of the buffer containing charToExamine
//   charToExamine              - pointer to the character to classify
// NOTE(review): the return statements inside the two if-bodies are not visible in this listing,
// so the classes assigned to these characters cannot be stated here.
514TextBreaks_Basic_TextEditor::CharacterClasses TextBreaks_Basic_TextEditor::CharToCharacterClass (
const Led_tChar* startOfText,
size_t lengthOfText,
515 const Led_tChar* charToExamine)
const
517 if (*charToExamine ==
'$' or *charToExamine ==
'%') {
520 if (*charToExamine ==
'_') {
// Everything else uses the base-class classification.
523 return inherited::CharToCharacterClass (startOfText, lengthOfText, charToExamine);
// RegressionTest: compiled only when qStroika_Foundation_Debug_AssertionsChecked is nonzero.
// Runs FindWordBreaks on a fixed sentence and asserts the expected word end and wordReal values.
// NOTE(review): the declaration of wordReal and the closing #endif are not visible in this listing.
526#if qStroika_Foundation_Debug_AssertionsChecked
527void TextBreaks_Basic_TextEditor::RegressionTest ()
530 const Led_tChar* kTest = LED_TCHAR_OF (
"This is a good test of Simone's bug with the 'word'.");
531 size_t wordStartResult = 0;
532 size_t wordEndResult = 0;
// Offset 25 is inside "Simone's"; for this class the word must end at offset 29, i.e. before the
// apostrophe.
535 FindWordBreaks (kTest, Led_tStrlen (kTest), 25, &wordStartResult, &wordEndResult, &wordReal);
536 Assert (wordEndResult == 29);
537 Assert (wordReal ==
true);
CodeCvt unifies byte <-> Unicode conversions; it is vaguely inspired by (and wraps) std::codecvt, ...