Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
SpellCheckEngine_Basic.cpp
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#include "Stroika/Frameworks/StroikaPreComp.h"
5
6#include <cctype>
7#include <cmath>
8
16#include "Stroika/Foundation/Streams/TextToBinary.h"
17
18#include "SpellCheckEngine_Basic.h"
19
20using std::byte;
21
22using namespace Stroika::Foundation;
24
25using namespace Stroika::Frameworks;
26using namespace Stroika::Frameworks::Led;
27
29
30namespace {
31
32#if qIncludeBakedInDictionaries
33#if qStroika_Foundation_Common_Platform_MacOS
34// Short filenames on MacOS
35#include "Dictionary-Compiled-US-English."
36#else
37#include "Dictionary-Compiled-US-English.inc"
38#endif
39#endif
40
41 inline bool IsASCIIUpper (Led_tChar c)
42 {
43 return isascii (c) and isupper (c);
44 }
45
46 inline bool AsymmetricCaseInsensativeCompare (Led_tChar wordChar, Led_tChar dictChar)
47 {
48 if (wordChar == dictChar) {
49 return true;
50 }
51 if (isascii (wordChar) and isupper (wordChar)) {
52 return tolower (wordChar) == dictChar;
53 }
54 return false;
55 }
56 inline bool AsymmetricCaseInsensativeCompare (const Led_tChar* word, const Led_tChar* dictWord)
57 {
58 const Led_tChar* wi = word;
59 const Led_tChar* di = dictWord;
60 for (;; ++wi, ++di) {
61 if (not AsymmetricCaseInsensativeCompare (*wi, *di)) {
62 return false;
63 }
64 if (*wi == '\0' and *di == '\0') {
65 return true;
66 }
67 if (*wi == '\0' or *di == '\0') {
68 return false;
69 }
70 }
71 }
72
73 struct DictLookup_Compare {
74 DictLookup_Compare (const Led_tChar* base)
75 : fBase (base)
76 {
77 }
78 bool operator() (const SpellCheckEngine_Basic::InfoBlock& _Left, const Led_tString& _Right) const
79 {
80// Speed tweeked compare... don't construct string object to safe time and possible heap fragmentation
81#if qBasicString_Missing_CompareOverload_T
82 bool answer = _Right.compare (fBase + _Left.fIndex, 0, _Left.fWordLen) > 0;
83#else
84 bool answer = _Right.compare (0, _Right.length (), fBase + _Left.fIndex, _Left.fWordLen) > 0;
85#endif
87 Led_tString left = Led_tString{fBase + _Left.fIndex, fBase + _Left.fIndex + _Left.fWordLen};
88 Assert (answer == (left < _Right));
89 }
90 return (answer);
91 }
92 const Led_tChar* fBase;
93 };
94}
95
96namespace {
97 inline bool IsASCIISpace (Led_tChar c)
98 {
99 return isascii (c) and isspace (c);
100 }
101 inline bool IsASCIIAlnum (Led_tChar c)
102 {
103 return isascii (c) and isalnum (c);
104 }
105}
106
107/*
108 ********************************************************************************
109 *************************** SpellCheckEngine_Basic *****************************
110 ********************************************************************************
111 */
// Definition of the static baked-in US English dictionary; only compiled when qIncludeBakedInDictionaries is set.
// NOTE(review): Dictionary_US_English is presumably supplied by the Dictionary-Compiled-US-English include
// near the top of this file - confirm.
112#if qIncludeBakedInDictionaries
113const SpellCheckEngine_Basic::CompiledDictionary SpellCheckEngine_Basic::kDictionary_US_English (Dictionary_US_English);
114#endif
115
116SpellCheckEngine_Basic::SpellCheckEngine_Basic (const Dictionary* mainDictionary)
117 : inherited ()
118 , fDictionaries ()
119{
120 if (mainDictionary != NULL) {
121 fDictionaries.push_back (mainDictionary);
122 }
123}
124
125SpellCheckEngine_Basic::~SpellCheckEngine_Basic ()
126{
127}
128
129/*
130@METHOD: SpellCheckEngine_Basic::ScanForUndefinedWord
131@DESCRIPTION: <p>Overrides @'SpellCheckEngine::ScanForUndefinedWord'.</p>
132*/
133bool SpellCheckEngine_Basic::ScanForUndefinedWord (const Led_tChar* startBuf, const Led_tChar* endBuf, const Led_tChar** cursor,
134 const Led_tChar** wordStartResult, const Led_tChar** wordEndResult)
135{
136 RequireNotNull (startBuf);
137 RequireNotNull (endBuf);
138 RequireNotNull (cursor);
139 RequireNotNull (wordStartResult);
140 RequireNotNull (wordEndResult);
141 Require (*cursor == NULL or (*cursor >= startBuf and *cursor <= endBuf));
142
143 if (*cursor == NULL) {
144 *cursor = startBuf;
145 }
146
147 // preliminary implementation - shouldn't set output vars (wordStartResult/wordEndResult) result UNLESS WE are returning true...
148 while (ScanForWord (startBuf, endBuf, cursor, wordStartResult, wordEndResult)) {
149 if (not LookupWord (Led_tString{*wordStartResult, *wordEndResult}) and not OtherStringToIgnore (Led_tString{*wordStartResult, *wordEndResult})) {
150 return true;
151 }
152 }
153 return false;
154}
155
156/*
157@METHOD: SpellCheckEngine_Basic::LookupWord_
158@ACCESS: protected
159@DESCRIPTION: <p>Override (implement) @'SpellCheckEngine::LookupWord_'.</p>
160*/
161bool SpellCheckEngine_Basic::LookupWord_ (const Led_tString& checkWord, Led_tString* matchedWordResult)
162{
163 Invariant ();
164
165 /*
166 * See if we find the word as-is, and after that, try again after a few 'rewriting' tricks
167 */
168 if (LookupWordHelper_ (checkWord, matchedWordResult)) {
169 return true;
170 }
171
172 if (checkWord.empty ()) {
173 return false;
174 }
175
176 {
177 /*
178 * Don't compare completely case-insensatively. However- if we lookup a word which
179 * is capitalized, allow it to match a word in the dictionary whcih is not (since the given
180 * word could be starting a sentence).
181 */
182 if (checkWord[0] < 127 and isupper (checkWord[0])) {
183 Led_tString w2 = checkWord;
184 w2[0] = static_cast<char> (tolower (w2[0]));
185 if (LookupWordHelper_ (w2, matchedWordResult)) {
186 return true;
187 }
188
189 /*
190 * If the word is ALL UPPERCASE - then just treat it as a Capitalized word lookup. That is - "UPPER" should match
191 * the dictionary word "Upper" or "upper", but not "uPper".
192 */
193 {
194 bool allUpper = true;
195 Led_tString caseFixedWord = checkWord;
196 for (auto i = caseFixedWord.begin (); i != caseFixedWord.end (); ++i) {
197 if (IsASCIIUpper (*i)) {
198 *i = static_cast<char> (tolower (*i));
199 }
200 else {
201 allUpper = false;
202 break;
203 }
204 }
205 if (allUpper and LookupWordHelper_ (caseFixedWord, matchedWordResult)) {
206 return true;
207 }
208 caseFixedWord[0] = checkWord[0];
209 if (allUpper and LookupWordHelper_ (caseFixedWord, matchedWordResult)) {
210 return true;
211 }
212 }
213 }
214 }
215
216 // Look for hyphenated words
217 {
218 /*
219 * In our dictionary, we allow for hypenated words to be present. But thats not common. Its only for cases
220 * where the hyphenated word isn't constructable by other means (for example - a hypenated foreign expression
221 * where the elemental words are not part of the base language).
222 *
223 * Next, you can put hyphens between any normal english words (even with the usual rewriting rules - like
224 * appending 's' to the base noun word).
225 *
226 * Then - for some special cases - some words maybe entered in the dictionary with a suffixing hypen - to
227 * indicate that they ONLY match when used as a hypenated prefix. For example - 'anti-' might be so-coded,
228 * if you didn't want to allow for the word 'anti' by itself, but just as a prefix as in 'anti-war'.
229 */
230 Led_tString genMWR;
231 size_t lastFoundAt = 0;
232 bool mustCheckLastSeg = false;
233 for (size_t i = checkWord.find ('-'); i != Led_tString::npos or mustCheckLastSeg; i = checkWord.find ('-', i + 1)) {
234 if (i == Led_tString::npos) {
235 Assert (mustCheckLastSeg);
236 mustCheckLastSeg = false;
237 i = checkWord.length ();
238 }
239 else {
240 mustCheckLastSeg = true; // if we find a dash - we must check afer it as well
241 }
242 Led_tString segWord = Led_tString{checkWord.substr (lastFoundAt, i - lastFoundAt)};
243 Led_tString tmpMWR;
244 if (segWord.empty ()) {
245 // if any segment is empty - thats not legit - so treat that as misspelled
246 return false;
247 }
248 else if (LookupWord_ (segWord, &tmpMWR)) {
249 if (lastFoundAt != 0) {
250 genMWR += LED_TCHAR_OF ("-");
251 }
252 genMWR += tmpMWR;
253 }
254 else if (i != checkWord.length () and LookupWordHelper_ (segWord + LED_TCHAR_OF ("-"), &tmpMWR)) {
255 if (lastFoundAt != 0) {
256 genMWR += LED_TCHAR_OF ("-");
257 }
258 Assert (tmpMWR.length () >= 2); // must include the dash - whcih we strip...
259 genMWR += tmpMWR.substr (0, tmpMWR.length () - 1);
260 }
261 else {
262 // if any segment cannot be found - then the hyphenated word as a whole cannot
263 return false;
264 }
265 lastFoundAt = i + 1;
266 }
267
268 // If all segments found - then return the generated (combined genMWR) matchedWordResult.
269 // We know there were hyphens found if genMWR not empty
270 if (not genMWR.empty ()) {
271 if (matchedWordResult != NULL) {
272 *matchedWordResult = genMWR;
273 }
274 return true;
275 }
276 }
277
278 {
279 const wchar_t kRightSingleQuotationMark = L'\x2019';
280 size_t apos = checkWord.find (kRightSingleQuotationMark);
281 if (apos != Led_tString::npos) {
282 Led_tString tmp = checkWord;
283 tmp[apos] = '\'';
284 return LookupWord_ (tmp, matchedWordResult);
285 }
286 }
287
288 // PROBABALY LOTS MORE LOGIC/RULES TO ADD HERE!!!
289 {
290 }
291
292 return false;
293}
294
295/*
296@METHOD: SpellCheckEngine_Basic::LookupWordHelper_
297@ACCESS: private
298@DESCRIPTION: <p>Lookup the given word (case-sensative match - using @'DictLookup_Compare'). Looks through
299 all loaded dictionaries. Performs no fancy (linguistic) mapping (like tricks with punctutation
300 stripping etc). Thats handled at a higher level.</p>
301*/
302bool SpellCheckEngine_Basic::LookupWordHelper_ (const Led_tString& checkWord, Led_tString* matchedWordResult) const
303{
304 Invariant ();
305
306 for (auto i = fDictionaries.begin (); i != fDictionaries.end (); ++i) {
307 const Dictionary* dict = *i;
308 AssertNotNull (dict);
309
310 const InfoBlock* ibsStart = NULL;
311 const InfoBlock* ibsEnd = NULL;
312 dict->GetInfoBlocks (&ibsStart, &ibsEnd);
313 const Led_tChar* dictBufStart = dict->GetTextBase ();
314
315 const InfoBlock* r = lower_bound (ibsStart, ibsEnd, checkWord, DictLookup_Compare (dictBufStart));
316 if (r != ibsEnd) {
317 Led_tString x = Led_tString{dictBufStart + (*r).fIndex, dictBufStart + (*r).fIndex + (*r).fWordLen};
318 if (x == checkWord) {
319 if (matchedWordResult != NULL) {
320 *matchedWordResult = x;
321 }
322 return true;
323 }
324 }
325 }
326 return false;
327}
328
329/*
330@METHOD: SpellCheckEngine_Basic::OtherStringToIgnore
331@ACCESS: protected
332@DESCRIPTION: <p>Check if the given string should be ignored as an undefined word. Look for special patterns that won't be
333 found in our dictionary (e.g. numbers, strings of punctuation, etc).</p>
334 <p>This is typically called from @'SpellCheckEngine_Basic::ScanForUndefinedWord'</p>
335*/
336bool SpellCheckEngine_Basic::OtherStringToIgnore (const Led_tString& checkWord)
337{
338 return OtherStringToIgnore_AllPunctuation (checkWord) or OtherStringToIgnore_Sentinels (checkWord) or OtherStringToIgnore_Number (checkWord);
339}
340
341bool SpellCheckEngine_Basic::OtherStringToIgnore_AllPunctuation (const Led_tString& checkWord)
342{
343 for (size_t i = 0; i < checkWord.length (); ++i) {
344 Led_tChar c = checkWord[i];
345 if (not Character (c).IsPunctuation ()) {
346 return false;
347 }
348 }
349 return true;
350}
351
352bool SpellCheckEngine_Basic::OtherStringToIgnore_Number (const Led_tString& checkWord)
353{
354 for (size_t i = 0; i < checkWord.length (); ++i) {
355 Led_tChar c = checkWord[i];
356 // at least for English - we could tighten up the IsPunct call to just c==',' or c=='.' Not sure enough about
357 // other languages, so don't bother about this. Anyhow - stuff that is all punctuation and/or digits
358 // should probably be ignored anyhow... LGP 2003-06-25
359 if (not(Character (c).IsPunctuation () or Character (c).IsDigit ())) {
360 return false;
361 }
362 }
363 return true;
364}
365
namespace {
    /**
     *  Local stand-in for std::copy_backward: copies [first, last) so that the copy ends at
     *  'result', proceeding from the last element to the first (safe for overlapping ranges when
     *  the destination lies after the source). Returns an iterator to the first element written.
     *
     *  Hand-rolled to avoid the MSFT _SCL_INSECURE_DEPRECATE warning, which could not be suppressed
     *  with a #pragma (and a #define would make this awkward to use as a library) --LGP 2014-03-23
     */
    template <class BidirectionalIterator1, class BidirectionalIterator2>
    BidirectionalIterator2 My_copy_backward_ (BidirectionalIterator1 first, BidirectionalIterator1 last, BidirectionalIterator2 result)
    {
        for (; first != last;) {
            --last;
            --result;
            *result = *last;
        }
        return result;
    }
}
378
379namespace {
380 void AddToListsHelper (const size_t kMaxSug, Led_tString topSugs[], float topSugScores[], float* scoreCutOff, float s, const Led_tString& w)
381 {
382 RequireNotNull (scoreCutOff);
383 if (s > *scoreCutOff) {
384 // find where it fits in the array, and then adjust array
385 for (size_t ii = 0; ii < kMaxSug; ++ii) {
386 if (s > topSugScores[ii]) {
387 My_copy_backward_ (topSugs + ii, topSugs + kMaxSug - 1, topSugs + kMaxSug);
388 My_copy_backward_ (topSugScores + ii, topSugScores + kMaxSug - 1, topSugScores + kMaxSug);
389 topSugs[ii] = w;
390 topSugScores[ii] = s;
391 break;
392 }
393 }
394 *scoreCutOff = topSugScores[kMaxSug - 1];
395 }
396 }
397}
398
399vector<Led_tString> SpellCheckEngine_Basic::GenerateSuggestions (const Led_tString& misspelledWord)
400{
401 const size_t kMaxSug = 8;
402 Led_tString topSugs[kMaxSug];
403 float topSugScores[kMaxSug];
404 float scoreCutOff = -100000.0f;
405
406 fill (topSugScores, topSugScores + kMaxSug, scoreCutOff);
407
408 //See if the word is a hyphenated combination of two words
409 if (misspelledWord.find ('-') == Led_tString::npos) {
410 size_t maxInitSegSize = misspelledWord.size () - 1;
411 for (size_t initialWordSize = 1; initialWordSize < maxInitSegSize; ++initialWordSize) {
412 if (LookupWord (misspelledWord.substr (0, initialWordSize)) and LookupWord (misspelledWord.substr (initialWordSize))) {
413 AddToListsHelper (kMaxSug, topSugs, topSugScores, &scoreCutOff, -1.2f,
414 misspelledWord.substr (0, initialWordSize) + LED_TCHAR_OF ("-") + misspelledWord.substr (initialWordSize));
415 }
416 }
417 }
418
419 // Iterate over each dictionary
420 for (auto i = fDictionaries.begin (); i != fDictionaries.end (); ++i) {
421 const Dictionary* dict = *i;
422 AssertNotNull (dict);
423 const InfoBlock* ibsStart = NULL;
424 const InfoBlock* ibsEnd = NULL;
425 dict->GetInfoBlocks (&ibsStart, &ibsEnd);
426 const Led_tChar* dictBufStart = dict->GetTextBase ();
427 // Look at each word in the dictionary and assign each a score (see if they would be a good suggestion)
428 for (const InfoBlock* ib = ibsStart; ib != ibsEnd; ++ib) {
429 Led_tString w = Led_tString{dictBufStart + (*ib).fIndex, dictBufStart + (*ib).fIndex + (*ib).fWordLen};
430 float s = Heuristic (misspelledWord, w, scoreCutOff);
431 AddToListsHelper (kMaxSug, topSugs, topSugScores, &scoreCutOff, s, w);
432 }
433 }
434
435 // Look at the CASE of the original word - compate with the top suggestion. Perhaps
436 // clone the original suggestion and right a slightly better one which is the same as the first
437 // but with case properties improved.
438 if (not misspelledWord.empty ()) {
439 Led_tString topSug = topSugs[0];
440 if (not topSug.empty ()) {
441 bool capitalize = false;
442 bool allCaps = false;
443 {
444 if (Character (misspelledWord[0]).IsAlphabetic () and Character (topSug[0]).IsAlphabetic ()) {
445 if (isupper (misspelledWord[0]) and not isupper (topSug[0])) {
446 capitalize = true;
447 }
448 }
449 for (size_t i = 0; i < misspelledWord.length (); ++i) {
450 if (Character (misspelledWord[i]).IsAlphabetic ()) {
451 if (isupper (misspelledWord[i])) {
452 allCaps = true;
453 }
454 else {
455 allCaps = false;
456 break; // if we find ANY letters alpha - but non-capitalized - then breakout and say NOT whole thing capitalized
457 }
458 }
459 }
460 }
461 if (allCaps) {
462 Led_tString newWord = topSug;
463 {
464 for (size_t i = 0; i < newWord.length (); ++i) {
465 if (Character (newWord[i]).IsAlphabetic ()) {
466 newWord[i] = static_cast<char> (toupper (newWord[i]));
467 }
468 }
469 }
470 float newScore = topSugScores[0];
471 newScore += 0.1f;
472 AddToListsHelper (kMaxSug, topSugs, topSugScores, &scoreCutOff, newScore, newWord);
473 }
474 else if (capitalize) {
475 Led_tString newWord = topSug;
476 {
477 if (Character (newWord[0]).IsAlphabetic ()) {
478 newWord[0] = static_cast<char> (toupper (newWord[0]));
479 }
480 }
481 float newScore = topSugScores[0];
482 newScore += 0.1f;
483 AddToListsHelper (kMaxSug, topSugs, topSugScores, &scoreCutOff, newScore, newWord);
484 }
485 }
486 }
487
488 // Look for a big gap in scores, and perhaps cut-off there
489 size_t endScoreList = kMaxSug;
490 {
491 const float kTrigger = 1.5f; // MUST BE RETUNED EACH TIME I CHANGE HUERISTIC FUNCTION!
492 float lastScore = 0.0f;
493 for (size_t i = 0; i < kMaxSug; ++i) {
494 if (i != 0) {
495 if (fabs (lastScore - topSugScores[i]) > kTrigger) {
496 endScoreList = i;
497 break;
498 }
499 }
500 lastScore = topSugScores[i];
501 }
502 }
503
504 vector<Led_tString> result;
505 for (size_t j = 0; j < endScoreList; ++j) {
506 if (topSugs[j].empty ()) {
507 break;
508 }
509 else {
510 result.push_back (topSugs[j]);
511 }
512 }
513 return result;
514}
515
516SpellCheckEngine_Basic::UDInterface* SpellCheckEngine_Basic::GetUDInterface ()
517{
518 return NULL;
519}
520
521TextBreaks* SpellCheckEngine_Basic::PeekAtTextBreaksUsed ()
522{
523 return GetTextBreaker ().get ();
524}
525
526float SpellCheckEngine_Basic::Heuristic (const Led_tString& misspelledWord, const Led_tString& candidateWord, float atLeast)
527{
528 // Totally quick and dirty hack impl...
529 float h = 0.0f;
530 size_t mwl = misspelledWord.length ();
531 size_t cwl = candidateWord.length ();
532 float thisCharImportance = 2.0f;
533 for (size_t i = 0; i < mwl; ++i) {
534 if (i >= cwl) {
535 h -= thisCharImportance * 5.0f;
536 }
537 else if (misspelledWord[i] != candidateWord[i]) {
538 float prevH = h; // saved H so we can see how much we adjusted for transposition bonus
539
540 // do case mapping - REDO using CodePage.h. code(IsAlpha) etc... add a ToLower(C)
541 if (AsymmetricCaseInsensativeCompare (misspelledWord[i], candidateWord[i])) {
542 h -= thisCharImportance * 0.1f;
543 }
544 else {
545 // see if the right character is just before or after (a transpose? or missing character?)
546 if (i > 0 and AsymmetricCaseInsensativeCompare (misspelledWord[i], candidateWord[i - 1])) {
547 h -= thisCharImportance * 0.5f;
548 }
549 else if (i + 1 < cwl and AsymmetricCaseInsensativeCompare (misspelledWord[i], candidateWord[i + 1])) {
550 h -= thisCharImportance * 0.5f;
551 }
552 else {
553 h -= thisCharImportance * 3.0f;
554 }
555 }
556
557 // See if the mismatch is a transposition (with the following character- no need to check PREV because
558 // already captured in previous iteration)
559 if (i < mwl and (misspelledWord[i] == candidateWord[i + 1]) and (misspelledWord[i + 1] == candidateWord[i])) {
560 float thisLetterCost = h - prevH;
561 h += (-thisLetterCost) * 1.2f; // unsure what right cost - must take care of THIS char plus next neg...
562 }
563 }
564
565 // speed tweek - cut-off heuristic calc if we're already below the cut-off
566 if (h < atLeast) {
567 return -10000.0f;
568 }
569
570 // letters at the beginning are more important than at the end
571 if (i == 0) {
572 thisCharImportance *= 0.5f;
573 }
574 else if (i < 3) {
575 thisCharImportance *= 0.8f;
576 }
577 else {
578 thisCharImportance *= 0.9f;
579 }
580 }
581 {
582 const float kPenaltyForLettersOffEnd = 0.5f;
583 const float kPenaltyForLettersOffEndGrowthFactor = 1.2f;
584 float curPenalty = kPenaltyForLettersOffEnd;
585 for (size_t i = mwl; i < cwl; ++i) {
586 // extra chars off the end have growing significance...
587 h -= curPenalty;
588 curPenalty *= kPenaltyForLettersOffEndGrowthFactor;
589 }
590 }
591 return h;
592}
593
594/*
595@METHOD: SpellCheckEngine_Basic::ScanForWord
596@ACCESS: private
597@DESCRIPTION: <p>Look in the given buffer - starting at 'cursor' - for the next word Set wordStartResult/
598 wordEndResult according to what is found and return true if we find a word, and return false otherwise.
599 In either case - set 'cursor' on output to reflect how far we scanned ahead. It is intended that
600 this function be used iteratively and that you repeatedly pass IN the same cursor that was passed out.</p>
601*/
// Find the next "real" word at or after *cursor within [startBuf, endBuf).
// On success: *wordStartResult/*wordEndResult bracket the word, *cursor is moved to the word's end, returns true.
// On failure: *cursor is moved to where scanning stopped, the word result pointers are left untouched, returns false.
602bool SpellCheckEngine_Basic::ScanForWord (const Led_tChar* startBuf, const Led_tChar* endBuf, const Led_tChar** cursor,
603 const Led_tChar** wordStartResult, const Led_tChar** wordEndResult)
604{
605 RequireNotNull (startBuf);
606 RequireNotNull (endBuf);
607 RequireNotNull (cursor);
608 RequireNotNull (wordStartResult);
609 RequireNotNull (wordEndResult);
610 Require ((*cursor >= startBuf and *cursor <= endBuf));
611
// Nothing left to scan.
612 if (*cursor >= endBuf) {
613 return false;
614 }
615
// From here on, positions are offsets from startBuf (that is what FindWordBreaks is handed).
616 size_t bufLen = endBuf - startBuf;
617
618 size_t initialCrs = *cursor - startBuf;
619 size_t p = initialCrs;
620 size_t wordStart = 0;
621 size_t wordEnd = 0;
622 bool wordReal = false;
623 // Find a real word...
// Keep probing, one character further each time, until the text breaker reports a real word that
// does not start before the initial cursor, or the buffer is exhausted.
624 while (not wordReal or wordStart < initialCrs) {
625 GetTextBreaker ()->FindWordBreaks (startBuf, bufLen, p, &wordStart, &wordEnd, &wordReal);
626 if (not wordReal or wordStart < initialCrs) {
627 if (p < bufLen) {
// NOTE(review): if FindWordBreaks reported a real word starting before initialCrs, wordReal stays true
// here while wordStart == wordEnd == p, which ends the loop and would trip the
// Assert (wordStart < wordEnd) below - confirm whether FindWordBreaks can do that for these inputs.
628 p = Led_NextChar (&startBuf[p]) - startBuf;
629 wordStart = p;
630 wordEnd = p;
631 }
632 else {
// Reached the end of the buffer without finding an acceptable word.
633 wordStart = p;
634 wordEnd = p;
635 break;
636 }
637 }
638 }
639
640 if (not wordReal) {
641 // no real word to be found...
642 //maybe the first 'word' was a series of spaces and there may not have been a word after (end of buf).
643 *cursor = startBuf + wordEnd;
644 return false;
645 }
646
647 Assert (wordReal);
648 Assert (wordStart < wordEnd);
649 p = wordEnd;
650
// The cursor only ever moves forward.
651 Assert (*cursor <= startBuf + p);
652 Assert ((*cursor < startBuf + p) or (*cursor == endBuf));
653 *cursor = startBuf + p;
654
655 *wordStartResult = startBuf + wordStart;
656 *wordEndResult = startBuf + wordEnd;
657
658 return true;
659}
660
661vector<const SpellCheckEngine_Basic::Dictionary*> SpellCheckEngine_Basic::GetDictionaries () const
662{
663 return fDictionaries;
664}
665
666void SpellCheckEngine_Basic::SetDictionaries (const vector<const Dictionary*>& dictionaries)
667{
668 fDictionaries = dictionaries;
669}
670
671#if qStroika_Foundation_Debug_AssertionsChecked
672/*
673@METHOD: SpellCheckEngine_Basic::Invariant_
674@DESCRIPTION: <p>Check validity of SpellCheck engine. Called by @'SpellCheckEngine::Invariant'.</p>
675*/
676void SpellCheckEngine_Basic::Invariant_ () const
677{
678 Assert (sizeof (InfoBlock) == sizeof (int)); // Not a REAL requirement - but we want to make sure - for the most part - the
679 // compiler lays this out for us to be SMALL.
680 for (const Dictionary* dict : fDictionaries) {
681 AssertNotNull (dict);
682 const InfoBlock* ibsStart = NULL;
683 const InfoBlock* ibsEnd = NULL;
684 dict->GetInfoBlocks (&ibsStart, &ibsEnd);
685 const Led_tChar* dictBufStart = dict->GetTextBase ();
686 const Led_tChar* dictBufEnd = dict->GetTextEnd ();
687 if (dictBufStart != dictBufEnd) {
688 AssertNotNull (dictBufStart);
689 AssertNotNull (dictBufEnd);
690 Assert (dictBufStart <= dictBufEnd); // empty is a boring case - but I suppose not illegal
691 size_t bufSize = dictBufEnd - dictBufStart;
692 Led_tString prevWord;
693 for (const InfoBlock* i = ibsStart; i != ibsEnd; ++i) {
694 AssertNotNull (dictBufStart);
695 Assert ((*i).fIndex < bufSize);
696 Assert ((*i).fIndex + (*i).fWordLen <= bufSize);
697 Assert ((*i).fWordLen > 0);
698 Led_tString w = Led_tString{dictBufStart + (*i).fIndex, dictBufStart + (*i).fIndex + (*i).fWordLen};
699 Assert (not w.empty ());
700 if (i != ibsStart) {
701 // Assure words in alphabetical order
702 Assert (Led_tStrCmp (prevWord.c_str (), w.c_str ()) < 0);
703 }
704 prevWord = w;
705 }
706 }
707 }
708}
709#endif
710
711#if qStroika_Foundation_Debug_AssertionsChecked
712void SpellCheckEngine_Basic::RegressionTest ()
713{
714 try {
715 RegressionTest_1 ();
716 }
717 catch (...) {
718 Assert (false); // catch and just report here as assert errors and exceptions...
719 // (cuz currently on MacOS - for UNICODE builds - we generate
720 // exceptions cuz not enough UNICODE support there yet -- LGP 2003-05-30)
721 }
722}
723
724void SpellCheckEngine_Basic::RegressionTest_1 ()
725{
726 const Led_tChar* xxx = LED_TCHAR_OF ("IBM\na\napple\ndog\nfrog\ngood\nthis\nzipper\n");
727 SpellCheckEngine_Basic::EditableDictionary testerDict;
728 testerDict.ReadFromBuffer (xxx, xxx + Led_tStrlen (xxx));
729 SpellCheckEngine_Basic tester (&testerDict);
730 const Led_tChar* testText = LED_TCHAR_OF ("This is a very good test. ");
731
732 tester.Invariant ();
733
734 {
735 bool r1 = tester.LookupWord (LED_TCHAR_OF ("Frog"));
736 Assert (r1);
737 }
738
739 {
740 Led_tString r2S;
741 bool r2n = tester.LookupWord (LED_TCHAR_OF ("ziPPer"), &r2S);
742 bool r2y = tester.LookupWord (LED_TCHAR_OF ("Zipper"), &r2S);
743 Assert (not r2n and r2y and r2S == LED_TCHAR_OF ("zipper"));
744 }
745
746 {
747 Led_tString r3S;
748 Assert (not tester.LookupWord (LED_TCHAR_OF ("ibm")));
749 Assert (not tester.LookupWord (LED_TCHAR_OF ("Ibm")));
750 bool r3 = tester.LookupWord (LED_TCHAR_OF ("IBM"), &r3S);
751 Assert (r3 and r3S == LED_TCHAR_OF ("IBM"));
752 }
753
754 {
755 const Led_tChar* wordStart = NULL;
756 const Led_tChar* wordEnd = NULL;
757 const Led_tChar* p = NULL;
758 bool result = tester.ScanForUndefinedWord (testText, testText + Led_tStrlen (testText), &p, &wordStart, &wordEnd);
759 Assert (result and (Led_tString{wordStart, wordEnd} == LED_TCHAR_OF ("is")));
760 }
761
762 {
763 const Led_tChar* cursor = NULL;
764 const Led_tChar* wordStartResult = NULL;
765 const Led_tChar* wordEndResult = NULL;
766 int nWordsFound = 0;
767 while (tester.ScanForUndefinedWord (testText, testText + Led_tStrlen (testText), &cursor, &wordStartResult, &wordEndResult)) {
768 // we found a possible undefined word.
769 Led_tString word = Led_tString{wordStartResult, wordEndResult};
770 ++nWordsFound;
771 if (nWordsFound == 1) {
772 Assert ((Led_tString{wordStartResult, wordEndResult} == LED_TCHAR_OF ("is")));
773 }
774 if (nWordsFound == 2) {
775 Assert ((Led_tString{wordStartResult, wordEndResult} == LED_TCHAR_OF ("very")));
776 }
777 if (nWordsFound == 3) {
778 Assert ((Led_tString{wordStartResult, wordEndResult} == LED_TCHAR_OF ("test")));
779 }
780 }
781 Assert (nWordsFound == 3);
782 }
783}
784#endif
785
786/*
787 ********************************************************************************
788 ****************** SpellCheckEngine_Basic::EditableDictionary ******************
789 ********************************************************************************
790 */
791SpellCheckEngine_Basic::EditableDictionary::~EditableDictionary ()
792{
793 delete[] fDictBufStart;
794}
795
796void SpellCheckEngine_Basic::EditableDictionary::AddWordToUserDictionary (const Led_tString& word)
797{
798 fSortedWordList.insert (word);
799 ConstructInfoBlocksEtcFromWordList ();
800}
801
802const Led_tChar* SpellCheckEngine_Basic::EditableDictionary::GetTextBase () const
803{
804 return fDictBufStart;
805}
806
807const Led_tChar* SpellCheckEngine_Basic::EditableDictionary::GetTextEnd () const
808{
809 return fDictBufEnd;
810}
811
812void SpellCheckEngine_Basic::EditableDictionary::GetInfoBlocks (const InfoBlock** start, const InfoBlock** end) const
813{
814 RequireNotNull (start);
815 RequireNotNull (end);
816 *start = Containers::Start (fInfoBlocks);
817 *end = Containers::End (fInfoBlocks);
818}
819
820void SpellCheckEngine_Basic::EditableDictionary::ReadFromBuffer (const Led_tChar* readOnlyRAMDictStart, const Led_tChar* readOnlyRAMDictEnd)
821{
822 fSortedWordList.clear ();
823
824 // ASSUMES VALS COME IN CRLF or LF separated. This code does not assume that the
825 // input is already in dictionary order (though we write it that way).
826 for (const Led_tChar* p = readOnlyRAMDictStart; p < readOnlyRAMDictEnd;) {
827 // scan for \r or \n or \r\n to mark end of word
828 const Led_tChar* i1 = Led_tStrChr (p, '\r');
829 const Led_tChar* i2 = Led_tStrChr (p, '\n');
830 if (i1 == NULL) {
831 i1 = p + Led_tStrlen (p);
832 }
833 if (i2 == NULL) {
834 i2 = p + Led_tStrlen (p);
835 }
836 const Led_tChar* wordStart = p;
837 const Led_tChar* wordEnd = min (i1, i2);
838 if (wordStart != wordEnd) {
839 fSortedWordList.insert (Led_tString{wordStart, wordEnd});
840 }
841
842 p = wordEnd;
843 if (p < readOnlyRAMDictEnd) {
844 ++p; // skip CR or LF
845 }
846 // If CRLF then set p to point after CRLF (not just CR or LF)
847 if (p + 1 == i2) {
848 p = p + 1;
849 }
850 }
851 ConstructInfoBlocksEtcFromWordList ();
852}
853
854vector<Led_tChar> SpellCheckEngine_Basic::EditableDictionary::SaveToBuffer () const
855{
856 StackBuffer<Led_tChar> buf{1};
857
858#if qStroika_Foundation_Common_Platform_Windows
859 const Led_tChar kLineTerm[] = LED_TCHAR_OF ("\r\n");
860#elif qStroika_Foundation_Common_Platform_MacOS
861 const Led_tChar kLineTerm[] = LED_TCHAR_OF ("\r");
862#else
863 const Led_tChar kLineTerm[] = LED_TCHAR_OF ("\n");
864#endif
865 const size_t kLineTerm_Length = Memory::NEltsOf (kLineTerm) - 1;
866 size_t totalBufSizeSoFar = 0;
867 for (auto i = fSortedWordList.begin (); i != fSortedWordList.end (); ++i) {
868 {
869 size_t prevCopyTo = totalBufSizeSoFar;
870 totalBufSizeSoFar += (*i).length ();
871 buf.GrowToSize (totalBufSizeSoFar);
872 (void)::memcpy (static_cast<Led_tChar*> (buf) + prevCopyTo, Containers::Start (*i), i->size () * sizeof (Led_tChar));
873 }
874 {
875 size_t prevCopyTo = totalBufSizeSoFar;
876 totalBufSizeSoFar += kLineTerm_Length;
877 buf.GrowToSize (totalBufSizeSoFar);
878 (void)::memcpy (static_cast<Led_tChar*> (buf) + prevCopyTo, kLineTerm, kLineTerm_Length * sizeof (Led_tChar));
879 }
880 }
881 return vector<Led_tChar> (static_cast<Led_tChar*> (buf), static_cast<Led_tChar*> (buf) + totalBufSizeSoFar);
882}
883
namespace {
    /**
     *  Local stand-in for std::copy: copies [first, last) to the range starting at 'result' and
     *  returns the iterator one past the last element written.
     *
     *  Hand-rolled to avoid the MSFT _SCL_INSECURE_DEPRECATE warning, which could not be suppressed
     *  with a #pragma (and a #define would make this awkward to use as a library) --LGP 2014-03-23
     */
    template <class InputIterator, class OutputIterator>
    OutputIterator my_copy_ (InputIterator first, InputIterator last, OutputIterator result)
    {
        for (; first != last; ++first, ++result) {
            *result = *first;
        }
        return result;
    }
}
899
900void SpellCheckEngine_Basic::EditableDictionary::ConstructInfoBlocksEtcFromWordList ()
901{
902 // Clear old buffer values
903 delete[] fDictBufStart;
904 fDictBufStart = NULL;
905 fDictBufEnd = NULL;
906 fInfoBlocks.clear ();
907
908 size_t totalBlockSize = 0;
909 {
910 for (auto i = fSortedWordList.begin (); i != fSortedWordList.end (); ++i) {
911 totalBlockSize += (*i).length ();
912 }
913 }
914
915 fDictBufStart = new Led_tChar[totalBlockSize];
916 fDictBufEnd = fDictBufStart + totalBlockSize;
917
918 /*
919 * Now walk the list (already in the right order for the InfoBlock list), and fill in the fDictBuffer area,
920 * and the InfoBlock list at the same time.
921 */
922 Led_tChar* intoBufPtr = fDictBufStart;
923 for (auto i = fSortedWordList.begin (); i != fSortedWordList.end (); ++i) {
924 my_copy_ ((*i).begin (), (*i).end (), intoBufPtr);
925
926 InfoBlock iB;
927 (void)::memset (&iB, 0, sizeof (iB));
928 iB.fIndex = intoBufPtr - fDictBufStart;
929 iB.fWordLen = (*i).length ();
930 iB.fXXX = 0; //tmphack
931 Assert (iB.fWordLen > 0);
932 fInfoBlocks.push_back (iB);
933
934 intoBufPtr += (*i).length ();
935 }
936}
937
938/*
939 ********************************************************************************
940 ***************** SpellCheckEngine_Basic::CompiledDictionary *******************
941 ********************************************************************************
942 */
943SpellCheckEngine_Basic::CompiledDictionary::CompiledDictionary (const CompiledDictionaryData& data)
944 : fData{data}
945{
946}
947
948const Led_tChar* SpellCheckEngine_Basic::CompiledDictionary::GetTextBase () const
949{
950 return fData.fTextDataStart;
951}
952
953const Led_tChar* SpellCheckEngine_Basic::CompiledDictionary::GetTextEnd () const
954{
955 return fData.fTextDataEnd;
956}
957
958void SpellCheckEngine_Basic::CompiledDictionary::GetInfoBlocks (const InfoBlock** start, const InfoBlock** end) const
959{
960 RequireNotNull (start);
961 RequireNotNull (end);
962 *start = fData.fInfoBlocksStart;
963 *end = fData.fInfoBlocksEnd;
964}
965
966/*
967 ********************************************************************************
968 ***************************** TextBreaks_SpellChecker **************************
969 ********************************************************************************
970 */
971
972TextBreaks_SpellChecker::TextBreaks_SpellChecker ()
973{
974#if qStroika_Foundation_Debug_AssertionsChecked
975 // NB: since this is called in this CTOR - it doesn't capture (or pay attention to) subclass overrides of CharToCharacterClass
976 // That fact is important - since subclasses might change its result in a way to voilate this regression test. Thats fine - if its
977 // desired by the subclass. This is just a test to make sure logical changes we make to this code have intended
978 // consequences... LGP 2003-11-24
979 RegressionTest ();
980#endif
981}
982
983TextBreaks_SpellChecker::CharacterClasses TextBreaks_SpellChecker::CharToCharacterClass (const Led_tChar* startOfText, size_t lengthOfText,
984 const Led_tChar* charToExamine) const
985{
986 switch (*charToExamine) {
987 case '$':
988 case '%':
989 case '-': {
990 return (eWordClass);
991 } break;
992
993 case 0x2019: // curly apostrophe
994 case '\'': {
995 // APOSTROPHE between digits or letters
996 if (charToExamine > startOfText and charToExamine < &startOfText[lengthOfText]) {
997 const Led_tChar* prevChar = Led_PreviousChar (startOfText, charToExamine);
998 const Led_tChar* nextChar = charToExamine + 1; // cuz we KNOW we are single-byte...
999
1000 // E.g.: Fred's or Lewis', but not Jim'
1001 if ((IsASCIIAlnum (*prevChar) and *nextChar == 's') or (*prevChar == 's' and IsASCIISpace (*nextChar))) {
1002 return (eWordClass);
1003 }
1004 }
1005 } break;
1006 }
1007 return inherited::CharToCharacterClass (startOfText, lengthOfText, charToExamine);
1008}
1009
1010#if qStroika_Foundation_Debug_AssertionsChecked
1011void TextBreaks_SpellChecker::RegressionTest ()
1012{
1013 {
1014 const Led_tChar* kTest = LED_TCHAR_OF ("This is a good test of Simone's bug with the 'word'.");
1015 size_t wordEndResult = 0;
1016 bool wordReal = 0;
1017
1018 FindLineBreaks (kTest, Led_tStrlen (kTest), 25, &wordEndResult, &wordReal);
1019 Assert (wordEndResult == 31);
1020 Assert (wordReal == true);
1021 }
1022}
1023#endif
1024
1025/*
1026 ********************************************************************************
1027 *********************** SpellCheckEngine_Basic_Simple **************************
1028 ********************************************************************************
1029 */
1030SpellCheckEngine_Basic_Simple::SpellCheckEngine_Basic_Simple ()
1031 : fMainDictionary{nullptr}
1032 , fUDName{}
1033 , fUD{nullptr}
1034{
1035#if qIncludeBakedInDictionaries
1036 SetMainDictionary (&kDictionary_US_English);
1037#endif
1038}
1039
1040SpellCheckEngine_Basic_Simple::~SpellCheckEngine_Basic_Simple ()
1041{
1042 delete fUD;
1043}
1044
1045SpellCheckEngine_Basic_Simple::UDInterface* SpellCheckEngine_Basic_Simple::GetUDInterface ()
1046{
1047 return this;
1048}
1049
1050bool SpellCheckEngine_Basic_Simple::AddWordToUserDictionarySupported () const
1051{
1052 return fUD != NULL;
1053}
1054
1055void SpellCheckEngine_Basic_Simple::AddWordToUserDictionary (const Led_tString& word)
1056{
1058 fUD->AddWordToUserDictionary (word);
1059 WriteToUD ();
1060}
1061
1062const SpellCheckEngine_Basic_Simple::Dictionary* SpellCheckEngine_Basic_Simple::GetMainDictionary () const
1063{
1064 return fMainDictionary;
1065}
1066
1067void SpellCheckEngine_Basic_Simple::SetMainDictionary (const Dictionary* mainDictionary)
1068{
1069 fMainDictionary = mainDictionary;
1070 vector<const Dictionary*> dicts;
1071 if (fMainDictionary != NULL) {
1072 dicts.push_back (fMainDictionary);
1073 }
1074 if (fUD != NULL) {
1075 dicts.push_back (fUD);
1076 }
1077 SetDictionaries (dicts);
1078}
1079
1080filesystem::path SpellCheckEngine_Basic_Simple::GetUserDictionary () const
1081{
1082 return fUDName;
1083}
1084
1085void SpellCheckEngine_Basic_Simple::SetUserDictionary (const filesystem::path& userDictionary)
1086{
1087 fUDName = userDictionary;
1088 bool noUD = userDictionary.empty ();
1089 delete fUD;
1090 fUD = NULL;
1091
1092 if (not noUD) {
1093 fUD = new EditableDictionary{};
1094 ReadFromUD ();
1095 }
1096 SetMainDictionary (fMainDictionary); // hack to force call to SetDictionaries ()
1097}
1098
1099void SpellCheckEngine_Basic_Simple::ReadFromUD ()
1100{
1101 /*
1102 * Ignore any errors reading from the UD (at least file-not-found errors).
1103 */
1104 try {
1105 Memory::BLOB b = IO::FileSystem::FileInputStream::New (filesystem::path (fUDName)).ReadAll ();
1106 span<const byte> rawByteSpan{b};
1107 CodeCvt<Led_tChar> converter{&rawByteSpan};
1108 size_t outCharCnt = converter.ComputeTargetCharacterBufferSize (rawByteSpan);
1109 Memory::StackBuffer<Led_tChar> fileData2{outCharCnt};
1110 auto charsRead = converter.Bytes2Characters (&rawByteSpan, span{fileData2});
1111 fUD->ReadFromBuffer (charsRead.data (), charsRead.data () + charsRead.size ());
1112 }
1113 catch (...) {
1114 }
1115}
1116
1117void SpellCheckEngine_Basic_Simple::WriteToUD ()
1118{
1120 vector<Led_tChar> data = fUD->SaveToBuffer ();
1121 IO::FileSystem::FileOutputStream::Ptr writer = IO::FileSystem::FileOutputStream::New (filesystem::path (fUDName));
1122 Streams::TextToBinary::Writer::New (writer, UnicodeExternalEncodings::eUTF8, ByteOrderMark::eInclude).Write (span{data});
1123}
#define AssertNotNull(p)
Definition Assertions.h:333
#define qStroika_Foundation_Debug_AssertionsChecked
The qStroika_Foundation_Debug_AssertionsChecked flag determines if assertions are checked and validat...
Definition Assertions.h:48
#define RequireNotNull(p)
Definition Assertions.h:347
CodeCvt unifies byte <-> unicode conversions, vaguely inspired by (and wraps) std::codecvt,...
Definition CodeCvt.h:118
Logically halfway between std::array and std::vector; Smart 'direct memory array' - which when needed...
CONTAINER::value_type * End(CONTAINER &c)
For a contiguous container (such as a vector or basic_string) - find the pointer to the end of the co...
CONTAINER::value_type * Start(CONTAINER &c)
For a contiguous container (such as a vector or basic_string) - find the pointer to the start of the ...
void ThrowIfNull(const Private_::ConstVoidStar &p, const HRESULT &hr)
Template specialization for ThrowIfNull (), for thing being thrown HRESULT - really throw HRESULTErro...