Stroika Library 3.0d16
 
Loading...
Searching...
No Matches
StyledTextIO_HTML.h
1/*
2 * Copyright(c) Sophist Solutions, Inc. 1990-2025. All rights reserved
3 */
4#ifndef _Stroika_Frameworks_Led_StyledTextIO_HTML_h_
5#define _Stroika_Frameworks_Led_StyledTextIO_HTML_h_ 1
6
7#include "Stroika/Frameworks/StroikaPreComp.h"
8
9#include "Stroika/Frameworks/Led/StyledTextIO/StyledTextIO.h"
10
11/*
12@MODULE: StyledTextIO_HTML
13@DESCRIPTION:
14 <p>Subclasses of @'StyledTextIOReader' and @'StyledTextIOWriter' are where the knowledge of particular file formats resides.
15 For example, the knowledge of how to read HTML is in @'StyledTextIOReader_HTML' and the knowledge of how to write HTML is in
16 @'StyledTextIOWriter_HTML'.</p>
17 */
18
19namespace Stroika::Frameworks::Led::StyledTextIO {
20
/*
 @CONFIGVAR:     qThrowAwayMostUnknownHTMLTags
 @DESCRIPTION:   <p>Compile-time switch for how the HTML reader treats tags. By default (1), most tags
            are thrown away or only loosely interpreted on reading. That makes the reader very lossy,
            but it produces the most human-readable result. For now, this flag is the way to choose
            which behavior you want. -- LGP 961015</p>
 */
#if !defined(qThrowAwayMostUnknownHTMLTags)
#define qThrowAwayMostUnknownHTMLTags 1
#endif
30
/*
 @CONFIGVAR:     qWriteOutMostHTMLEntitiesByName
 @DESCRIPTION:   <p>Off (0) by default, because writing entities by number works more compatibly with
            many older web browsers (such as Netscape 4.x). Writing them by number is also slightly
            faster.</p>
 */
#if !defined(qWriteOutMostHTMLEntitiesByName)
#define qWriteOutMostHTMLEntitiesByName 0
#endif
39
40 /*
41 @CLASS: HTMLInfo
42 @DESCRIPTION:
43 */
44 class HTMLInfo {
45 public:
46 HTMLInfo () = default;
47
48 public:
49 struct EntityRefMapEntry {
50 EntityRefMapEntry (const string& entityRefName, wchar_t charValue);
51
52 string fEntityRefName;
53 wchar_t fCharValue;
54 };
55
56 static EntityRefMapEntry sDefaultEntityRefMapTable[];
57 static const size_t kDefaultEntityRefMapTable_Count;
58
59 // HTMLFontSizes are numbers from 1..7, and the default/normal is 3.
60 // Eventually, these could become virtual methods, and be hooked into stylesheets.
61 public:
62 static FontSpecification::FontSize HTMLFontSizeToRealFontSize (int size);
63 static int RealFontSizeToHTMLFontSize (FontSpecification::FontSize size);
64
65 public:
66 string fDocTypeTag;
67 string fHTMLTag;
68 string fHeadTag;
69 string fStartBodyTag;
70 vector<string> fUnknownHeaderTags;
71 Led_tString fTitle;
72 };
73
74 /*
75 @CLASS: StyledTextIOReader_HTML
76 @BASES: @'StyledTextIOReader'
77 @DESCRIPTION:
78 */
79 class StyledTextIOReader_HTML : public StyledTextIOReader {
80 public:
81 StyledTextIOReader_HTML (SrcStream* srcStream, SinkStream* sinkStream, HTMLInfo* saveHTMLInfoInto = nullptr);
82
83 public:
84 virtual void Read () override;
85 virtual bool QuickLookAppearsToBeRightFormat () override;
86
87 public:
88 using EntityRefMapEntry = HTMLInfo::EntityRefMapEntry;
89
90 protected:
91 virtual const vector<EntityRefMapEntry>& GetEntityRefMapTable () const;
92
93 protected:
94 enum ThingyType {
95 eEntityRef,
96 eTag,
97 eEOF,
98 eBangComment
99 };
100 nonvirtual ThingyType ScanTilNextHTMLThingy ();
101 nonvirtual void ScanTilAfterHTMLThingy (ThingyType thingy);
102
103 protected:
104 nonvirtual bool LookingAt (const char* text) const;
105
106 protected:
107 nonvirtual Led_tString MapInputTextToTString (const string& text);
108
109 protected:
110 nonvirtual void EmitText (const Led_tString& text, bool skipNLCheck = false);
111 virtual void EmitText (const Led_tChar* text, size_t nBytes, bool skipNLCheck = false);
112 nonvirtual void HandleHTMLThingy (ThingyType thingy, const string& text);
113 virtual void HandleHTMLThingy (ThingyType thingy, const char* text, size_t nBytes);
114
115 protected:
116 virtual void HandleHTMLThingy_EntityReference (const char* text, size_t nBytes);
117 virtual void HandleHTMLThingy_Tag (const char* text, size_t nBytes);
118
119 protected:
120 nonvirtual void ExtractHTMLTagIntoTagNameBuf (const char* text, size_t nBytes, char* tagBuf, size_t tagBufSize, bool* isStartTag);
121
122 protected:
123 nonvirtual IncrementalFontSpecification ExtractFontSpecFromCSSStyleAttribute (const char* text, size_t nBytes);
124 nonvirtual void ApplyCSSStyleAttributeToCurrentFontStack (const char* text, size_t nBytes);
125 nonvirtual void GrabAndApplyCSSStyleFromTagText (const char* text, size_t nBytes);
126
127 protected:
128 virtual void HandleHTMLThingyTag_BANG_doctype (bool start, const char* text, size_t nBytes);
129 virtual void HandleHTMLThingyTag_a (bool start, const char* text, size_t nBytes);
130 virtual void HandleHTMLThingyTag_b (bool start, const char* text, size_t nBytes);
131 virtual void HandleHTMLThingyTag_basefont (bool start, const char* text, size_t nBytes);
132 virtual void HandleHTMLThingyTag_big (bool start, const char* text, size_t nBytes);
133 virtual void HandleHTMLThingyTag_blockquote (bool start, const char* text, size_t nBytes);
134 virtual void HandleHTMLThingyTag_br (bool start, const char* text, size_t nBytes);
135 virtual void HandleHTMLThingyTag_body (bool start, const char* text, size_t nBytes);
136 virtual void HandleHTMLThingyTag_code (bool start, const char* text, size_t nBytes);
137 virtual void HandleHTMLThingyTag_comment (bool start, const char* text, size_t nBytes);
138 virtual void HandleHTMLThingyTag_dir (bool start, const char* text, size_t nBytes);
139 virtual void HandleHTMLThingyTag_div (bool start, const char* text, size_t nBytes);
140 virtual void HandleHTMLThingyTag_em (bool start, const char* text, size_t nBytes);
141 virtual void HandleHTMLThingyTag_font (bool start, const char* text, size_t nBytes);
142 virtual void HandleHTMLThingyTag_head (bool start, const char* text, size_t nBytes);
143 virtual void HandleHTMLThingyTag_html (bool start, const char* text, size_t nBytes);
144 virtual void HandleHTMLThingyTag_hr (bool start, const char* text, size_t nBytes);
145 virtual void HandleHTMLThingyTag_hN (bool start, const char* text, size_t nBytes);
146 virtual void HandleHTMLThingyTag_i (bool start, const char* text, size_t nBytes);
147 virtual void HandleHTMLThingyTag_img (bool start, const char* text, size_t nBytes);
148 virtual void HandleHTMLThingyTag_li (bool start, const char* text, size_t nBytes);
149 virtual void HandleHTMLThingyTag_listing (bool start, const char* text, size_t nBytes);
150 virtual void HandleHTMLThingyTag_ol (bool start, const char* text, size_t nBytes);
151 virtual void HandleHTMLThingyTag_p (bool start, const char* text, size_t nBytes);
152 virtual void HandleHTMLThingyTag_plaintext (bool start, const char* text, size_t nBytes);
153 virtual void HandleHTMLThingyTag_pre (bool start, const char* text, size_t nBytes);
154 virtual void HandleHTMLThingyTag_s (bool start, const char* text, size_t nBytes);
155 virtual void HandleHTMLThingyTag_samp (bool start, const char* text, size_t nBytes);
156 virtual void HandleHTMLThingyTag_small (bool start, const char* text, size_t nBytes);
157 virtual void HandleHTMLThingyTag_span (bool start, const char* text, size_t nBytes);
158 virtual void HandleHTMLThingyTag_strike (bool start, const char* text, size_t nBytes);
159 virtual void HandleHTMLThingyTag_strong (bool start, const char* text, size_t nBytes);
160 virtual void HandleHTMLThingyTag_sub (bool start, const char* text, size_t nBytes);
161 virtual void HandleHTMLThingyTag_sup (bool start, const char* text, size_t nBytes);
162 virtual void HandleHTMLThingyTag_table (bool start, const char* text, size_t nBytes);
163 virtual void HandleHTMLThingyTag_td (bool start, const char* text, size_t nBytes);
164 virtual void HandleHTMLThingyTag_th (bool start, const char* text, size_t nBytes);
165 virtual void HandleHTMLThingyTag_title (bool start, const char* text, size_t nBytes);
166 virtual void HandleHTMLThingyTag_tr (bool start, const char* text, size_t nBytes);
167 virtual void HandleHTMLThingyTag_tt (bool start, const char* text, size_t nBytes);
168 virtual void HandleHTMLThingyTag_u (bool start, const char* text, size_t nBytes);
169 virtual void HandleHTMLThingyTag_ul (bool start, const char* text, size_t nBytes);
170 virtual void HandleHTMLThingyTag_var (bool start, const char* text, size_t nBytes);
171 virtual void HandleHTMLThingyTag_xmp (bool start, const char* text, size_t nBytes);
172
173 virtual void HandleHTMLThingyTagUnknown (bool start, const char* text, size_t nBytes);
174
175 protected:
176 nonvirtual void BasicFontStackOperation (bool start);
177 nonvirtual void EmitForcedLineBreak ();
178
179 protected:
180 nonvirtual bool ParseHTMLTagArgOut (const string& tagText, const string& attrName, string* attrValue);
181 nonvirtual bool ParseCSSTagArgOut (const string& text, const string& attrName, string* attrValue);
182
183 protected:
184 nonvirtual void StartPara ();
185 nonvirtual void EndParaIfOpen ();
186
187 private:
188 bool fInAPara{false};
189
190 // Implement the quirky font/size rules (1..7) HTML prescribes
191 protected:
192 virtual void SetHTMLFontSize (int to);
193
194 protected:
195 int fHTMLBaseFontSize{3};
196 int fHTMLFontSize{3};
197
198 protected:
199 HTMLInfo* fSaveHTMLInfoInto{nullptr};
200 bool fReadingBody{false};
201 vector<FontSpecification> fFontStack;
202 bool fComingTextIsTitle{false};
203 bool fNormalizeInputWhitespace{true};
204 bool fLastCharSpace{true};
205 bool fHiddenTextMode{false};
206 Led_tString fHiddenTextAccumulation;
207 size_t fCurAHRefStart{size_t (-1)};
208 string fCurAHRefText;
209 unsigned int fULNestingCount{0};
210 bool fLIOpen{false};
211 unsigned int fTableOpenCount{0};
212 bool fTableRowOpen{false};
213 bool fTableCellOpen{false};
214 };
215
216 /*
217 @CLASS: StyledTextIOWriter_HTML
218 @BASES: @'StyledTextIOWriter'
219 @DESCRIPTION:
220 */
221 class StyledTextIOWriter_HTML : public StyledTextIOWriter {
222 public:
223 StyledTextIOWriter_HTML (SrcStream* srcStream, SinkStream* sinkStream, const HTMLInfo* getHTMLInfoFrom = nullptr);
224 ~StyledTextIOWriter_HTML ();
225
226 public:
227 virtual void Write () override;
228
229 public:
230 using EntityRefMapEntry = HTMLInfo::EntityRefMapEntry;
231
232 protected:
233 virtual const vector<EntityRefMapEntry>& GetEntityRefMapTable () const;
234
235 public:
236 using Table = StyledTextIOWriter::SrcStream::Table;
237
238 protected:
239 class WriterContext;
240
241 protected:
242 nonvirtual void WriteHeader (WriterContext& /*writerContext*/);
243 nonvirtual void WriteBody (WriterContext& writerContext);
244 nonvirtual void WriteInnerBody (WriterContext& writerContext);
245 nonvirtual void WriteBodyCharacter (WriterContext& writerContext, Led_tChar c);
246 nonvirtual void WriteTable (WriterContext& writerContext, Table* table);
247 nonvirtual void WriteOpenTag (WriterContext& writerContext, const string& tagName, const string& tagExtras = string{});
248 nonvirtual void WriteOpenTagSpecial (WriterContext& writerContext, const string& tagName, const string& tagFullText);
249 nonvirtual void WriteCloseTag (WriterContext& writerContext, const string& tagName);
250 nonvirtual void WriteOpenCloseTag (WriterContext& writerContext, const string& tagName, const string& tagExtras = string{});
251 nonvirtual bool IsTagOnStack (WriterContext& writerContext, const string& tagName);
252 nonvirtual void EmitBodyFontInfoChange (WriterContext& writerContext, const FontSpecification& newOne, bool skipDoingOpenTags);
253 nonvirtual void AssureStyleRunSummaryBuilt (WriterContext& writerContext);
254 nonvirtual string MapOutputTextFromWString (const wstring& text);
255 nonvirtual string MapOutputTextFromTString (const Led_tString& text);
256
257 protected:
258 const HTMLInfo* fGetHTMLInfoFrom;
259 vector<StyledInfoSummaryRecord> fStyleRunSummary;
260 Led_tChar fSoftLineBreakChar;
261 };
262
263 class StyledTextIOWriter_HTML::WriterContext {
264 public:
265 WriterContext (StyledTextIOWriter_HTML& writer);
266 WriterContext (WriterContext& parentContext, SrcStream& srcStream);
267
268 public:
269 nonvirtual StyledTextIOWriter_HTML& GetWriter () const;
270
271 private:
272 StyledTextIOWriter_HTML& fWriter;
273
274 public:
275 nonvirtual SrcStream& GetSrcStream () const;
276 nonvirtual SinkStream& GetSinkStream () const;
277
278 private:
279 StyledTextIOWriter::SrcStream& fSrcStream;
280
281 public:
282 StyledInfoSummaryRecord fLastEmittedISR;
283 size_t fLastStyleChangeAt;
284 size_t fIthStyleRun;
285 size_t fLastForcedNLAt;
286 bool fEmittedStartOfPara;
287 bool fEmittingList;
288 bool fEmittingListItem;
289 vector<string> fTagStack;
290 bool fInTableCell;
291
292 public:
293 nonvirtual size_t GetCurSrcOffset () const;
294#if qStroika_Frameworks_Led_SupportGDI
295 nonvirtual SimpleEmbeddedObjectStyleMarker* GetCurSimpleEmbeddedObjectStyleMarker () const;
296#endif
297 };
298
299 /*
300 ********************************************************************************
301 ***************************** Implementation Details ***************************
302 ********************************************************************************
303 */
304
305 /*
306 ********************************************************************************
307 ***************************** HTMLInfo::EntityRefMapEntry ***************************
308 ********************************************************************************
309 */
310 inline HTMLInfo::EntityRefMapEntry::EntityRefMapEntry (const string& entityRefName, wchar_t charValue)
311 : fEntityRefName (entityRefName)
312 , fCharValue (charValue)
313 {
314 }
315
316 /*
317 ********************************************************************************
318 ***************************** StyledTextIOReader_HTML ***************************
319 ********************************************************************************
320 */
321 inline void StyledTextIOReader_HTML::EmitText (const Led_tString& text, bool skipNLCheck)
322 {
323 EmitText (text.c_str (), text.length (), skipNLCheck);
324 }
325 inline void StyledTextIOReader_HTML::HandleHTMLThingy (StyledTextIOReader_HTML::ThingyType thingy, const string& text)
326 {
327 HandleHTMLThingy (thingy, text.c_str (), text.length ());
328 }
329
330 /*
331 ********************************************************************************
332 ********************** StyledTextIOWriter_HTML::WriterContext ***************************
333 ********************************************************************************
334 */
335 inline StyledTextIOWriter_HTML::WriterContext::WriterContext (StyledTextIOWriter_HTML& writer)
336 : fWriter (writer)
337 , fSrcStream (fWriter.GetSrcStream ())
338 , fLastEmittedISR (IncrementalFontSpecification (), 0)
339 , fLastStyleChangeAt (0)
340 , fIthStyleRun (0)
341 , fLastForcedNLAt (0)
342 , fEmittedStartOfPara (false)
343 , fEmittingList (false)
344 , fEmittingListItem (false)
345 , fTagStack ()
346 , fInTableCell (false)
347 {
348 }
349 inline StyledTextIOWriter_HTML::WriterContext::WriterContext (WriterContext& parentContext, SrcStream& srcStream)
350 : fWriter (parentContext.fWriter)
351 , fSrcStream (srcStream)
352 , fLastEmittedISR (IncrementalFontSpecification (), 0)
353 , fLastStyleChangeAt (0)
354 , fIthStyleRun (0)
355 , fLastForcedNLAt (0)
356 , fEmittedStartOfPara (false)
357 , fEmittingList (false)
358 , fEmittingListItem (false)
359 , fTagStack ()
360 , fInTableCell (true)
361 {
362 }
363 inline StyledTextIOWriter_HTML& StyledTextIOWriter_HTML::WriterContext::GetWriter () const
364 {
365 return fWriter;
366 }
367 inline StyledTextIOWriter::SrcStream& StyledTextIOWriter_HTML::WriterContext::GetSrcStream () const
368 {
369 return fSrcStream;
370 }
371 inline StyledTextIOWriter::SinkStream& StyledTextIOWriter_HTML::WriterContext::GetSinkStream () const
372 {
373 return fWriter.GetSinkStream ();
374 }
375
376}
377
378#endif /*_Stroika_Frameworks_Led_StyledTextIO_HTML_h_*/