1 /++
2 	Some support for the RTF file format - rich text format, like produced by Windows WordPad.
3 
4 	History:
5 		Added February 13, 2025
6 +/
7 module arsd.rtf;
8 
9 // https://www.biblioscape.com/rtf15_spec.htm
10 // https://latex2rtf.sourceforge.net/rtfspec_62.html
11 // https://en.wikipedia.org/wiki/Rich_Text_Format
12 
13 // spacing is in "twips" or 1/20 of a point (as in text size unit). aka 1/1440th of an inch.
14 
15 import arsd.core;
16 import arsd.color;
17 
/++
	A parsed RTF file, held as the tree of groups, control words and text runs found in it.
+/
struct RtfDocument {
	/// The outermost `{ ... }` group of the file.
	RtfGroup root;

	/++
		Walks the document depth-first and calls `dg` once for every control word
		and text run, in file order. Groups themselves are never passed to `dg`;
		they are descended into instead.

		The `state` handed to `dg` is scoped to the group being walked: each group
		starts from a copy of its parent's state, and if the group names a
		destination (see [RtfGroup.destination]) that name is stored in
		`currentDestination` for everything inside it. Because `state` is passed by
		`ref`, changes made by `dg` are seen by later pieces of the same group and
		by its nested groups, but not by the enclosing group.
	+/
	void process(void delegate(RtfPiece piece, ref RtfState state) dg) {
		recurseIntoGroup(root, RtfState.init, dg);
	}

	// Visits one group: derives the group's own state from the parent's, then
	// forwards leaf pieces to `dg` and recurses into nested groups.
	private static void recurseIntoGroup(RtfGroup group, RtfState parentState, void delegate(RtfPiece piece, ref RtfState state) dg) {
		// work on a copy so nothing leaks back into the enclosing group
		RtfState scoped = parentState;

		string dest = group.destination;
		if(dest.length != 0)
			scoped.currentDestination = dest;

		foreach(piece; group.pieces) {
			if(piece.contains != RtfPiece.Contains.group) {
				dg(piece, scoped);
				continue;
			}
			recurseIntoGroup(piece.group, scoped, dg);
		}
	}

	//Color[] colorTable;
	//Object[] fontTable;
}
55 
// Convenience overload: reinterprets the characters as raw bytes and hands them to readRtfFromBytes.
/// ditto
RtfDocument readRtfFromString(const(char)[] s) {
	auto bytes = cast(const(ubyte)[]) s;
	return readRtfFromBytes(bytes);
}
60 
// Parses a whole RTF file from memory. Throws ArsdException!"not a RTF file" when the
// input is too short or does not begin with the `{\rtf1` signature; anything else is
// left to parseRtfGroup, which reads the root group.
/// ditto
RtfDocument readRtfFromBytes(const(ubyte)[] s) {
	enum magic = `{\rtf1`;

	// the signature plus at least one more byte must be present
	if(s.length <= magic.length)
		throw new ArsdException!"not a RTF file"("too short");
	if((cast(const(char)[]) s[0 .. magic.length]) != magic)
		throw new ArsdException!"not a RTF file"("wrong magic number");

	RtfDocument document;
	document.root = parseRtfGroup(s);
	return document;
}
74 
// The per-group state handed to the delegate given to RtfDocument.process.
/// ditto
struct RtfState {
	/// Name of the destination the current piece sits in; empty when no enclosing group named one.
	string currentDestination = null;
}
79 
// Parses a tiny document and checks both the raw piece stream and the plain text rendering.
unittest {
	auto document = readRtfFromString("{\\rtf1Hello\nWorld}");
	//import std.file; auto document = readRtfFromString(readText("/home/me/test.rtf"));

	int controlWords;
	int textRuns;
	string collected;

	document.process((piece, ref state) {
		// nothing in this sample opens a destination
		assert(state.currentDestination.length == 0);

		final switch(piece.contains) {
			case RtfPiece.Contains.controlWord:
				// the only control word is the \rtf1 header
				assert(piece.controlWord.letterSequence == "rtf");
				assert(piece.controlWord.hadNumber);
				assert(piece.controlWord.number == 1);
				controlWords++;
			break;
			case RtfPiece.Contains.text:
				collected ~= piece.text;
				textRuns++;
			break;
			case RtfPiece.Contains.group:
				// process descends into groups instead of reporting them
				assert(0);
		}
	});

	assert(controlWords == 1);
	// the bare newline is skipped, which splits the text into two runs
	assert(textRuns == 2);
	assert(collected == "HelloWorld");

	assert(toPlainText(document) == "HelloWorld");
}
98 
/++
	Returns a plain text string that represents the gist of the document's content.

	Only pieces outside of any destination are used. Text runs are copied as-is,
	`\par` becomes two newlines, and control words that [RtfControlWord.toDchar]
	can translate are replaced by that character. Every other control word is dropped.
+/
string toPlainText(RtfDocument document) {
	string result;

	document.process((piece, ref state) {
		// anything inside a destination is skipped
		if(state.currentDestination.length != 0)
			return;

		final switch(piece.contains) {
			case RtfPiece.Contains.text:
				result ~= piece.text;
			break;
			case RtfPiece.Contains.controlWord:
				auto word = piece.controlWord;
				if(word.letterSequence == "par") {
					result ~= "\n\n";
					break;
				}
				dchar translated = word.toDchar;
				if(translated != dchar.init)
					result ~= translated;
			break;
			case RtfPiece.Contains.group:
				// process never hands groups to the delegate
				assert(0);
		}
	});

	return result;
}
125 
// Parses one `{ ... }` group. On entry `s` must start at the opening brace; on return
// `s` has been advanced past the matching closing brace. Nested groups are handled
// through parseRtfPiece. Throws ArsdException!"bad RTF file" if the input ends while
// the group is still open.
private RtfGroup parseRtfGroup(ref const(ubyte)[] s) {
	RtfGroup group;

	assert(s[0] == '{');
	s = s[1 .. $];
	if(s.length == 0)
		throw new ArsdException!"bad RTF file"("premature end after {");
	while(s[0] != '}') {
		group.pieces ~= parseRtfPiece(s);
		// the input ran out with the group still open, so the missing token is the closing brace
		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end before }");
	}
	// consume the closing brace
	s = s[1 .. $];
	return group;
}
141 
// Parses the next piece (control word, nested group, or text run) from the front of `s`
// and advances `s` past it. Bare \r and \n bytes are skipped, and a literal tab byte is
// reported as the \tab control word. Throws ArsdException!"bad RTF file" when the input
// runs out before a piece is found.
private RtfPiece parseRtfPiece(ref const(ubyte)[] s) {
	while(true) {
		// skipping line breaks below can exhaust the input; report that as a parse
		// error instead of indexing into an empty slice
		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end while reading a group");

		switch(s[0]) {
			case '\\':
				return RtfPiece(parseRtfControlWord(s));
			case '{':
				return RtfPiece(parseRtfGroup(s));
			case '\t':
				s = s[1 .. $];
				return RtfPiece(RtfControlWord.tab);
			case '\r':
			case '\n':
				// skip irrelevant characters
				s = s[1 .. $];
				continue;
			default:
				return RtfPiece(parseRtfText(s));
		}
	}
}
161 
// Parses a control word or control symbol. On entry `s` must start at the backslash;
// on return `s` has been advanced past everything that belongs to the control.
//
// A control word is a run of ASCII letters, optionally followed by a (possibly negative)
// decimal number and then by a single delimiting space, which is consumed and recorded
// in `hadSpaceAtEnd`. A control symbol is exactly one non-letter character and takes no
// number or delimiter; a backslash directly followed by \r or \n is reported as `par`.
//
// Throws ArsdException!"bad RTF file" if the input ends right after the backslash, right
// after the letters of a control word, or where digits are required.
//
// NOTE(review): the two hex digits of a `\'hh` escape are not consumed here, so they
// are left in `s` for the caller.
private RtfControlWord parseRtfControlWord(ref const(ubyte)[] s) {
	assert(s[0] == '\\');
	s = s[1 .. $];

	if(s.length == 0)
		throw new ArsdException!"bad RTF file"("premature end after \\");

	RtfControlWord ret;

	// A control symbol is always a single character, so letters are only gathered
	// when the first character is itself a letter. Otherwise input like `\{abc`
	// would swallow "abc" into the symbol.
	size_t pos = 1;
	if(isAlpha(cast(char) s[0])) {
		while(pos < s.length && isAlpha(cast(char) s[pos]))
			pos++;
	}

	ret.letterSequence = (cast(const char[]) s)[0 .. pos].idup;
	s = s[pos .. $];

	if(isAlpha(ret.letterSequence[0])) {
		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end after control word");

		// reads a run of decimal digits from the front of `s`
		int readNumber() {
			if(s.length == 0)
				throw new ArsdException!"bad RTF file"("premature end when reading number");
			size_t count;
			// bounded by the slice length so digits at the very end of the input are safe
			while(count < s.length && s[count] >= '0' && s[count] <= '9')
				count++;
			if(count == 0)
				throw new ArsdException!"bad RTF file"("expected negative number, got something else");

			auto buffer = cast(const(char)[]) s[0 .. count];
			s = s[count .. $];

			int accumulator;
			foreach(ch; buffer) {
				accumulator *= 10;
				accumulator += ch - '0';
			}

			return accumulator;
		}

		if(s[0] == '-') {
			// negative number
			ret.hadNumber = true;
			s = s[1 .. $];
			ret.number = - readNumber();
		} else if(s[0] >= '0' && s[0] <= '9') {
			// non-negative number
			ret.hadNumber = true;
			ret.number = readNumber();
		}

		// the number may have consumed the rest of the input, so check before peeking
		if(s.length != 0 && s[0] == ' ') {
			ret.hadSpaceAtEnd = true;
			s = s[1 .. $];
		}

	} else {
		// it was a control symbol
		if(ret.letterSequence == "\r" || ret.letterSequence == "\n")
			ret.letterSequence = "par";
	}

	return ret;
}
229 
// Takes the longest run of plain text from the front of `s`, stopping before the first
// backslash, brace, tab, \r or \n (or at the end of the input), and advances `s` past it.
// The result is empty when `s` already starts with one of those characters.
private string parseRtfText(ref const(ubyte)[] s) {
	size_t len = 0;

	scan: while(len < s.length) {
		switch(s[len]) {
			case '\\', '{', '}', '\t', '\n', '\r':
				break scan;
			default:
				len++;
				break;
		}
	}

	const(ubyte)[] run = s[0 .. len];
	s = s[len .. $];

	// FIXME: charset conversion?
	return (cast(const(char)[]) run).idup;
}
244 
// \r and \n chars w/o a \\ before them are ignored, but a \ at the end of a line is a \par
// \t is read but you should use \tab generally
// when reading, the ascii tab is translated to the \tab control word
// and bare \r / \n chars are ignored
/++
	A tagged union of the entities you can see while parsing a RTF file: a control
	word, a nested group, or a run of text.
+/
struct RtfPiece {
	/++
		Tells which kind of entity this piece holds, and therefore which of
		[controlWord], [group] or [text] may be called.
	+/
	Contains contains() {
		return contains_;
	}
	/// ditto
	enum Contains {
		controlWord,
		group,
		text
	}

	/// Wraps a control word.
	this(RtfControlWord cw) {
		this.contains_ = Contains.controlWord;
		this.controlWord_ = cw;
	}
	/// Wraps a nested group.
	this(RtfGroup g) {
		this.contains_ = Contains.group;
		this.group_ = g;
	}
	/// Wraps a run of text.
	this(string s) {
		this.contains_ = Contains.text;
		this.text_ = s;
	}

	/++
		Returns the held control word. Throws if this piece holds something else.
	+/
	RtfControlWord controlWord() {
		if(contains == Contains.controlWord)
			return controlWord_;
		throw ArsdException!"RtfPiece type mismatch"(contains);
	}
	/++
		Returns the held group. Throws if this piece holds something else.
	+/
	RtfGroup group() {
		if(contains == Contains.group)
			return group_;
		throw ArsdException!"RtfPiece type mismatch"(contains);
	}
	/++
		Returns the held text. Throws if this piece holds something else.
	+/
	string text() {
		if(contains == Contains.text)
			return text_;
		throw ArsdException!"RtfPiece type mismatch"(contains);
	}

	// discriminator for the union below
	private Contains contains_;

	private union {
		RtfControlWord controlWord_;
		RtfGroup group_;
		string text_;
	}
}
308 
// a \word thing
/++
	A control word directly from the RTF file format.
+/
struct RtfControlWord {
	/// True if a delimiting space followed the control word in the file.
	bool hadSpaceAtEnd;
	/// True if a numeric parameter was attached; `number` is only meaningful then.
	bool hadNumber;
	/// What the word is, without the leading backslash. For a control symbol, this is the symbol.
	string letterSequence;
	/// The numeric parameter, if `hadNumber` is set.
	int number;

	/// Tells whether this word is one of the destination names listed here.
	bool isDestination() {
		static immutable string[] destinations = [
			"author", "comment", "subject", "title",
			"buptim", "creatim", "printim", "revtim",
			"doccomm",
			"footer", "footerf", "footerl", "footerr",
			"footnote",
			"ftncn", "ftnsep", "ftnsepc",
			"header", "headerf", "headerl", "headerr",
			"info", "keywords", "operator",
			"pict",
			"private",
			"rxe",
			"stylesheet",
			"tc",
			"txe",
			"xe",
			"colortbl",
			"fonttbl",
		];

		foreach(name; destinations) {
			if(name == letterSequence)
				return true;
		}
		return false;
	}

	/// Returns the character this word stands for, or `dchar.init` if it has no such translation.
	dchar toDchar() {
		switch(letterSequence) {
			// escaped literal characters stand for themselves
			case "{", "}", `\`:
				return letterSequence[0];
			// \~ is a non breaking space
			case "~":
				return '\u00A0';
			case "tab":
				return '\t';
			case "line":
				return '\n';
			default:
				return dchar.init;
		}
	}

	/// False only when the word carries an explicit parameter of 0.
	bool isTurnOn() {
		return !(hadNumber && number == 0);
	}

	// take no delimiters
	/// Tells whether this is a control symbol; if true, `letterSequence` is the symbol.
	bool isControlSymbol() {
		if(letterSequence.length == 0)
			return false;
		return !isAlpha(letterSequence[0]);
	}

	/// Builds the `\tab` control word, with no number and no trailing space.
	static RtfControlWord tab() {
		RtfControlWord word;
		word.letterSequence = "tab";
		return word;
	}
}
377 
// True for the ASCII letters a-z and A-Z only; every other value, including bytes >= 0x80, is rejected.
private bool isAlpha(char c) {
	switch(c) {
		case 'a': .. case 'z':
		case 'A': .. case 'Z':
			return true;
		default:
			return false;
	}
}
381 
// a { ... } thing
/++
	A group directly from the RTF file.
+/
struct RtfGroup {
	/// Everything found between the group's braces, in file order.
	RtfPiece[] pieces;

	/++
		Returns the name of the destination this group opens, or `null` if it opens none.

		For a starred group, that is the control word right after the `\*`, whatever
		it is. Otherwise it is the first piece, provided it is a control word for
		which [RtfControlWord.isDestination] is true.
	+/
	string destination() {
		if(isStarred()) {
			if(pieces.length > 1 && pieces[1].contains == RtfPiece.Contains.controlWord)
				return pieces[1].controlWord.letterSequence;
			return null;
		}

		if(pieces.length == 0 || pieces[0].contains != RtfPiece.Contains.controlWord)
			return null;

		auto first = pieces[0].controlWord;
		return first.isDestination ? first.letterSequence : null;
	}

	/// Tells whether the group begins with the `\*` control symbol.
	bool isStarred() {
		if(pieces.length == 0)
			return false;

		auto lead = pieces[0];
		return lead.contains == RtfPiece.Contains.controlWord && lead.controlWord.letterSequence == "*";
	}
}
399 
400 /+
401 	\pard = paragraph defaults
402 +/