1 /++
2 	Some support for the RTF file format - rich text format, like produced by Windows WordPad.
3 
4 	History:
5 		Added February 13, 2025
6 +/
7 module arsd.rtf;
8 
9 // https://www.biblioscape.com/rtf15_spec.htm
10 // https://latex2rtf.sourceforge.net/rtfspec_62.html
11 // https://en.wikipedia.org/wiki/Rich_Text_Format
12 
13 // spacing is in "twips" or 1/20 of a point (as in text size unit). aka 1/1440th of an inch.
14 
15 import arsd.core;
16 import arsd.color;
17 
/++
	A parsed RTF file, stored as a tree: [root] is the outermost `{ ... }` group and
	nested groups appear as pieces inside it.

	Create one with [readRtfFromString] or [readRtfFromBytes], then walk it with [process],
	which hands each piece to your delegate together with a [RtfState].
+/
struct RtfDocument {
	/// The outermost group of the file; every other piece hangs off of this.
	RtfGroup root;

	/++
		There are two helper functions to process a RTF file: one that does minimal processing
		and sends you the data as it appears in the file, and one that sends you preprocessed
		results upon significant state changes.

		The former makes you do more work, but also exposes (almost) the whole file to you (it is still partially processed). The latter lets you just get down to business processing the text, but is not a complete implementation.

		This function walks the tree depth-first, in file order, and calls `dg` once for every
		control word and text piece. Groups themselves are never passed to `dg`; they are
		descended into instead.

		Params:
			dg = called with each non-group piece and the state of the group that contains it.
			The state is passed by `ref`, so changes the delegate makes stay visible for the
			rest of that group and are copied into groups nested after that point.
	+/
	void process(void delegate(RtfPiece piece, ref RtfState state) dg) {
		recurseIntoGroup(root, RtfState.init, dg);
	}

	// Visits one group. `parentState` is taken by value, so whatever happens to the state
	// inside this group (or inside its children) never leaks back out to the caller.
	private static void recurseIntoGroup(RtfGroup group, RtfState parentState, void delegate(RtfPiece piece, ref RtfState state) dg) {
		// might need to copy...
		RtfState groupState = parentState;

		// a group that names a destination overrides the inherited one; otherwise it is kept
		string destinationName = group.destination;
		if(destinationName.length != 0)
			groupState.currentDestination = destinationName;

		foreach(child; group.pieces) {
			if(child.contains != RtfPiece.Contains.group) {
				dg(child, groupState);
				continue;
			}

			// nested group: it starts from a copy of our current state
			recurseIntoGroup(child.group, groupState, dg);
		}
	}

	//Color[] colorTable;
	//Object[] fontTable;
}
55 
56 /// ditto
RtfDocument readRtfFromString(const(char)[] s) {
	// The parser works on raw bytes, so the characters are reinterpreted in place
	// (no copy, no transcoding) and handed to the byte-based reader.
	auto rawBytes = cast(const(ubyte)[]) s;
	return readRtfFromBytes(rawBytes);
}
60 
61 /// ditto
RtfDocument readRtfFromBytes(const(ubyte)[] s) {
	// Parses a complete RTF file held in memory.
	//
	// Throws ArsdException!"not a RTF file" when the input is shorter than seven bytes or
	// does not start with `{\rtf1`; errors from the group parser propagate unchanged.
	// Anything that follows the outermost group's closing brace is not looked at.

	enum magic = `{\rtf1`;
	// the magic plus at least one more byte (the shortest accepted input is `{\rtf1}`)
	enum minimumLength = magic.length + 1;

	if(s.length < minimumLength)
		throw new ArsdException!"not a RTF file"("too short");

	auto header = cast(const(char)[]) s[0 .. magic.length];
	if(header != magic)
		throw new ArsdException!"not a RTF file"("wrong magic number");

	RtfDocument document;
	document.root = parseRtfGroup(s);
	return document;
}
74 
75 /// ditto
struct RtfState {
	// Letter sequence of the destination control word that applies to the piece being
	// visited. It is set when a group reports a destination and inherited by the groups
	// nested inside it; it stays empty while no enclosing group reported one.
	string currentDestination;
}
79 
unittest {
	// Smoke test: a minimal document must parse, and walking it must only ever
	// deliver control words and text to the delegate - never a group.
	auto parsed = readRtfFromString("{\\rtf1Hello\nWorld}");
	//import std.file; auto document = readRtfFromString(readText("/home/me/test.rtf"));

	parsed.process((piece, ref state) {
		final switch(piece.contains) {
			case RtfPiece.Contains.group:
				// process descends into groups itself, so one must never show up here
				assert(0);
			case RtfPiece.Contains.text:
				// writeln(state.currentDestination, ": ", piece.text);
			break;
			case RtfPiece.Contains.controlWord:
				// writeln(state.currentDestination, ": ", piece.controlWord);
			break;
		}
	});

	// writeln(toPlainText(document));
}
98 
// Flattens a document to plain text: text pieces are copied as-is, `\par` becomes a blank
// line ("\n\n"), control words with a single-character equivalent (see
// RtfControlWord.toDchar) become that character, and all other control words are dropped.
// Everything found while a destination is active is skipped entirely.
string toPlainText(RtfDocument document) {
	string plain;

	void collect(RtfPiece piece, ref RtfState state) {
		// content of a destination group is not part of the body text
		if(state.currentDestination.length != 0)
			return;

		final switch(piece.contains) {
			case RtfPiece.Contains.text: {
				plain ~= piece.text;
				break;
			}
			case RtfPiece.Contains.controlWord: {
				auto word = piece.controlWord;
				if(word.letterSequence == "par") {
					plain ~= "\n\n";
				} else {
					// dchar.init is toDchar's way of saying "no character equivalent"
					dchar replacement = word.toDchar;
					if(replacement != dchar.init)
						plain ~= replacement;
				}
				break;
			}
			case RtfPiece.Contains.group:
				// process never passes groups to the delegate
				assert(0);
		}
	}

	document.process(&collect);

	return plain;
}
122 
// Parses one `{ ... }` group, nested groups included.
//
// On entry `s` must start at the opening brace; on return it has been advanced past the
// matching closing brace.
//
// Throws ArsdException!"bad RTF file" when the input ends before the group is closed.
private RtfGroup parseRtfGroup(ref const(ubyte)[] s) {
	RtfGroup group;

	assert(s.length && s[0] == '{');
	s = s[1 .. $];
	if(s.length == 0)
		throw new ArsdException!"bad RTF file"("premature end after {");

	while(true) {
		// Bare line breaks carry no meaning in RTF. They are dropped here, before looking
		// for the closing brace, because handing "\n}" to parseRtfPiece would come back
		// as a useless empty text piece, and "\n" followed by the end of input would make
		// it index past the end of the slice.
		while(s.length && (s[0] == '\r' || s[0] == '\n'))
			s = s[1 .. $];

		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end before }");
		if(s[0] == '}')
			break;

		group.pieces ~= parseRtfPiece(s);
	}

	// consume the closing brace
	s = s[1 .. $];
	return group;
}
138 
// Parses the next piece (control word, nested group or run of text) and advances `s`
// past it.
//
// Bare `\r` and `\n` bytes in front of the piece are skipped, and a raw tab byte is
// reported as the `\tab` control word. If the first significant byte is a `}` the result
// is an empty text piece and the brace is left in `s` for the caller to consume.
//
// Throws ArsdException!"bad RTF file" when the input runs out before a piece is found.
private RtfPiece parseRtfPiece(ref const(ubyte)[] s) {
	while(true) {
		// without this check, input that ends in a line break would be indexed out of
		// bounds on the next iteration instead of being reported as a bad file
		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end while looking for the next piece");

		switch(s[0]) {
			case '\\':
				return RtfPiece(parseRtfControlWord(s));
			case '{':
				return RtfPiece(parseRtfGroup(s));
			case '\t':
				s = s[1 .. $];
				return RtfPiece(RtfControlWord.tab);
			case '\r':
			case '\n':
				// skip irrelevant characters, then look at the next byte
				s = s[1 .. $];
				continue;
			default:
				return RtfPiece(parseRtfText(s));
		}
	}
}
158 
// Parses what follows a backslash: either a control word (a run of ASCII letters with an
// optional, possibly negative, decimal number and an optional single trailing space) or a
// control symbol (exactly one non-letter character).
//
// On entry `s` must start at the backslash; on return it has been advanced past everything
// that belongs to the control word or symbol. A backslash directly followed by `\r` or
// `\n` is reported as the `par` control word.
//
// Throws ArsdException!"bad RTF file" on truncated input, on a `-` that is not followed by
// digits, and on a number that does not fit in an `int`.
private RtfControlWord parseRtfControlWord(ref const(ubyte)[] s) {
	assert(s.length && s[0] == '\\');
	s = s[1 .. $];

	if(s.length == 0)
		throw new ArsdException!"bad RTF file"("premature end after \\");

	RtfControlWord ret;

	// A control symbol is exactly one character long; only a control word extends over the
	// following letters. Scanning for letters unconditionally would glue ordinary text onto
	// the symbol (`\{abc` would become the sequence "{abc") and lose that text.
	size_t pos = 1;
	if(isAlpha(cast(char) s[0])) {
		while(pos < s.length && isAlpha(cast(char) s[pos]))
			pos++;
	}

	ret.letterSequence = (cast(const char[]) s)[0 .. pos].idup;
	s = s[pos .. $];

	if(isAlpha(ret.letterSequence[0])) {
		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end after control word");

		// reads a run of decimal digits from the front of `s` and consumes it
		int readNumber() {
			if(s.length == 0)
				throw new ArsdException!"bad RTF file"("premature end when reading number");

			// bounded scan: the digits may run right up to the end of the input
			size_t count;
			while(count < s.length && s[count] >= '0' && s[count] <= '9')
				count++;
			// only reachable after a '-', the other caller has already seen a digit
			if(count == 0)
				throw new ArsdException!"bad RTF file"("expected negative number, got something else");

			auto buffer = cast(const(char)[]) s[0 .. count];
			s = s[count .. $];

			// accumulate in a wider type so an absurdly long digit run is reported
			// instead of silently wrapping around
			long accumulator;
			foreach(ch; buffer) {
				accumulator = accumulator * 10 + (ch - '0');
				if(accumulator > int.max)
					throw new ArsdException!"bad RTF file"("number out of range");
			}

			return cast(int) accumulator;
		}

		if(s[0] == '-') {
			// negative number
			ret.hadNumber = true;
			s = s[1 .. $];
			ret.number = - readNumber();
		} else if(s[0] >= '0' && s[0] <= '9') {
			// non-negative number
			ret.hadNumber = true;
			ret.number = readNumber();
		}

		// a single space after the word is a delimiter that belongs to the word, not to
		// the text; the number may have consumed the rest of the input, hence the check
		if(s.length && s[0] == ' ') {
			ret.hadSpaceAtEnd = true;
			s = s[1 .. $];
		}

	} else {
		// it was a control symbol
		if(ret.letterSequence == "\r" || ret.letterSequence == "\n")
			ret.letterSequence = "par";
	}

	return ret;
}
226 
// Takes the longest run of plain text from the front of `s`, i.e. everything up to (but
// not including) the first backslash, brace, tab, carriage return or line feed, or up to
// the end of the input. `s` is advanced past the run; the result may be empty.
private string parseRtfText(ref const(ubyte)[] s) {
	// bytes that end a run of text because they start (or end) some other construct
	static bool endsText(ubyte b) {
		return b == '\\' || b == '{' || b == '}' || b == '\t' || b == '\n' || b == '\r';
	}

	size_t taken = 0;
	while(taken < s.length && !endsText(s[taken]))
		taken++;

	const(char)[] run = cast(const(char)[]) s[0 .. taken];
	s = s[taken .. $];

	// FIXME: charset conversion?
	return run.idup;
}
241 
// \r and \n characters without a \ before them are ignored, but a \ at the end of a line is a \par.
// \t is read, but you should generally use \tab.
// When reading, a raw ASCII tab is translated to the \tab control word
// and raw line breaks are ignored.
struct RtfPiece {
	/++
		Tells which kind of content this piece holds, and therefore which one of
		[controlWord], [group] and [text] may be called on it.
	+/
	Contains contains() {
		return contains_;
	}
	/// ditto
	enum Contains {
		controlWord,
		group,
		text
	}

	/// Makes a piece that holds a control word (or control symbol).
	this(RtfControlWord cw) {
		this.contains_ = Contains.controlWord;
		this.controlWord_ = cw;
	}
	/// Makes a piece that holds a nested group.
	this(RtfGroup g) {
		this.contains_ = Contains.group;
		this.group_ = g;
	}
	/// Makes a piece that holds a run of text.
	this(string s) {
		this.contains_ = Contains.text;
		this.text_ = s;
	}

	/++
		Returns the control word held by this piece.

		Throws: `ArsdException!"RtfPiece type mismatch"` if the piece holds something else.
	+/
	RtfControlWord controlWord() {
		if(contains_ == Contains.controlWord)
			return controlWord_;
		throw ArsdException!"RtfPiece type mismatch"(contains);
	}
	/++
		Returns the nested group held by this piece.

		Throws: `ArsdException!"RtfPiece type mismatch"` if the piece holds something else.
	+/
	RtfGroup group() {
		if(contains_ == Contains.group)
			return group_;
		throw ArsdException!"RtfPiece type mismatch"(contains);
	}
	/++
		Returns the text held by this piece.

		Throws: `ArsdException!"RtfPiece type mismatch"` if the piece holds something else.
	+/
	string text() {
		if(contains_ == Contains.text)
			return text_;
		throw ArsdException!"RtfPiece type mismatch"(contains);
	}

	// tag saying which member of the union below is the live one
	private Contains contains_;

	// storage shared by the three possible payloads; the accessors above only ever hand
	// out the member that matches contains_
	private union {
		RtfControlWord controlWord_;
		RtfGroup group_;
		string text_;
	}
}
302 
303 // a \word thing
struct RtfControlWord {
	bool hadSpaceAtEnd; // a delimiting space followed the word in the file
	bool hadNumber; // a numeric parameter followed the word; its value is in `number`
	string letterSequence; // what the word is
	int number; // the numeric parameter, only meaningful if `hadNumber`

	// Is this one of the words this module treats as starting a destination group?
	bool isDestination() {
		switch(letterSequence) {
			case
			"author", "comment", "subject", "title",
			"buptim", "creatim", "printim", "revtim",
			"doccomm",
			"footer", "footerf", "footerl", "footerr",
			"footnote",
			"ftncn", "ftnsep", "ftnsepc",
			"header", "headerf", "headerl", "headerr",
			"info", "keywords", "operator",
			"pict",
			"private",
			"rxe",
			"stylesheet",
			"tc",
			"txe",
			"xe",
			"colortbl",
			"fonttbl":
				return true;

			default:
				return false;
		}
	}

	// The single character this word or symbol stands for, or dchar.init if it has none.
	dchar toDchar() {
		switch(letterSequence) {
			case "tab": return '\t';
			case "line": return '\n';
			case "~": return '\u00A0'; // non breaking space
			case "{": return '{';
			case "}": return '}';
			case `\`: return '\\';
			default: return dchar.init;
		}
	}

	// True unless the word carries an explicit numeric parameter of zero.
	bool isTurnOn() {
		if(hadNumber)
			return number != 0;
		return true;
	}

	// take no delimiters
	bool isControlSymbol() {
		// if true, the letterSequence is the symbol
		if(letterSequence.length == 0)
			return false;
		return !isAlpha(letterSequence[0]);
	}

	// The `\tab` control word, with no number and no trailing space recorded.
	static RtfControlWord tab() {
		return RtfControlWord(false, false, "tab", 0);
	}
}
368 
// ASCII-only letter test: true for 'a'-'z' and 'A'-'Z', false for every other byte value.
private bool isAlpha(char c) {
	switch(c) {
		case 'a': .. case 'z':
		case 'A': .. case 'Z':
			return true;
		default:
			return false;
	}
}
372 
373 // a { ... } thing
struct RtfGroup {
	// everything between the braces, in file order; nested groups are pieces too
	RtfPiece[] pieces;

	// Name of the destination this group introduces, or null if it introduces none.
	//
	// In a starred group (`{\*\word ...}`) the control word right after the `\*` is taken
	// as the destination no matter what it is. Otherwise the group's first piece must be a
	// control word for which RtfControlWord.isDestination is true.
	string destination() {
		const bool starred = isStarred();
		// position of the control word that could name the destination
		const size_t at = starred ? 1 : 0;

		if(pieces.length <= at)
			return null;
		if(pieces[at].contains != RtfPiece.Contains.controlWord)
			return null;

		auto word = pieces[at].controlWord;
		if(!starred && !word.isDestination)
			return null;

		return word.letterSequence;
	}

	// Does the group open with the `\*` control symbol?
	bool isStarred() {
		if(pieces.length == 0)
			return false;

		auto first = pieces[0];
		if(first.contains != RtfPiece.Contains.controlWord)
			return false;

		return first.controlWord.letterSequence == "*";
	}
}
387 
388 /+
389 	\pard = paragraph defaults
390 +/