/++
	Some support for the RTF file format - rich text format, like produced by Windows WordPad.

	History:
		Added February 13, 2025
+/
module arsd.rtf;

// https://www.biblioscape.com/rtf15_spec.htm
// https://latex2rtf.sourceforge.net/rtfspec_62.html
// https://en.wikipedia.org/wiki/Rich_Text_Format

// spacing is in "twips" or 1/20 of a point (as in text size unit). aka 1/1440th of an inch.

import arsd.core;
import arsd.color;

/++
	A parsed RTF document: the tree of groups read from the file, rooted at [root].

	Obtain one from [readRtfFromString] or [readRtfFromBytes].
+/
struct RtfDocument {
	/// The outermost `{ ... }` group of the file.
	RtfGroup root;

	/++
		There are two helper functions to process a RTF file: one that does minimal processing
		and sends you the data as it appears in the file, and one that sends you preprocessed
		results upon significant state changes.

		The former makes you do more work, but also exposes (almost) the whole file to you (it is still partially processed). The latter lets you just get down to business processing the text, but is not a complete implementation.

		This function is the minimal-processing one: it walks the tree depth first and calls `dg`
		once for every control word and text piece. Groups themselves are never passed to `dg`;
		they are descended into instead.

		Params:
			dg = callback receiving each non-group piece along with the state of the group that contains it
	+/
	void process(void delegate(RtfPiece piece, ref RtfState state) dg) {
		recurseIntoGroup(root, RtfState.init, dg);
	}

	// Walks one group. Each group gets its own copy of the parent's state, so whatever
	// changes inside a group does not leak back out when the group ends.
	private static void recurseIntoGroup(RtfGroup group, RtfState parentState, void delegate(RtfPiece piece, ref RtfState state) dg) {
		// might need to copy...
		RtfState state = parentState;

		// a group that does not name a destination of its own inherits the parent's
		auto newDestination = group.destination;
		if(newDestination.length)
			state.currentDestination = newDestination;

		foreach(piece; group.pieces) {
			if(piece.contains == RtfPiece.Contains.group) {
				recurseIntoGroup(piece.group, state, dg);
			} else {
				dg(piece, state);
			}
		}

	}

	//Color[] colorTable;
	//Object[] fontTable;
}

/// ditto
RtfDocument readRtfFromString(const(char)[] s) {
	return readRtfFromBytes(cast(const(ubyte)[]) s);
}

/// ditto
RtfDocument readRtfFromBytes(const(ubyte)[] s) {
	RtfDocument document;

	// the shortest input accepted is `{\rtf1}`, which is 7 bytes
	if(s.length < 7)
		throw new ArsdException!"not a RTF file"("too short");
	// compare as const chars; the input must not have its constness cast away
	if((cast(const(char)[]) s[0 .. 6]) != `{\rtf1`)
		throw new ArsdException!"not a RTF file"("wrong magic number");

	document.root = parseRtfGroup(s);

	return document;
}

/// ditto
struct RtfState {
	/// Name of the innermost enclosing destination, or empty when in the main document text.
	string currentDestination;
}

unittest {
	auto document = readRtfFromString("{\\rtf1Hello\nWorld}");
	//import std.file; auto document = readRtfFromString(readText("/home/me/test.rtf"));
	document.process((piece, ref state) {
		final switch(piece.contains) {
			case RtfPiece.Contains.controlWord:
				// writeln(state.currentDestination, ": ", piece.controlWord);
			break;
			case RtfPiece.Contains.text:
				// writeln(state.currentDestination, ": ", piece.text);
			break;
			case RtfPiece.Contains.group:
				assert(0);
		}
	});

	// the unescaped newline is ignored, so the two words run together
	assert(toPlainText(document) == "HelloWorld");
}

unittest {
	// a control symbol is exactly one character; following letters are ordinary text
	assert(toPlainText(readRtfFromString("{\\rtf1 a\\~b}")) == "a\u00A0b");
	assert(toPlainText(readRtfFromString("{\\rtf1 x\\{y\\}z}")) == "x{y}z");

	// \'hh hex escapes are consumed whole
	assert(toPlainText(readRtfFromString("{\\rtf1 caf\\'41!}")) == "cafA!");

	// a line break right before the closing brace does not create an empty piece
	assert(readRtfFromString("{\\rtf1 one\n}").root.pieces.length == 2);

	// destinations are left out of the plain text
	assert(toPlainText(readRtfFromString("{\\rtf1{\\fonttbl{\\f0 Arial;}}Hi}")) == "Hi");

	// truncated input is reported with an exception instead of a range violation
	foreach(broken; ["{\\rtf1 abc\n", "{\\rtf12", "{\\rtf1 \\b1"]) {
		bool threw;
		try
			readRtfFromString(broken);
		catch(Exception e)
			threw = true;
		assert(threw, broken);
	}
}

/++
	Returns a plain text string that represents the gist of the document's content.

	Everything inside a destination group is skipped. `\par` becomes a blank line and control
	words that stand for a single character (see [RtfControlWord.toDchar]) are replaced by
	that character; all other control words are dropped.
+/
string toPlainText(RtfDocument document) {
	string ret;
	document.process((piece, ref state) {
		// only the main body counts; destinations hold tables, metadata, etc.
		if(state.currentDestination.length)
			return;

		final switch(piece.contains) {
			case RtfPiece.Contains.controlWord:
				if(piece.controlWord.letterSequence == "par")
					ret ~= "\n\n";
				else if(piece.controlWord.toDchar != dchar.init)
					ret ~= piece.controlWord.toDchar;
			break;
			case RtfPiece.Contains.text:
				ret ~= piece.text;
			break;
			case RtfPiece.Contains.group:
				// process never hands groups to the callback
				assert(0);
		}
	});

	return ret;
}

// Drops leading \r and \n bytes, which carry no meaning in RTF unless escaped.
private void skipIgnoredLineBreaks(ref const(ubyte)[] s) {
	while(s.length && (s[0] == '\r' || s[0] == '\n'))
		s = s[1 .. $];
}

// Value of a hexadecimal digit, or -1 if `c` is not one.
private int hexDigitValue(ubyte c) {
	if(c >= '0' && c <= '9')
		return c - '0';
	if(c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if(c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

// Parses one `{ ... }` group. `s` must start at the opening brace; on return it points
// just past the matching closing brace. Throws if the input ends before the group closes.
private RtfGroup parseRtfGroup(ref const(ubyte)[] s) {
	RtfGroup group;

	assert(s[0] == '{');
	s = s[1 .. $];
	if(s.length == 0)
		throw new ArsdException!"bad RTF file"("premature end after {");

	while(true) {
		// skip here too, so a line break right before the } does not turn into an empty text piece
		skipIgnoredLineBreaks(s);
		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end before }");
		if(s[0] == '}')
			break;
		group.pieces ~= parseRtfPiece(s);
	}

	s = s[1 .. $];
	return group;
}

// Parses the next control word, nested group, or run of text and advances `s` past it.
private RtfPiece parseRtfPiece(ref const(ubyte)[] s) {
	// skip irrelevant characters
	skipIgnoredLineBreaks(s);
	if(s.length == 0)
		throw new ArsdException!"bad RTF file"("premature end where a piece was expected");

	switch(s[0]) {
		case '\\':
			return RtfPiece(parseRtfControlWord(s));
		case '{':
			return RtfPiece(parseRtfGroup(s));
		case '\t':
			// a literal tab is reported the same way as the \tab control word
			s = s[1 .. $];
			return RtfPiece(RtfControlWord.tab);
		default:
			return RtfPiece(parseRtfText(s));
	}
}

// Parses a control word (`\letters`, optional number, optional single space) or a
// control symbol (`\` followed by exactly one non-letter). `s` must start at the backslash.
private RtfControlWord parseRtfControlWord(ref const(ubyte)[] s) {
	assert(s[0] == '\\');
	s = s[1 .. $];

	if(s.length == 0)
		throw new ArsdException!"bad RTF file"("premature end after \\");

	RtfControlWord ret;

	if(isAlpha(cast(char) s[0])) {
		size_t pos = 1;
		while(pos < s.length && isAlpha(cast(char) s[pos]))
			pos++;

		ret.letterSequence = (cast(const char[]) s)[0 .. pos].idup;
		s = s[pos .. $];

		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end after control word");

		int readNumber() {
			if(s.length == 0)
				throw new ArsdException!"bad RTF file"("premature end when reading number");
			size_t count;
			// the digits may run right up to the end of a truncated file
			while(count < s.length && s[count] >= '0' && s[count] <= '9')
				count++;
			if(count == 0)
				throw new ArsdException!"bad RTF file"("expected negative number, got something else");

			auto buffer = cast(const(char)[]) s[0 .. count];
			s = s[count .. $];

			// accumulate in a wider type so an absurdly long number is reported, not wrapped
			long accumulator;
			foreach(ch; buffer) {
				accumulator = accumulator * 10 + (ch - '0');
				if(accumulator > int.max)
					throw new ArsdException!"bad RTF file"("number out of range");
			}

			return cast(int) accumulator;
		}

		if(s[0] == '-') {
			// negative number
			ret.hadNumber = true;
			s = s[1 .. $];
			ret.number = - readNumber();
		} else if(s[0] >= '0' && s[0] <= '9') {
			// non-negative number
			ret.hadNumber = true;
			ret.number = readNumber();
		}

		// a single space after the word is a delimiter and belongs to the word, not the text
		if(s.length && s[0] == ' ') {
			ret.hadSpaceAtEnd = true;
			s = s[1 .. $];
		}

	} else {
		// it was a control symbol: always exactly one character, whatever follows it
		ret.letterSequence = (cast(const char[]) s)[0 .. 1].idup;
		s = s[1 .. $];

		if(ret.letterSequence == "\r" || ret.letterSequence == "\n") {
			ret.letterSequence = "par";
		} else if(ret.letterSequence == "'" && s.length >= 2) {
			// \'hh: two hex digits giving a byte value in the document's character set.
			// Malformed escapes are left alone rather than rejected.
			auto high = hexDigitValue(s[0]);
			auto low = hexDigitValue(s[1]);
			if(high >= 0 && low >= 0) {
				ret.hadNumber = true;
				ret.number = high * 16 + low;
				s = s[2 .. $];
			}
		}
	}

	return ret;
}

// Takes the run of plain text up to (not including) the next byte that has special meaning.
private string parseRtfText(ref const(ubyte)[] s) {
	size_t end = s.length;
	foreach(idx, ch; s) {
		if(ch == '\\' || ch == '{' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '}') {
			end = idx;
			break;
		}
	}
	auto ret = s[0 .. end];
	s = s[end .. $];

	// FIXME: charset conversion?
	return (cast(const char[]) ret).idup;
}

// \r and \n chars w/o a \\ before them are ignored. but \ at the end of a line is a \par
// \t is read but you should use \tab generally
// when reading, ima translate the ascii tab to \tab control word
// and ignore unescaped \r and \n
/++
	A union of entities you can see while parsing a RTF file.

	Check [contains] first; asking for a member other than the one it reports throws.
+/
struct RtfPiece {
	/++
		Tells which of [controlWord], [group], or [text] this piece holds.
	+/
	Contains contains() {
		return contains_;
	}
	/// ditto
	enum Contains {
		controlWord,
		group,
		text
	}

	this(RtfControlWord cw) {
		this.controlWord_ = cw;
		this.contains_ = Contains.controlWord;
	}
	this(RtfGroup g) {
		this.group_ = g;
		this.contains_ = Contains.group;
	}
	this(string s) {
		this.text_ = s;
		this.contains_ = Contains.text;
	}

	/++
		The control word held by this piece. Throws if [contains] is not `Contains.controlWord`.
	+/
	RtfControlWord controlWord() {
		if(contains != Contains.controlWord)
			throw ArsdException!"RtfPiece type mismatch"(contains);
		return controlWord_;
	}
	/++
		The nested group held by this piece. Throws if [contains] is not `Contains.group`.
	+/
	RtfGroup group() {
		if(contains != Contains.group)
			throw ArsdException!"RtfPiece type mismatch"(contains);
		return group_;
	}
	/++
		The text held by this piece. Throws if [contains] is not `Contains.text`.
	+/
	string text() {
		if(contains != Contains.text)
			throw ArsdException!"RtfPiece type mismatch"(contains);
		return text_;
	}

	private Contains contains_;

	// only the member selected by contains_ is meaningful
	private union {
		RtfControlWord controlWord_;
		RtfGroup group_;
		string text_;
	}
}

// a \word thing
/++
	A control word directly from the RTF file format.
+/
struct RtfControlWord {
	/// Whether a delimiting space followed the word in the file.
	bool hadSpaceAtEnd;
	/// Whether a numeric parameter was present; [number] is only meaningful if so.
	bool hadNumber;
	string letterSequence; // what the word is
	/// The numeric parameter. For the `'` control symbol it is the value of the two hex digits.
	int number;

	/// Whether this word is one of the destinations this module knows about.
	bool isDestination() {
		switch(letterSequence) {
			case
				"author", "comment", "subject", "title",
				"buptim", "creatim", "printim", "revtim",
				"doccomm",
				"footer", "footerf", "footerl", "footerr",
				"footnote",
				"ftncn", "ftnsep", "ftnsepc",
				"header", "headerf", "headerl", "headerr",
				"info", "keywords", "operator",
				"pict",
				"private",
				"rxe",
				"stylesheet",
				"tc",
				"txe",
				"xe":
				return true;
			case "colortbl":
				return true;
			case "fonttbl":
				return true;

			default: return false;
		}
	}

	/++
		The single character this word stands for, or `dchar.init` if it does not stand for one.

		A `\'hh` escape is only translated when its value is plain ASCII; higher values depend
		on the document's character set, which is not interpreted here.
	+/
	dchar toDchar() {
		switch(letterSequence) {
			case "{": return '{';
			case "}": return '}';
			case `\`: return '\\';
			case "~": return '\u00A0'; // non-breaking space
			case "tab": return '\t';
			case "line": return '\n';
			case "'":
				if(hadNumber && number >= 0 && number < 0x80)
					return cast(dchar) number;
				return dchar.init;
			default: return dchar.init;
		}
	}

	/// A word with no number, or with any number other than 0, turns its property on.
	bool isTurnOn() {
		return !hadNumber || number != 0;
	}

	// take no delimiters
	bool isControlSymbol() {
		// if true, the letterSequence is the symbol
		return letterSequence.length && !isAlpha(letterSequence[0]);
	}

	// letterSequence == ~ is a non breaking space

	/// The `\tab` control word, as also produced for a literal tab character in the file.
	static RtfControlWord tab() {
		RtfControlWord w;
		w.letterSequence = "tab";
		return w;
	}
}

private bool isAlpha(char c) {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

// a { ... } thing
/++
	A group directly from the RTF file.
+/
struct RtfGroup {
	/// Everything inside the braces, in file order.
	RtfPiece[] pieces;

	/++
		The name of the destination this group introduces, or `null` if it does not introduce one.

		A starred group (`{\*\name ...}`) is named by the control word following the `\*`.
		Otherwise the first piece must be a control word for which
		[RtfControlWord.isDestination] is true.
	+/
	string destination() {
		if(isStarred()) {
			if(pieces.length > 1 && pieces[1].contains == RtfPiece.Contains.controlWord)
				return pieces[1].controlWord.letterSequence;
			return null;
		}

		if(pieces.length && pieces[0].contains == RtfPiece.Contains.controlWord && pieces[0].controlWord.isDestination)
			return pieces[0].controlWord.letterSequence;
		return null;
	}

	/// Whether the group starts with the `\*` control symbol.
	bool isStarred() {
		return (pieces.length && pieces[0].contains == RtfPiece.Contains.controlWord && pieces[0].controlWord.letterSequence == "*");
	}
}

/+
	\pard = paragraph defaults
+/