/++
	Some support for the RTF file format - rich text format, like produced by Windows WordPad.

	History:
		Added February 13, 2025
+/
module arsd.rtf;

// https://www.biblioscape.com/rtf15_spec.htm
// https://latex2rtf.sourceforge.net/rtfspec_62.html
// https://en.wikipedia.org/wiki/Rich_Text_Format

// spacing is in "twips" or 1/20 of a point (as in text size unit). aka 1/1440th of an inch.

import arsd.core;
import arsd.color;

/++
	A parsed RTF file: a tree of groups whose leaves are control words and runs of text.

	Create one with [readRtfFromString] or [readRtfFromBytes], then walk it with [process].
+/
struct RtfDocument {
	/// The outermost `{ ... }` group of the file.
	RtfGroup root;

	/++
		There are two helper functions to process a RTF file: one that does minimal processing
		and sends you the data as it appears in the file, and one that sends you preprocessed
		results upon significant state changes.

		The former makes you do more work, but also exposes (almost) the whole file to you (it is still partially processed). The latter lets you just get down to business processing the text, but is not a complete implementation.

		Params:
			dg = called once for every control word and text piece, in file order. Groups are
			descended into rather than passed to the delegate; the `state` argument tells you
			which destination (if any) the piece belongs to.
	+/
	void process(void delegate(RtfPiece piece, ref RtfState state) dg) {
		recurseIntoGroup(root, RtfState.init, dg);
	}

	// Walks one group depth-first. The state is taken by value so changes made while inside
	// a group do not leak back out to its parent once the group ends.
	private static void recurseIntoGroup(RtfGroup group, RtfState parentState, void delegate(RtfPiece piece, ref RtfState state) dg) {
		// might need to copy...
		RtfState state = parentState;
		auto newDestination = group.destination;
		if(newDestination.length)
			state.currentDestination = newDestination;

		foreach(piece; group.pieces) {
			if(piece.contains == RtfPiece.Contains.group) {
				recurseIntoGroup(piece.group, state, dg);
			} else {
				dg(piece, state);
			}
		}

	}

	//Color[] colorTable;
	//Object[] fontTable;
}

/// ditto
RtfDocument readRtfFromString(const(char)[] s) {
	return readRtfFromBytes(cast(const(ubyte)[]) s);
}

/// ditto
RtfDocument readRtfFromBytes(const(ubyte)[] s) {
	RtfDocument document;

	// the smallest thing accepted is `{\rtf1}` - seven bytes
	if(s.length < 7)
		throw new ArsdException!"not a RTF file"("too short");
	if((cast(const(char)[]) s[0 .. 6]) != `{\rtf1`)
		throw new ArsdException!"not a RTF file"("wrong magic number");

	document.root = parseRtfGroup(s);

	return document;
}

/// ditto
struct RtfState {
	/// Name of the destination control word of the innermost enclosing destination group, or null when in the main document text.
	string currentDestination;
}

unittest {
	auto document = readRtfFromString("{\\rtf1Hello\nWorld}");
	//import std.file; auto document = readRtfFromString(readText("/home/me/test.rtf"));
	document.process((piece, ref state) {
		final switch(piece.contains) {
			case RtfPiece.Contains.controlWord:
				// writeln(state.currentDestination, ": ", piece.controlWord);
			break;
			case RtfPiece.Contains.text:
				// writeln(state.currentDestination, ": ", piece.text);
			break;
			case RtfPiece.Contains.group:
				assert(0);
		}
	});

	// the unescaped newline is not part of the text
	assert(toPlainText(document) == "HelloWorld");
	// writeln(toPlainText(document));
}

unittest {
	// control symbols are exactly one character long, even when letters follow them,
	// and a backslash right before a line break is a paragraph break
	auto document = readRtfFromString("{\\rtf1 a\\{b\\}c\\~d\\\ne}");
	assert(toPlainText(document) == "a{b}c\u00A0d\n\ne");

	// a line break right before the closing brace does not produce an empty text piece
	auto trailing = readRtfFromString("{\\rtf1 abc\n}");
	assert(trailing.root.pieces.length == 2);
	assert(toPlainText(trailing) == "abc");
}

unittest {
	// \'hh keeps its two hex digits in `number` instead of leaking them into the text
	auto document = readRtfFromString("{\\rtf1 caf\\'e9!}");
	bool sawHex;
	string text;
	document.process((piece, ref state) {
		if(piece.contains == RtfPiece.Contains.controlWord) {
			auto cw = piece.controlWord;
			if(cw.letterSequence == "'") {
				assert(cw.hadNumber);
				assert(cw.number == 0xe9);
				sawHex = true;
			}
		} else if(piece.contains == RtfPiece.Contains.text) {
			text ~= piece.text;
		}
	});
	assert(sawHex);
	assert(text == "caf!");
}

/++
	Extracts just the text of the document, skipping everything inside destination groups.

	`\par` becomes two newlines and the control words understood by
	[RtfControlWord.toDchar] are replaced by their character; all other control words
	are dropped.
+/
string toPlainText(RtfDocument document) {
	string ret;
	document.process((piece, ref state) {
		// anything inside a destination group is not body text
		if(state.currentDestination.length)
			return;

		final switch(piece.contains) {
			case RtfPiece.Contains.controlWord:
				auto cw = piece.controlWord;
				if(cw.letterSequence == "par") {
					ret ~= "\n\n";
				} else {
					auto ch = cw.toDchar;
					if(ch != dchar.init)
						ret ~= ch;
				}
			break;
			case RtfPiece.Contains.text:
				ret ~= piece.text;
			break;
			case RtfPiece.Contains.group:
				// process() never hands groups to the delegate
				assert(0);
		}
	});

	return ret;
}

// Parses one `{ ... }` group, including nested ones. `s` must start at the opening brace
// and is advanced to just past the matching closing brace.
private RtfGroup parseRtfGroup(ref const(ubyte)[] s) {
	RtfGroup group;

	assert(s[0] == '{');
	s = s[1 .. $];
	if(s.length == 0)
		throw new ArsdException!"bad RTF file"("premature end after {");

	while(true) {
		// unescaped line breaks carry no meaning; dropping them here (instead of only in
		// parseRtfPiece) avoids producing an empty text piece right before the closing brace
		while(s.length && (s[0] == '\r' || s[0] == '\n'))
			s = s[1 .. $];

		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end before }");
		if(s[0] == '}')
			break;

		group.pieces ~= parseRtfPiece(s);
	}

	s = s[1 .. $];
	return group;
}

// Parses a single piece - control word, nested group, or run of text - off the front of `s`.
private RtfPiece parseRtfPiece(ref const(ubyte)[] s) {
	while(true) {
		// skipping line breaks below can run off the end of truncated input
		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end of data");

		switch(s[0]) {
			case '\\':
				return RtfPiece(parseRtfControlWord(s));
			case '{':
				return RtfPiece(parseRtfGroup(s));
			case '\t':
				// a literal tab is reported the same way as the \tab control word
				s = s[1 .. $];
				return RtfPiece(RtfControlWord.tab);
			case '\r':
			case '\n':
				// skip irrelevant characters
				s = s[1 .. $];
				continue;
			default:
				return RtfPiece(parseRtfText(s));
		}
	}
}

// Value of a single hexadecimal digit, or -1 if `c` is not one.
private int hexDigitValue(ubyte c) {
	if(c >= '0' && c <= '9')
		return c - '0';
	if(c >= 'a' && c <= 'f')
		return c - 'a' + 10;
	if(c >= 'A' && c <= 'F')
		return c - 'A' + 10;
	return -1;
}

// Parses a control word (`\word`, `\wordN`, `\word-N`) or a control symbol (a backslash
// followed by exactly one non-letter). `s` must start at the backslash and is advanced
// past everything that belongs to the control word, including one delimiting space.
private RtfControlWord parseRtfControlWord(ref const(ubyte)[] s) {
	assert(s[0] == '\\');
	s = s[1 .. $];

	if(s.length == 0)
		throw new ArsdException!"bad RTF file"("premature end after \\");

	RtfControlWord ret;

	// a control word is a run of letters; a control symbol is always a single character,
	// so letters following it (as in `\{abc`) must not be swallowed
	size_t pos = 1;
	if(isAlpha(cast(char) s[0])) {
		while(pos < s.length && isAlpha(cast(char) s[pos]))
			pos++;
	}

	ret.letterSequence = (cast(const char[]) s)[0 .. pos].idup;
	s = s[pos .. $];

	if(isAlpha(ret.letterSequence[0])) {
		if(s.length == 0)
			throw new ArsdException!"bad RTF file"("premature end after control word");

		// reads a run of decimal digits off the front of s
		int readNumber() {
			if(s.length == 0)
				throw new ArsdException!"bad RTF file"("premature end when reading number");
			size_t count;
			while(count < s.length && s[count] >= '0' && s[count] <= '9')
				count++;
			if(count == 0)
				throw new ArsdException!"bad RTF file"("expected negative number, got something else");

			auto buffer = cast(const(char)[]) s[0 .. count];
			s = s[count .. $];

			int accumulator;
			foreach(ch; buffer) {
				accumulator *= 10;
				accumulator += ch - '0';
			}

			return accumulator;
		}

		if(s[0] == '-') {
			// negative number
			ret.hadNumber = true;
			s = s[1 .. $];
			ret.number = - readNumber();
		} else if(s[0] >= '0' && s[0] <= '9') {
			// non-negative number
			ret.hadNumber = true;
			ret.number = readNumber();
		}

		// a single space after the control word is a delimiter, not text.
		// the number may have run right up to the end of the data, hence the length check
		if(s.length && s[0] == ' ') {
			ret.hadSpaceAtEnd = true;
			s = s[1 .. $];
		}

	} else {
		// it was a control symbol
		if(ret.letterSequence == "\r" || ret.letterSequence == "\n") {
			// a backslash at the end of a line is the same as \par
			ret.letterSequence = "par";
		} else if(ret.letterSequence == "'" && s.length >= 2) {
			// \'hh - a character given as two hex digits. The digits belong to the
			// control symbol; left in place they would show up as document text.
			auto high = hexDigitValue(s[0]);
			auto low = hexDigitValue(s[1]);
			if(high >= 0 && low >= 0) {
				ret.hadNumber = true;
				ret.number = high * 16 + low;
				s = s[2 .. $];
			}
		}
	}

	return ret;
}

// Reads plain text up to (not including) the next character that has special meaning.
private string parseRtfText(ref const(ubyte)[] s) {
	size_t end = s.length;
	foreach(idx, ch; s) {
		if(ch == '\\' || ch == '{' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '}') {
			end = idx;
			break;
		}
	}
	auto ret = s[0 .. end];
	s = s[end .. $];

	// FIXME: charset conversion?
	return (cast(const char[]) ret).idup;
}

// \r and \n chars w/o a \\ before them are ignored, but a \ at the end of a line is a \par
// \t is read but you should use \tab generally
// when reading, the ascii tab is translated to the \tab control word
/++
	One item inside a [RtfGroup]: a control word, a nested group, or a run of text.

	Check [contains] before calling one of the accessors; asking for the wrong
	kind throws.
+/
struct RtfPiece {
	/++
		Tells which of [controlWord], [group], or [text] is valid for this piece.
	+/
	Contains contains() {
		return contains_;
	}
	/// ditto
	enum Contains {
		controlWord,
		group,
		text
	}

	this(RtfControlWord cw) {
		this.controlWord_ = cw;
		this.contains_ = Contains.controlWord;
	}
	this(RtfGroup g) {
		this.group_ = g;
		this.contains_ = Contains.group;
	}
	this(string s) {
		this.text_ = s;
		this.contains_ = Contains.text;
	}

	/++
		The control word held by this piece. Throws if [contains] is not `Contains.controlWord`.
	+/
	RtfControlWord controlWord() {
		if(contains != Contains.controlWord)
			throw ArsdException!"RtfPiece type mismatch"(contains);
		return controlWord_;
	}
	/++
		The nested group held by this piece. Throws if [contains] is not `Contains.group`.
	+/
	RtfGroup group() {
		if(contains != Contains.group)
			throw ArsdException!"RtfPiece type mismatch"(contains);
		return group_;
	}
	/++
		The text held by this piece. Throws if [contains] is not `Contains.text`.
	+/
	string text() {
		if(contains != Contains.text)
			throw ArsdException!"RtfPiece type mismatch"(contains);
		return text_;
	}

	private Contains contains_;

	// only the member selected by contains_ is valid
	private union {
		RtfControlWord controlWord_;
		RtfGroup group_;
		string text_;
	}
}

// a \word thing
struct RtfControlWord {
	bool hadSpaceAtEnd; // a delimiting space followed the word in the file
	bool hadNumber; // a numeric parameter was present; see `number`
	string letterSequence; // what the word is
	int number; // the numeric parameter; for the ' control symbol, the value of its two hex digits

	// whether a group starting with this (unstarred) word is treated as a destination
	bool isDestination() {
		switch(letterSequence) {
			case
				"author", "comment", "subject", "title",
				"buptim", "creatim", "printim", "revtim",
				"doccomm",
				"footer", "footerf", "footerl", "footerr",
				"footnote",
				"ftncn", "ftnsep", "ftnsepc",
				"header", "headerf", "headerl", "headerr",
				"info", "keywords", "operator",
				"pict",
				"private",
				"rxe",
				"stylesheet",
				"tc",
				"txe",
				"xe",
				"colortbl",
				"fonttbl":
				return true;

			default: return false;
		}
	}

	// the character this control word stands for, or dchar.init if it is not a character
	dchar toDchar() {
		switch(letterSequence) {
			case "{": return '{';
			case "}": return '}';
			case `\`: return '\\';
			case "~": return '\u00A0'; // non-breaking space
			case "tab": return '\t';
			case "line": return '\n';
			default: return dchar.init;
		}
	}

	// a word without a number, or with a non-zero one, counts as "on"; an explicit 0 is "off"
	bool isTurnOn() {
		return !hadNumber || number != 0;
	}

	// take no delimiters
	bool isControlSymbol() {
		// if true, the letterSequence is the symbol
		return letterSequence.length && !isAlpha(letterSequence[0]);
	}

	// letterSequence == ~ is a non breaking space

	// the control word a literal tab character in the file is reported as
	static RtfControlWord tab() {
		RtfControlWord w;
		w.letterSequence = "tab";
		return w;
	}
}

// ASCII letters only
private bool isAlpha(char c) {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}

// a { ... } thing
struct RtfGroup {
	RtfPiece[] pieces;

	// Name of the destination this group introduces, or null if it is not a destination.
	// A starred group ({\*\word ...}) is named by the control word after the star; any
	// other group must begin with a word that isDestination.
	string destination() {
		if(isStarred()) {
			if(pieces.length > 1 && pieces[1].contains == RtfPiece.Contains.controlWord)
				return pieces[1].controlWord.letterSequence;
			return null;
		}

		if(pieces.length && pieces[0].contains == RtfPiece.Contains.controlWord && pieces[0].controlWord.isDestination)
			return pieces[0].controlWord.letterSequence;
		return null;
	}

	// whether the group begins with the \* control symbol
	bool isStarred() {
		return (pieces.length && pieces[0].contains == RtfPiece.Contains.controlWord && pieces[0].controlWord.letterSequence == "*");
	}
}

/+
	\pard = paragraph defaults
+/