/++
	Converts HTML to plain text. Can also output VT escape sequences for terminal output.

	The exact output of this is subject to change - it is just what appears nice for me. (I actually use this on my personal email setup.)
+/
module arsd.htmltotext;

import arsd.dom;
import arsd.color;
import std.string;

import std.uni : isWhite;

/++
	Walks a DOM tree and accumulates a plain-text rendering of it in [s].

	The converter is stateful: output is appended to [s] and the layout
	bookkeeping (current line length, indentation stack, "just output a
	block/margin/whitespace" flags) lives in the instance. Call [reset]
	before reusing an instance for an unrelated document.
+/
class HtmlConverter {
	/// Wrap column used by [sink]; overwritten with the `width` argument on every [htmlToText] call.
	int width;

	/++
		Will enable color output using VT codes. Determines color through dom.d's css support, which means you need to apply a stylesheet first.

		---
		import arsd.dom;

		auto document = new Document(source_code_for_html);
		auto stylesheet = new StyleSheet(source_code_for_css);
		stylesheet.apply(document);
		---
	+/
	bool enableVtOutput;


	/// Foreground color (CSS value) inherited from the nearest ancestor that specified one.
	string color;
	/// Background color (CSS value) inherited from the nearest ancestor that specified one.
	string backgroundColor;

	/++
		Recursively renders `element` and its descendants into [s].

		Params:
			element = node to render; text nodes are fed to [sink] character by character, other elements are dispatched on their tag name
			preformatted = when true, whitespace is passed through instead of being collapsed
			width = wrap column; stored into [width] for the duration of the walk
	+/
	void htmlToText(Element element, bool preformatted, int width) {
		string color, backgroundColor;
		if(enableVtOutput) {
			color = element.computedStyle.getValue("color");
			backgroundColor = element.computedStyle.getValue("background-color");
		}

		string originalColor = this.color, originalBackgroundColor = this.backgroundColor;

		this.color = color.length ? color : this.color;
		this.backgroundColor = backgroundColor.length ? backgroundColor : this.backgroundColor;

		scope(exit) {
			// the idea is as we pop working back up the tree, it restores what it was here
			this.color = originalColor;
			this.backgroundColor = originalBackgroundColor;
		}


		this.width = width;
		if(auto tn = cast(TextNode) element) {
			foreach(dchar ch; tn.nodeValue) {
				sink(ch, preformatted);
			}
		} else {
			void sinkChildren() {
				foreach(child; element.childNodes)
					htmlToText(child, preformatted, width);
			}
			switch(element.tagName) {
				case "head", "script", "style":
					// intentionally blank
				break;
				// The table stuff is removed right now because while it looks
				// ok for test tables, it isn't working well for the emails I have
				// - it handles data ok but not really nested layouts.
				case "trlol":
					auto children = element.childElements;

					// A row without cells would otherwise divide by zero below;
					// treat it as "too narrow" and fall back to plain block output.
					int columnCount = cast(int)(children.length);
					int tdWidth = columnCount ? (width - columnCount*3) / columnCount : 0;
					if(tdWidth < 12) {
						// too narrow to be reasonable
						startBlock();
						sinkChildren();
						endBlock();
					} else {
						string[] tdBlocks;
						int longestBlock;
						// render every cell independently at the column width
						foreach(child; children) {
							auto fmt = new HtmlConverter();

							fmt.htmlToText(child, false, tdWidth);
							tdBlocks ~= fmt.s;
							int lineCount = 1;
							foreach(ch; fmt.s)
								if(ch == '\n')
									lineCount++;
							if(lineCount > longestBlock)
								longestBlock = lineCount;
						}

						if(s.length && s[$-1] != '\n')
							s ~= '\n';
						// stitch the cells together one output line at a time
						foreach(lineNumber; 0 .. longestBlock) {
							foreach(bidx, ref block; tdBlocks) {
								if(bidx)
									s ~= " | ";

								// number of code points emitted for this cell on this line
								size_t emitted = 0;
								if(block.length) {
									auto idx = block.indexOf("\n");
									if(idx == -1)
										idx = block.length;

									auto line = block[0 .. idx];
									s ~= line;
									foreach(dchar cp; line)
										emitted++;

									if(idx == block.length)
										block = block[$..$];
									else
										block = block[idx + 1 .. $];
								}

								// pad the cell out to the column width based on what
								// was just written, not on what remains in the block
								if(emitted < tdWidth)
									foreach(a; 0 .. tdWidth - emitted)
										s ~= " ";

							}
							s ~= "\n";
						}

						// row separator
						foreach(a; 0 .. children.length) {
							foreach(w; 0 .. tdWidth) {
								s ~= "-";
							}
							if(a +1 != children.length)
								s ~= "-+-";
						}
						s ~= "\n";
					}
				break;
				case "tr":
					startBlock(2);
					sinkChildren();
					endBlock();
				break;
				case "td":
					startBlock(0);
					sinkChildren();
					endBlock();
				break;
				case "a":
					sinkChildren();
					if(element.href != element.innerText) {
						sink(' ', false);
						sink('<', false);
						// I want the link itself to NOT word wrap
						// to make for easier double-clicking of it in
						// the terminal
						foreach(dchar ch; element.href)
							sink(ch, false, int.max);
						sink('>', false);
					}
				break;
				case "span":
					if(enableVtOutput) {
						auto csc = color; // element.computedStyle.getValue("color");
						if(csc.length) {
							auto c = Color.fromString(csc);
							s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b);
						}

						bool bold = element.computedStyle.getValue("font-weight") == "bold";

						if(bold)
							s ~= "\033[1m";

						sinkChildren();

						// SGR 22 turns off bold only; SGR 0 would also wipe
						// any color set by an enclosing element
						if(bold)
							s ~= "\033[22m";
						if(csc.length)
							s ~= "\033[39m";
					} else {
						sinkChildren();
					}
				break;
				case "p":
					startBlock();
					sinkChildren();
					endBlock();
				break;
				case "b", "strong":
				case "em", "i":
					if(element.innerText.length == 0)
						break;
					if(enableVtOutput) {
						s ~= "\033[1m";
						sinkChildren();
						// SGR 22 (normal intensity) rather than SGR 0 so the
						// color of a surrounding span survives the emphasis
						s ~= "\033[22m";
					} else {
						sink('*', false);
						sinkChildren();
						sink('*', false);
					}
				break;
				case "u":
					if(element.innerText.length == 0)
						break;
					sink('_', false);
					sinkChildren();
					sink('_', false);
				break;
				case "ul":
					ulDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					ulDepth--;
				break;
				case "ol":
					olDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					olDepth--;
				break;
				case "li":
					startBlock();

					//sink('\t', true);
					/*
					foreach(cnt; 0 .. olDepth + ulDepth) {
						sink(' ', true);
						sink(' ', true);
					}
					*/
					if(olDepth)
						sink('*', false);
					if(ulDepth)
						sink('*', false);
					sink(' ', true);

					sinkChildren();

					endBlock();
				break;

				case "dl":
				case "dt":
				case "dd":
					startBlock(element.tagName == "dd" ? 2 : 0);
					sinkChildren();
					endBlock();
				break;

				case "h1":
					startBlock();
					sink('#', true);
					sink('#', true);
					sink(' ', true);
					sinkChildren();
					sink(' ', true);
					sink('#', true);
					sink('#', true);
					endBlock();
				break;
				case "h2", "h3":
					startBlock();
					sinkChildren();
					sink('\n', true);
					// underline with one marker per character of the heading text
					foreach(dchar ch; element.innerText)
						sink(element.tagName == "h2" ? '=' : '-', false);
					endBlock();
				break;
				case "hr":
					startBlock();
					foreach(i; 0 .. width / 4)
						sink(' ', true);
					foreach(i; 0 .. width / 2)
						sink('-', false);
					endBlock();
				break;

				case "br":
					sink('\n', true);
				break;
				case "div":
					startBlock();

					/*
					auto csc = element.computedStyle.getValue("background-color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[48;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					*/

					sinkChildren();

					/*
					if(csc.length)
						s ~= "\033[49m";
					*/

					endBlock();
				break;
				case "pre":
					startBlock(4);
					foreach(child; element.childNodes)
						htmlToText(child, true, width);
					endBlock();
				break;
				default:
					sinkChildren();
			}
		}
	}

	/// Current nesting depth of `ol` elements being rendered.
	int olDepth;
	/// Current nesting depth of `ul` elements being rendered.
	int ulDepth;

	/++
		Parses `html` and renders it as text.

		The markup is wrapped in a synthetic root tag before parsing; rendering
		starts at the first `body` element if there is one, otherwise at the
		document root.

		Params:
			html = markup to convert
			wantWordWrap = accepted for API compatibility; it is not consulted, output is always wrapped at `wrapAmount`
			wrapAmount = wrap column

		Returns: the accumulated text in [s]. Output is appended, so call [reset] first when reusing the converter.
	+/
	string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) {
		Document document = new Document;

		document.parse("<roottag>" ~ html ~ "</roottag>");

		Element start;
		auto bod = document.getElementsByTagName("body");
		if(bod.length)
			start = bod[0];
		else
			start = document.root;

		//import std.file;
		//auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css"));
		//stylesheet.apply(document);

		return convert(start, wantWordWrap, wrapAmount);
	}

	/++
		Renders an already-parsed subtree as text.

		Params:
			start = element to begin rendering at
			wantWordWrap = accepted for API compatibility; it is not consulted
			wrapAmount = wrap column

		Returns: the accumulated text in [s].
	+/
	string convert(Element start, bool wantWordWrap = true, int wrapAmount = 74) {
		htmlToText(start, false, wrapAmount);
		return s;
	}

	/++
		Discards the accumulated output and returns the layout bookkeeping to
		its initial values so the converter can be reused.

		Configuration ([enableVtOutput], [color], [backgroundColor]) is left alone.
	+/
	void reset() {
		s = null;
		justOutputWhitespace = true;
		justOutputBlock = true;
		justOutputMargin = true;
		// stale position/indent state would make the next document wrap
		// and indent as if it continued the previous one
		lineLength = 0;
		needsIndent = false;
		indentStack = null;
		olDepth = 0;
		ulDepth = 0;
	}

	/// The text produced so far.
	string s;
	/// True when the last thing written counts as whitespace, so further collapsible whitespace is dropped.
	bool justOutputWhitespace = true;
	/// True until visible content follows a block start; suppresses duplicate block line breaks.
	bool justOutputBlock = true;
	/// True until visible content follows a block margin; suppresses duplicate blank lines.
	bool justOutputMargin = true;
	/// Length counter for the current output line, as tracked by [sink].
	int lineLength;

	/++
		Appends one character to [s], collapsing whitespace and wrapping lines.

		Params:
			item = character to write
			preformatted = when true, whitespace is written as-is instead of being collapsed to a single space
			lineWidthOverride = wrap column for this character only; `int.min` means use [width]
	+/
	void sink(dchar item, bool preformatted, int lineWidthOverride = int.min) {

		// a pending indent is emitted lazily, right before the first
		// non-newline character of the line
		if(needsIndent && item != '\n') {
			lineLength += doIndent();
			needsIndent = false;
		}

		int width = lineWidthOverride == int.min ? this.width : lineWidthOverride;
		if(!preformatted && isWhite(item)) {
			if(!justOutputWhitespace) {
				item = ' ';
				justOutputWhitespace = true;
			} else {
				return;
			}
		} else {
			// if it is preformatted, we still need to keep track of if it is whitespace
			// so stuff like <br> is somewhat sane
			justOutputWhitespace = preformatted && isWhite(item);
		}

		s ~= item;

		if(lineLength >= width) {
			// rewind to the nearest space, if there is one, to break on a word boundary
			int c = lineLength;
			bool broken;
			foreach_reverse(idx, char ch; s) {
				if(ch == '\n')
					break;
				if(ch == ' ') {
					// replace that space with a line break and re-indent the tail
					auto os = s;
					s = os[0 .. idx];
					s ~= '\n';
					lineLength = cast(int)(os[idx+1..$].length);
					lineLength += doIndent();
					s ~= os[idx + 1 .. $];
					broken = true;
					break;
				}
				c--;
				if(c < 5)
					break;
			}

			// no usable space found: hard break after the character just written
			if(!broken) {
				s ~= '\n';
				lineLength = 0;
				needsIndent = true;
				justOutputWhitespace = true;
			}

		}


		if(item == '\n') {
			lineLength = 0;
			needsIndent = true;
		} else
			lineLength ++;


		if(!justOutputWhitespace) {
			justOutputBlock = false;
			justOutputMargin = false;
		}
	}

	/++
		Writes the indentation for the current [indentStack] to [s].

		Returns: the number of spaces written.
	+/
	int doIndent() {
		int cnt = 0;
		foreach(i; indentStack)
			foreach(lol; 0 .. i) {
				s ~= ' ';
				cnt++;
			}
		return cnt;
	}

	/// Indent widths of the currently open blocks, outermost first.
	int[] indentStack;
	/// Set after a line break; tells [sink] to emit the indent before the next character.
	bool needsIndent = false;

	/++
		Opens a block: pushes `indent` onto [indentStack] and emits the line
		breaks that separate it from preceding content, unless they were
		already emitted.

		Params:
			indent = additional spaces of indentation for lines inside the block
	+/
	void startBlock(int indent = 0) {

		indentStack ~= indent;

		if(!justOutputBlock) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputBlock = true;
		}
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputMargin = true;
		}
	}
	/++
		Closes the innermost block: pops [indentStack] (if non-empty) and
		emits a trailing line break unless one was just written.
	+/
	void endBlock() {
		if(indentStack.length)
			indentStack = indentStack[0 .. $ - 1];

		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputMargin = true;
		}
	}
}

/++
	Convenience wrapper: converts `html` to text with a fresh [HtmlConverter].

	Params:
		html = markup to convert
		wantWordWrap = forwarded to [HtmlConverter.convert], which does not consult it
		wrapAmount = wrap column

	Returns: the plain-text rendering.
+/
string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	auto converter = new HtmlConverter();
	return converter.convert(html, wantWordWrap, wrapAmount);
}