/++
	Converts HTML to plain text. Can also output VT escape sequences for terminal output.

	The exact output of this is subject to change - it is just what appears nice for me. (I actually use this on my personal email setup.)
+/
module arsd.htmltotext;

import arsd.dom;
import arsd.color;
import std.string;

import std.uni : isWhite;
import std.string : indexOf, startsWith, endsWith, strip;

/++
	Walks a DOM tree and appends a plain-text rendering of it to [s].

	Block-level tags are separated by blank lines and may be indented,
	inline emphasis is marked with `*` / `_` (or VT codes when
	[enableVtOutput] is set), and text is wrapped at [width] columns.

	An instance accumulates state across calls; call [reset] before
	reusing one for an unrelated document.
+/
class HtmlConverter {
	/// Column at which [sink] wraps lines. Overwritten by every call to the `htmlToText` method.
	int width;

	/++
		When false, [sink] never inserts line breaks of its own. Set from the
		`wantWordWrap` argument of [convert]; `width` is still used for layout
		decisions such as the length of an `<hr>` rule.
	+/
	bool wordWrapEnabled = true;

	/++
		Will enable color output using VT codes. Determines color through dom.d's css support, which means you need to apply a stylesheet first.

		---
		import arsd.dom;

		auto document = new Document(source_code_for_html);
		auto stylesheet = new StyleSheet(source_code_for_css);
		stylesheet.apply(document);
		---
	+/
	bool enableVtOutput;


	/// Foreground color in effect for the element currently being processed (only updated when [enableVtOutput] is set).
	string color;
	/// Background color in effect for the element currently being processed (only updated when [enableVtOutput] is set).
	string backgroundColor;

	/++
		Appends the text rendering of `element` and all of its descendants to [s].

		Params:
			element = node to render; text nodes are emitted character by character, other nodes are dispatched on their tag name
			preformatted = when true, whitespace is passed through instead of being collapsed
			width = wrap column; stored into [HtmlConverter.width] for the duration of the walk
	+/
	void htmlToText(Element element, bool preformatted, int width) {
		// these locals intentionally shadow the members: they hold what
		// THIS element's computed style specifies, if anything
		string color, backgroundColor;
		if(enableVtOutput) {
			color = element.computedStyle.getValue("color");
			backgroundColor = element.computedStyle.getValue("background-color");
		}

		string originalColor = this.color, originalBackgroundColor = this.backgroundColor;

		this.color = color.length ? color : this.color;
		this.backgroundColor = backgroundColor.length ? backgroundColor : this.backgroundColor;

		scope(exit) {
			// the idea is as we pop working back up the tree, it restores what it was here
			this.color = originalColor;
			this.backgroundColor = originalBackgroundColor;
		}


		this.width = width;
		if(auto tn = cast(TextNode) element) {
			foreach(dchar ch; tn.nodeValue) {
				sink(ch, preformatted);
			}
		} else {
			void sinkChildren() {
				foreach(child; element.childNodes)
					htmlToText(child, preformatted, width);
			}
			switch(element.tagName) {
				case "head", "script", "style":
					// intentionally blank
				break;
				// The table stuff is removed right now because while it looks
				// ok for test tables, it isn't working well for the emails I have
				// - it handles data ok but not really nested layouts.
				case "trlol":
					auto children = element.childElements;

					// each column separator (" | ") costs three characters;
					// a row with no cells gets width 0 and so takes the
					// "too narrow" path instead of dividing by zero
					int columnCount = cast(int) children.length;
					int tdWidth = columnCount ? (width - columnCount * 3) / columnCount : 0;
					if(tdWidth < 12) {
						// too narrow to be reasonable
						startBlock();
						sinkChildren();
						endBlock();
					} else {
						// render every cell on its own, then stitch the
						// results together line by line
						string[] tdBlocks;
						int longestBlock;
						foreach(child; children) {
							auto fmt = new HtmlConverter();

							fmt.htmlToText(child, false, tdWidth);
							tdBlocks ~= fmt.s;
							int lineCount = 1;
							foreach(ch; fmt.s)
								if(ch == '\n')
									lineCount++;
							if(lineCount > longestBlock)
								longestBlock = lineCount;
						}

						if(s.length && s[$-1] != '\n')
							s ~= '\n';
						foreach(lineNumber; 0 .. longestBlock) {
							foreach(bidx, ref block; tdBlocks) {
								if(bidx)
									s ~= " | ";

								// number of bytes of this cell written on this output line
								int emitted = 0;
								if(block.length) {
									auto idx = block.indexOf("\n");
									if(idx == -1)
										idx = block.length;

									s ~= block[0 .. idx];
									emitted = cast(int) idx;

									if(idx == block.length)
										block = block[$..$];
									else
										block = block[idx + 1 .. $];
								}

								// pad the cell out to the column width so the
								// separators line up; based on what was just
								// written, not on what is left in the block
								foreach(pad; emitted .. tdWidth)
									s ~= " ";

							}
							s ~= "\n";
						}

						foreach(a; 0 .. children.length) {
							foreach(w; 0 .. tdWidth) {
								s ~= "-";
							}
							if(a + 1 != children.length)
								s ~= "-+-";
						}
						s ~= "\n";
					}
				break;
				case "tr":
					startBlock(2);
					sinkChildren();
					endBlock();
				break;
				case "td":
					startBlock(0);
					sinkChildren();
					endBlock();
				break;
				case "a":
					sinkChildren();
					// only spell out the target when the link text is not already the url
					if(element.href != element.innerText) {
						sink(' ', false);
						sink('<', false);
						// I want the link itself to NOT word wrap
						// to make for easier double-clicking of it in
						// the terminal
						foreach(dchar ch; element.href)
							sink(ch, false, int.max);
						sink('>', false);
					}
				break;
				case "span":
					if(enableVtOutput) {
						auto csc = color; // element.computedStyle.getValue("color");
						if(csc.length) {
							auto c = Color.fromString(csc);
							s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b);
						}

						bool bold = element.computedStyle.getValue("font-weight") == "bold";

						if(bold)
							s ~= "\033[1m";

						sinkChildren();

						// SGR 22 turns off bold only; SGR 0 would also wipe
						// the color of any enclosing span
						if(bold)
							s ~= "\033[22m";
						if(csc.length)
							s ~= "\033[39m";
					} else {
						sinkChildren();
					}
				break;
				case "p":
					startBlock();
					sinkChildren();
					endBlock();
				break;
				case "b", "strong":
				case "em", "i":
					if(element.innerText.length == 0)
						break;
					if(enableVtOutput) {
						s ~= "\033[1m";
						sinkChildren();
						// SGR 22 (normal intensity) rather than SGR 0 so a
						// surrounding colored span keeps its color
						s ~= "\033[22m";
					} else {
						sink('*', false);
						sinkChildren();
						sink('*', false);
					}
				break;
				case "u":
					if(element.innerText.length == 0)
						break;
					sink('_', false);
					sinkChildren();
					sink('_', false);
				break;
				case "ul":
					ulDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					ulDepth--;
				break;
				case "ol":
					olDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					olDepth--;
				break;
				case "li":
					startBlock();

					//sink('\t', true);
					/*
					foreach(cnt; 0 .. olDepth + ulDepth) {
						sink(' ', true);
						sink(' ', true);
					}
					*/
					// one bullet for being inside any <ol>, one for being inside any <ul>
					if(olDepth)
						sink('*', false);
					if(ulDepth)
						sink('*', false);
					sink(' ', true);

					sinkChildren();

					endBlock();
				break;

				case "dl":
				case "dt":
				case "dd":
					startBlock(element.tagName == "dd" ? 2 : 0);
					sinkChildren();
					endBlock();
				break;

				case "h1":
					startBlock();
					sink('#', true);
					sink('#', true);
					sink(' ', true);
					sinkChildren();
					sink(' ', true);
					sink('#', true);
					sink('#', true);
					endBlock();
				break;
				case "h2", "h3":
					startBlock();
					sinkChildren();
					sink('\n', true);
					// underline with one rule character per character of heading text
					foreach(dchar ch; element.innerText)
						sink(element.tagName == "h2" ? '=' : '-', false);
					endBlock();
				break;
				case "hr":
					startBlock();
					// a rule half the width, offset by a quarter of the width
					foreach(i; 0 .. width / 4)
						sink(' ', true);
					foreach(i; 0 .. width / 2)
						sink('-', false);
					endBlock();
				break;

				case "br":
					sink('\n', true);
				break;
				case "div":
					startBlock();

					/*
					auto csc = element.computedStyle.getValue("background-color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[48;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					*/

					sinkChildren();

					/*
					if(csc.length)
						s ~= "\033[49m";
					*/

					endBlock();
				break;
				case "pre":
					startBlock(4);
					// children are walked with preformatted forced on
					foreach(child; element.childNodes)
						htmlToText(child, true, width);
					endBlock();
				break;
				default:
					sinkChildren();
			}
		}
	}

	/// Number of `<ol>` elements currently open around the node being processed.
	int olDepth;
	/// Number of `<ul>` elements currently open around the node being processed.
	int ulDepth;

	/++
		Parses `html` and converts it to text.

		The markup is wrapped in a synthetic `<roottag>` before parsing. If the
		parsed document contains a `<body>`, conversion starts there; otherwise
		it starts at the document root.

		Params:
			html = markup to convert
			wantWordWrap = when false, no automatic line breaks are inserted
			wrapAmount = wrap column

		Returns: the accumulated output, [s]
	+/
	string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) {
		Document document = new Document;

		document.parse("<roottag>" ~ html ~ "</roottag>");

		Element start;
		auto bod = document.getElementsByTagName("body");
		if(bod.length)
			start = bod[0];
		else
			start = document.root;

		//import std.file;
		//auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css"));
		//stylesheet.apply(document);

		return convert(start, wantWordWrap, wrapAmount);
	}

	/++
		Converts an already-parsed subtree to text.

		Output is appended to whatever [s] already holds; call [reset] first
		to start from a clean buffer.

		Params:
			start = root of the subtree to render
			wantWordWrap = when false, no automatic line breaks are inserted
			wrapAmount = wrap column

		Returns: the accumulated output, [s]
	+/
	string convert(Element start, bool wantWordWrap = true, int wrapAmount = 74) {
		wordWrapEnabled = wantWordWrap;
		htmlToText(start, false, wrapAmount);
		return s;
	}

	/++
		Discards the accumulated output and returns the layout bookkeeping to
		its initial state so the converter can be reused.

		Configuration ([enableVtOutput], [wordWrapEnabled], [width]) and the
		[color] / [backgroundColor] members are left as they are.
	+/
	void reset() {
		s = null;
		justOutputWhitespace = true;
		justOutputBlock = true;
		justOutputMargin = true;

		// without these a reused converter would begin its first line
		// believing it was already partway across (or inside a block)
		lineLength = 0;
		needsIndent = false;
		indentStack = null;
		olDepth = 0;
		ulDepth = 0;
	}

	/// The output accumulated so far.
	string s;
	/// True when the last thing [sink] handled was whitespace; used to collapse runs of it.
	bool justOutputWhitespace = true;
	/// True until non-whitespace follows a block start; suppresses duplicate block breaks.
	bool justOutputBlock = true;
	/// True until non-whitespace follows a block boundary; suppresses duplicate margins.
	bool justOutputMargin = true;
	/// Number of characters emitted on the current output line, including indentation.
	int lineLength;

	/++
		Appends one character to [s], collapsing whitespace, applying pending
		indentation and wrapping the line when it reaches the wrap column.

		Params:
			item = character to emit
			preformatted = when true, whitespace is emitted as-is instead of being collapsed to a single space
			lineWidthOverride = wrap column to use for this character instead of [width]; `int.min` means "use [width]"
	+/
	void sink(dchar item, bool preformatted, int lineWidthOverride = int.min) {

		if(needsIndent && item != '\n') {
			lineLength += doIndent();
			needsIndent = false;
		}

		int width = lineWidthOverride == int.min ? this.width : lineWidthOverride;
		if(!preformatted && isWhite(item)) {
			if(!justOutputWhitespace) {
				item = ' ';
				justOutputWhitespace = true;
			} else {
				return;
			}
		} else {
			// if it is preformatted, we still need to keep track of if it is whitespace
			// so stuff like <br> is somewhat sane
			justOutputWhitespace = preformatted && isWhite(item);
		}

		s ~= item;

		// a newline already ends the line by itself, so wrapping on it
		// would only add a second, spurious line break
		if(wordWrapEnabled && item != '\n' && lineLength >= width) {
			// rewind to the nearest space, if there is one, to break on a word boundary
			int c = lineLength;
			bool broken;
			foreach_reverse(idx, char ch; s) {
				if(ch == '\n')
					break;
				if(ch == ' ') {
					// replace that space with a line break and re-indent
					// the partial word that followed it
					auto os = s;
					s = os[0 .. idx];
					s ~= '\n';
					lineLength = cast(int)(os[idx+1..$].length);
					lineLength += doIndent();
					s ~= os[idx + 1 .. $];
					broken = true;
					break;
				}
				c--;
				// do not search back into the first few columns of the line
				if(c < 5)
					break;
			}

			if(!broken) {
				// no usable space found: hard break right here
				s ~= '\n';
				lineLength = 0;
				needsIndent = true;
				justOutputWhitespace = true;
			}

		}


		if(item == '\n') {
			lineLength = 0;
			needsIndent = true;
		} else
			lineLength ++;


		if(!justOutputWhitespace) {
			justOutputBlock = false;
			justOutputMargin = false;
		}
	}

	/++
		Appends the indentation for every open block to [s].

		Returns: the number of spaces written
	+/
	int doIndent() {
		int cnt = 0;
		foreach(i; indentStack)
			foreach(lol; 0 .. i) {
				s ~= ' ';
				cnt++;
			}
		return cnt;
	}

	/// Indent widths of the currently open blocks, outermost first.
	int[] indentStack;
	/// Set after a line break; the next non-newline character passed to [sink] is preceded by indentation.
	bool needsIndent = false;

	/++
		Opens a block: pushes its indent and, unless one was just emitted,
		ends the current line and adds a blank margin line.

		Params:
			indent = additional spaces of indentation for lines inside this block
	+/
	void startBlock(int indent = 0) {

		indentStack ~= indent;

		if(!justOutputBlock) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputBlock = true;
		}
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputMargin = true;
		}
	}

	/++
		Closes the innermost block: pops its indent and ends the current
		line unless a margin was just emitted.
	+/
	void endBlock() {
		if(indentStack.length)
			indentStack = indentStack[0 .. $ - 1];

		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputMargin = true;
		}
	}
}

/++
	Convenience wrapper: converts `html` to plain text with a fresh [HtmlConverter].

	Params:
		html = markup to convert
		wantWordWrap = when false, no automatic line breaks are inserted
		wrapAmount = wrap column

	Returns: the plain-text rendering
+/
string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	auto converter = new HtmlConverter();
	return converter.convert(html, wantWordWrap, wrapAmount);
}