/++
	Renders HTML as plain text.

	[HtmlConverter] walks an arsd.dom element tree and builds a text rendering
	of it; the free function [htmlToText] is a one-call convenience wrapper.
+/
module arsd.htmltotext;

import arsd.dom;
import arsd.color;
import std.string;

import std.uni : isWhite;

/++
	Walks an element tree and accumulates a plain-text rendering of it in [s].

	Block-level tags are separated by newlines, lists and `pre` are indented,
	inline emphasis is marked with `*` / `_` (or VT escape codes when
	[enableVtOutput] is set), and lines are broken once they reach the
	requested width.
+/
class HtmlConverter {
	/// Column at which [sink] breaks lines. Overwritten on every [htmlToText] call.
	int width;

	/++
		Will enable color output using VT codes. Determines color through dom.d's css support, which means you need to apply a stylesheet first.

		---
		import arsd.dom;

		auto document = new Document(source_code_for_html);
		auto stylesheet = new Stylesheet(source_code_for_css);
		stylesheet.apply(document);
		---
	+/
	bool enableVtOutput;


	/// Foreground color in effect for the element currently being visited (only tracked when [enableVtOutput] is set).
	string color;
	/// Background color in effect for the element currently being visited (only tracked when [enableVtOutput] is set).
	string backgroundColor;

	/++
		Renders `element` and, recursively, its children into [s].

		Params:
			element = node to render; text nodes are written out character by character, other elements are dispatched on their tag name
			preformatted = when true, whitespace is kept as-is instead of being collapsed
			width = wrap column; stored into [HtmlConverter.width] for [sink] to use
	+/
	void htmlToText(Element element, bool preformatted, int width) {
		string color, backgroundColor;
		if(enableVtOutput) {
			color = element.computedStyle.getValue("color");
			backgroundColor = element.computedStyle.getValue("background-color");
		}

		string originalColor = this.color, originalBackgroundColor = this.backgroundColor;

		// an element without its own value keeps whatever its ancestors set
		this.color = color.length ? color : this.color;
		this.backgroundColor = backgroundColor.length ? backgroundColor : this.backgroundColor;

		scope(exit) {
			// the idea is as we pop working back up the tree, it restores what it was here
			this.color = originalColor;
			this.backgroundColor = originalBackgroundColor;
		}


		this.width = width;
		if(auto tn = cast(TextNode) element) {
			foreach(dchar ch; tn.nodeValue) {
				sink(ch, preformatted);
			}
		} else {
			// renders every child node with the current formatting settings
			void sinkChildren() {
				foreach(child; element.childNodes)
					htmlToText(child, preformatted, width);
			}
			switch(element.tagName) {
				case "head", "script", "style":
					// intentionally blank
				break;
				// The table stuff is removed right now because while it looks
				// ok for test tables, it isn't working well for the emails I have
				// - it handles data ok but not really nested layouts.
				// (the case label is "trlol" so it never matches a real "tr")
				case "trlol":
					auto children = element.childElements;
					immutable int columns = cast(int) children.length;

					// A row without any child elements gets a width of 0 so it takes
					// the plain-block path below instead of dividing by zero.
					auto tdWidth = columns ? (width - columns * 3) / columns : 0;
					if(tdWidth < 12) {
						// too narrow to be reasonable
						startBlock();
						sinkChildren();
						endBlock();
					} else {
						// render each cell on its own, then stitch the results together line by line
						string[] tdBlocks;
						int longestBlock;
						foreach(child; children) {
							auto fmt = new HtmlConverter();

							fmt.htmlToText(child, false, tdWidth);
							tdBlocks ~= fmt.s;
							int lineCount = 1;
							foreach(ch; fmt.s)
								if(ch == '\n')
									lineCount++;
							if(lineCount > longestBlock)
								longestBlock = lineCount;
						}

						if(s.length && s[$-1] != '\n')
							s ~= '\n';
						foreach(lineNumber; 0 .. longestBlock) {
							foreach(bidx, ref block; tdBlocks) {
								if(bidx)
									s ~= " | ";

								// number of bytes of this cell written on the current output line
								size_t emitted = 0;
								if(block.length) {
									auto idx = block.indexOf("\n");
									if(idx == -1)
										idx = block.length;

									s ~= block[0 .. idx];
									emitted = idx;

									if(idx == block.length)
										block = block[$..$];
									else
										block = block[idx + 1 .. $];
								}

								// pad the cell out to the column width so the " | " separators
								// line up with the "-+-" rule drawn below
								foreach(a; emitted .. cast(size_t) tdWidth)
									s ~= " ";

							}
							s ~= "\n";
						}

						foreach(a; 0 .. children.length) {
							foreach(w; 0 .. tdWidth) {
								s ~= "-";
							}
							if(a +1 != children.length)
								s ~= "-+-";
						}
						s ~= "\n";
					}
				break;
				case "tr":
					startBlock(2);
					sinkChildren();
					endBlock();
				break;
				case "td":
					startBlock(0);
					sinkChildren();
					endBlock();
				break;
				case "a":
					sinkChildren();
					auto href = element.href;
					// anchors without a target (e.g. <a name="...">) have nothing worth
					// printing, and a link whose text already is the url needs no repeat
					if(href.length && href != element.innerText) {
						sink(' ', false);
						sink('<', false);
						// I want the link itself to NOT word wrap
						// to make for easier double-clicking of it in
						// the terminal
						foreach(dchar ch; href)
							sink(ch, false, int.max);
						sink('>', false);
					}
				break;
				case "span":
					if(enableVtOutput) {
						// the local `color`, i.e. this element's own computed value
						auto csc = color; // element.computedStyle.getValue("color");
						if(csc.length) {
							auto c = Color.fromString(csc);
							s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b);
						}

						bool bold = element.computedStyle.getValue("font-weight") == "bold";

						if(bold)
							s ~= "\033[1m";

						sinkChildren();

						if(bold)
							s ~= "\033[0m";
						if(csc.length)
							s ~= "\033[39m";
					} else {
						sinkChildren();
					}
				break;
				case "p":
					startBlock();
					sinkChildren();
					endBlock();
				break;
				case "b", "strong":
				case "em", "i":
					if(element.innerText.length == 0)
						break;
					if(enableVtOutput) {
						s ~= "\033[1m";
						sinkChildren();
						s ~= "\033[0m";
					} else {
						sink('*', false);
						sinkChildren();
						sink('*', false);
					}
				break;
				case "u":
					if(element.innerText.length == 0)
						break;
					sink('_', false);
					sinkChildren();
					sink('_', false);
				break;
				case "ul":
					ulDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					ulDepth--;
				break;
				case "ol":
					olDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					olDepth--;
				break;
				case "li":
					startBlock();

					//sink('\t', true);
					/*
					foreach(cnt; 0 .. olDepth + ulDepth) {
						sink(' ', true);
						sink(' ', true);
					}
					*/
					// one bullet per kind of list we are currently inside of
					if(olDepth)
						sink('*', false);
					if(ulDepth)
						sink('*', false);
					sink(' ', true);

					sinkChildren();

					endBlock();
				break;

				case "dl":
				case "dt":
				case "dd":
					startBlock(element.tagName == "dd" ? 2 : 0);
					sinkChildren();
					endBlock();
				break;

				case "h1":
					startBlock();
					sink('#', true);
					sink('#', true);
					sink(' ', true);
					sinkChildren();
					sink(' ', true);
					sink('#', true);
					sink('#', true);
					endBlock();
				break;
				case "h2", "h3":
					startBlock();
					sinkChildren();
					sink('\n', true);
					// underline: one rule character per character of heading text
					foreach(dchar ch; element.innerText)
						sink(element.tagName == "h2" ? '=' : '-', false);
					endBlock();
				break;
				case "hr":
					startBlock();
					// a rule half the width of the text area, offset by a quarter of it
					foreach(i; 0 .. width / 4)
						sink(' ', true);
					foreach(i; 0 .. width / 2)
						sink('-', false);
					endBlock();
				break;

				case "br":
					sink('\n', true);
				break;
				case "div":
					startBlock();

					/*
					auto csc = element.computedStyle.getValue("background-color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[48;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					*/

					sinkChildren();

					/*
					if(csc.length)
						s ~= "\033[49m";
					*/

					endBlock();
				break;
				case "pre":
					startBlock(4);
					// children are rendered with preformatted = true so whitespace survives
					foreach(child; element.childNodes)
						htmlToText(child, true, width);
					endBlock();
				break;
				default:
					sinkChildren();
			}
		}
	}

	/// Number of `ol` elements currently being rendered (nesting depth).
	int olDepth;
	/// Number of `ul` elements currently being rendered (nesting depth).
	int ulDepth;

	/++
		Parses `html` and renders it as text.

		The input is wrapped in a `<roottag>` element before parsing. Rendering
		starts at the first `body` element if there is one, otherwise at the
		document root.

		Params:
			html = HTML source to convert
			wantWordWrap = passed along to the [Element] overload, which currently ignores it
			wrapAmount = column at which lines are wrapped

		Returns: the accumulated text, [s]. Output is appended to whatever [s]
		already holds; call [reset] first to start fresh.
	+/
	string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) {
		Document document = new Document;

		document.parse("<roottag>" ~ html ~ "</roottag>");

		Element start;
		auto bod = document.getElementsByTagName("body");
		if(bod.length)
			start = bod[0];
		else
			start = document.root;

		//import std.file;
		//auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css"));
		//stylesheet.apply(document);

		return convert(start, wantWordWrap, wrapAmount);
	}

	/++
		Renders the tree rooted at `start` as text.

		Params:
			start = element to start rendering from
			wantWordWrap = currently ignored; output is always wrapped at `wrapAmount`
			wrapAmount = column at which lines are wrapped

		Returns: the accumulated text, [s].
	+/
	string convert(Element start, bool wantWordWrap = true, int wrapAmount = 74) {
		htmlToText(start, false, wrapAmount);
		return s;
	}

	/++
		Discards the accumulated output and returns the layout state to what a
		freshly constructed converter has, so the object can be used for another
		conversion. Settings ([enableVtOutput], [width], [color],
		[backgroundColor]) are left alone.
	+/
	void reset() {
		s = null;
		justOutputWhitespace = true;
		justOutputBlock = true;
		justOutputMargin = true;

		// Without clearing these, the next conversion would start with the
		// previous run's column position and pending indentation.
		lineLength = 0;
		needsIndent = false;
		indentStack = null;
		olDepth = 0;
		ulDepth = 0;
	}

	/// The text produced so far.
	string s;
	/// True while the last character handled by [sink] was whitespace (or nothing has been output yet); used to collapse whitespace runs.
	bool justOutputWhitespace = true;
	/// True until non-whitespace is output after a block boundary; lets [startBlock] skip redundant newlines.
	bool justOutputBlock = true;
	/// True until non-whitespace is output after a block margin; lets [startBlock] and [endBlock] skip redundant newlines.
	bool justOutputMargin = true;
	/// Running count of characters on the current output line, as tracked by [sink].
	int lineLength;

	/++
		Appends a single character to [s], applying whitespace collapsing,
		pending indentation, and line wrapping.

		Params:
			item = character to output
			preformatted = when false, any whitespace character becomes a single space and runs of whitespace are collapsed
			lineWidthOverride = wrap column to use for this character instead of [width]; `int.min` means "use [width]"
	+/
	void sink(dchar item, bool preformatted, int lineWidthOverride = int.min) {

		if(needsIndent && item != '\n') {
			lineLength += doIndent();
			needsIndent = false;
		}

		int width = lineWidthOverride == int.min ? this.width : lineWidthOverride;
		if(!preformatted && isWhite(item)) {
			if(!justOutputWhitespace) {
				item = ' ';
				justOutputWhitespace = true;
			} else {
				return;
			}
		} else {
			// if it is preformatted, we still need to keep track of if it is whitespace
			// so stuff like <br> is somewhat sane
			justOutputWhitespace = preformatted && isWhite(item);
		}

		s ~= item;

		if(lineLength >= width) {
			// rewind to the nearest space, if there is one, to break on a word boundary
			int c = lineLength;
			bool broken;
			foreach_reverse(idx, char ch; s) {
				if(ch == '\n')
					break;
				if(ch == ' ') {
					// replace that space with a newline and re-indent the text that followed it
					auto os = s;
					s = os[0 .. idx];
					s ~= '\n';
					lineLength = cast(int)(os[idx+1..$].length);
					lineLength += doIndent();
					s ~= os[idx + 1 .. $];
					broken = true;
					break;
				}
				c--;
				// give up looking once we get close to the start of the line
				if(c < 5)
					break;
			}

			if(!broken) {
				// no usable space found: hard break right here
				s ~= '\n';
				lineLength = 0;
				needsIndent = true;
				justOutputWhitespace = true;
			}

		}


		if(item == '\n') {
			lineLength = 0;
			needsIndent = true;
		} else
			lineLength ++;


		if(!justOutputWhitespace) {
			justOutputBlock = false;
			justOutputMargin = false;
		}
	}

	/++
		Appends the indentation for all currently open blocks to [s].

		Returns: the number of spaces written.
	+/
	int doIndent() {
		int cnt = 0;
		foreach(i; indentStack)
			foreach(lol; 0 .. i) {
				s ~= ' ';
				cnt++;
			}
		return cnt;
	}

	/// Indent widths of the currently open blocks, outermost first.
	int[] indentStack;
	/// Set after a newline; tells [sink] to emit indentation before the next non-newline character.
	bool needsIndent = false;

	/++
		Opens a block: pushes `indent` onto [indentStack] and emits up to two
		newlines (block break and margin) unless they were just emitted.

		Params:
			indent = number of extra spaces to indent the block's lines by
	+/
	void startBlock(int indent = 0) {

		indentStack ~= indent;

		if(!justOutputBlock) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputBlock = true;
		}
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputMargin = true;
		}
	}

	/++
		Closes the innermost block: pops its indent off [indentStack] and emits
		a newline unless one was just emitted.
	+/
	void endBlock() {
		if(indentStack.length)
			indentStack = indentStack[0 .. $ - 1];

		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputMargin = true;
		}
	}
}

/++
	Converts an HTML string to text using a fresh [HtmlConverter].

	Params:
		html = HTML source to convert
		wantWordWrap = forwarded to [HtmlConverter.convert], which currently ignores it
		wrapAmount = column at which lines are wrapped

	Returns: the text rendering of `html`.
+/
string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	auto converter = new HtmlConverter();
	return converter.convert(html, wantWordWrap, wrapAmount);
}