1 /++
2 	Converts HTML to plain text. Can also output VT escape sequences for terminal output.
3 
4 	The exact output of this is subject to change - it is just what appears nice for me. (I actually use this on my personal email setup.)
5 +/
6 module arsd.htmltotext;
7 
8 import arsd.dom;
9 import arsd.color;
10 import std.string;
11 
12 import std.uni : isWhite;
13 import std.string : indexOf, startsWith, endsWith, strip;
14 
/++
	Walks a DOM element tree and accumulates a plain-text rendering of it in `s`,
	optionally decorated with VT escape sequences (see `enableVtOutput`).

	Output is appended to `s` by every conversion; call `reset` before reusing
	one instance for an unrelated document.
+/
class HtmlConverter {
	/// Wrap column currently in effect. Overwritten by every call to `htmlToText`.
	int width;

	/++
		Will enable color output using VT codes. Determines color through dom.d's css support, which means you need to apply a stylesheet first.

		---
		import arsd.dom;

		auto document = new Document(source_code_for_html);
		auto stylesheet = new StyleSheet(source_code_for_css);
		stylesheet.apply(document);
		---
	+/
	bool enableVtOutput;


	/// Foreground color in effect for the element currently being walked (restored as the walk unwinds).
	string color;
	/// Background color in effect for the element currently being walked (restored as the walk unwinds).
	string backgroundColor;

	/++
		Appends the text rendering of `element` and all of its descendants to `s`.

		Params:
			element = node to render; text nodes are emitted character by character, other nodes are dispatched on their tag name
			preformatted = when true, whitespace is emitted verbatim instead of being collapsed
			width = column at which lines are wrapped; also stored into `this.width`
	+/
	void htmlToText(Element element, bool preformatted, int width) {
		// These hold only what this element's own computed style reports; the
		// "span" case below relies on that and must not see the inherited value.
		string elementColor, elementBackgroundColor;
		if(enableVtOutput) {
			elementColor = element.computedStyle.getValue("color");
			elementBackgroundColor = element.computedStyle.getValue("background-color");
		}

		string originalColor = this.color, originalBackgroundColor = this.backgroundColor;

		this.color = elementColor.length ? elementColor : this.color;
		this.backgroundColor = elementBackgroundColor.length ? elementBackgroundColor : this.backgroundColor;

		scope(exit) {
			// the idea is as we pop working back up the tree, it restores what it was here
			this.color = originalColor;
			this.backgroundColor = originalBackgroundColor;
		}


		this.width = width;
		if(auto tn = cast(TextNode) element) {
			foreach(dchar ch; tn.nodeValue) {
				sink(ch, preformatted);
			}
		} else {
			void sinkChildren() {
				foreach(child; element.childNodes)
					htmlToText(child, preformatted, width);
			}
			switch(element.tagName) {
				case "head", "script", "style":
					// intentionally blank
				break;
				// The table stuff is removed right now because while it looks
				// ok for test tables, it isn't working well for the emails I have
				// - it handles data ok but not really nested layouts.
				// (Hence the tag name that no real table row matches.)
				case "trlol":
					sinkRowAsColumns(element, preformatted, width);
				break;
				case "tr":
					startBlock(2);
					sinkChildren();
					endBlock();
				break;
				case "td":
					startBlock(0);
					sinkChildren();
					endBlock();
				break;
				case "a":
					sinkChildren();
					// An anchor without a target (e.g. <a name="...">) has no URL
					// worth printing; without this check it would render as " <>".
					auto href = element.href;
					if(href.length && href != element.innerText) {
						sink(' ', false);
						sink('<', false);
						// I want the link itself to NOT word wrap
						// to make for easier double-clicking of it in
						// the terminal
						foreach(dchar ch; href)
							sink(ch, false, int.max);
						sink('>', false);
					}
				break;
				case "span":
					if(enableVtOutput) {
						auto csc = elementColor;
						if(csc.length) {
							auto c = Color.fromString(csc);
							s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b);
						}

						bool bold = element.computedStyle.getValue("font-weight") == "bold";

						if(bold)
							s ~= "\033[1m";

						sinkChildren();

						// SGR 22 turns off bold only; SGR 0 would also wipe any
						// color an enclosing element has switched on.
						if(bold)
							s ~= "\033[22m";
						if(csc.length)
							s ~= "\033[39m";
					} else {
						sinkChildren();
					}
				break;
				case "p":
					startBlock();
					sinkChildren();
					endBlock();
				break;
				case "b", "strong", "em", "i":
					if(element.innerText.length == 0)
						break;
					if(enableVtOutput) {
						s ~= "\033[1m";
						sinkChildren();
						// SGR 22 (normal intensity) rather than SGR 0 so that the
						// color of a surrounding span survives the emphasis.
						s ~= "\033[22m";
					} else {
						sink('*', false);
						sinkChildren();
						sink('*', false);
					}
				break;
				case "u":
					if(element.innerText.length == 0)
						break;
					sink('_', false);
					sinkChildren();
					sink('_', false);
				break;
				case "ul":
					ulDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					ulDepth--;
				break;
				case "ol":
					olDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					olDepth--;
				break;
				case "li":
					startBlock();

					// one bullet character per kind of list we are currently inside
					if(olDepth)
						sink('*', false);
					if(ulDepth)
						sink('*', false);
					sink(' ', true);

					sinkChildren();

					endBlock();
				break;

				case "dl", "dt", "dd":
					// only the definition body is indented
					startBlock(element.tagName == "dd" ? 2 : 0);
					sinkChildren();
					endBlock();
				break;

				case "h1":
					startBlock();
					sink('#', true);
					sink('#', true);
					sink(' ', true);
					sinkChildren();
					sink(' ', true);
					sink('#', true);
					sink('#', true);
					endBlock();
				break;
				case "h2", "h3":
					startBlock();
					sinkChildren();
					sink('\n', true);
					// underline with one rule character per character of the heading text
					dchar underline = element.tagName == "h2" ? '=' : '-';
					foreach(dchar ch; element.innerText)
						sink(underline, false);
					endBlock();
				break;
				case "hr":
					// a rule half the line wide, starting a quarter of the way in
					startBlock();
					foreach(i; 0 .. width / 4)
						sink(' ', true);
					foreach(i; 0 .. width / 2)
						sink('-', false);
					endBlock();
				break;

				case "br":
					sink('\n', true);
				break;
				case "div":
					startBlock();
					sinkChildren();
					endBlock();
				break;
				case "pre":
					startBlock(4);
					// children are walked with preformatted forced on
					foreach(child; element.childNodes)
						htmlToText(child, true, width);
					endBlock();
				break;
				default:
					sinkChildren();
			}
		}
	}

	/+
		Experimental side-by-side rendering of a table row: each child element is
		rendered by its own converter at an equal share of `width`, then the
		resulting lines are stitched together with " | " separators and closed by
		a horizontal rule. Rows whose cells would be narrower than 12 columns
		(including rows with no cells at all) are emitted as an ordinary block.
	+/
	private void sinkRowAsColumns(Element element, bool preformatted, int width) {
		auto children = element.childElements;
		immutable cellCount = cast(int) children.length;

		// Integer division by zero traps, so an empty row must never reach the
		// division; a width of 0 routes it to the plain-block fallback below.
		immutable tdWidth = cellCount ? (width - cellCount * 3) / cellCount : 0;

		if(tdWidth < 12) {
			// too narrow to be reasonable
			startBlock();
			foreach(child; element.childNodes)
				htmlToText(child, preformatted, width);
			endBlock();
			return;
		}

		string[] tdBlocks;
		int longestBlock;
		foreach(child; children) {
			auto fmt = new HtmlConverter();

			fmt.htmlToText(child, false, tdWidth);
			tdBlocks ~= fmt.s;
			int lineCount = 1;
			foreach(ch; fmt.s)
				if(ch == '\n')
					lineCount++;
			if(lineCount > longestBlock)
				longestBlock = lineCount;
		}

		if(s.length && s[$-1] != '\n')
			s ~= '\n';
		foreach(lineNumber; 0 .. longestBlock) {
			foreach(bidx, ref block; tdBlocks) {
				if(bidx)
					s ~= " | ";

				// how much of this cell's column the current line has filled
				int emitted = 0;
				if(block.length) {
					auto idx = block.indexOf("\n");
					if(idx == -1)
						idx = block.length;

					s ~= block[0 .. idx];
					emitted = cast(int) idx;

					if(idx == block.length)
						block = block[$..$];
					else
						block = block[idx + 1 .. $];
				}

				// Pad from what was just written (not from what remains in the
				// cell) so that the following separators line up.
				if(emitted < tdWidth)
					foreach(a; 0 .. tdWidth - emitted)
						s ~= " ";
			}
			s ~= "\n";
		}

		foreach(a; 0 .. children.length) {
			foreach(w; 0 .. tdWidth) {
				s ~= "-";
			}
			if(a + 1 != children.length)
				s ~= "-+-";
		}
		s ~= "\n";
	}

	/// Number of `ol` elements the walk is currently inside.
	int olDepth;
	/// Number of `ul` elements the walk is currently inside.
	int ulDepth;

	/++
		Parses `html` and converts it, starting at the first `body` element if
		there is one and at the document root otherwise.

		Params:
			html = markup to convert; it is wrapped in a synthetic `roottag` element before parsing
			wantWordWrap = accepted for compatibility; not currently consulted
			wrapAmount = column at which lines are wrapped

		Returns: the accumulated text, i.e. `s`
	+/
	string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) {
		Document document = new Document;

		document.parse("<roottag>" ~ html ~ "</roottag>");

		Element start;
		auto bod = document.getElementsByTagName("body");
		if(bod.length)
			start = bod[0];
		else
			start = document.root;

		//import std.file;
		//auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css"));
		//stylesheet.apply(document);

		return convert(start, wantWordWrap, wrapAmount);
	}

	/++
		Converts the tree rooted at `start`, appending to whatever `s` already holds.

		Params:
			start = element to begin at
			wantWordWrap = accepted for compatibility; not currently consulted
			wrapAmount = column at which lines are wrapped

		Returns: the accumulated text, i.e. `s`
	+/
	string convert(Element start, bool wantWordWrap = true, int wrapAmount = 74) {
		htmlToText(start, false, wrapAmount);
		return s;
	}

	/++
		Discards the accumulated output and returns all layout bookkeeping to its
		initial state so the instance can be used for another document.
	+/
	void reset() {
		s = null;
		justOutputWhitespace = true;
		justOutputBlock = true;
		justOutputMargin = true;

		// Position and nesting state has to go too, otherwise the next document
		// starts out wrapped and indented as if it continued the previous one.
		lineLength = 0;
		needsIndent = false;
		indentStack = null;
		olDepth = 0;
		ulDepth = 0;
	}

	/// The text produced so far.
	string s;
	/// True while the most recent output was whitespace; used to collapse runs of it.
	bool justOutputWhitespace = true;
	/// True while nothing but whitespace has been emitted since the last block break.
	bool justOutputBlock = true;
	/// True while nothing but whitespace has been emitted since the last blank-line margin.
	bool justOutputMargin = true;
	/// Running length of the current output line, including indentation.
	int lineLength;

	/++
		Appends one character to `s`, collapsing whitespace, applying pending
		indentation and wrapping the line once it has reached the limit.

		Params:
			item = character to emit
			preformatted = when true, whitespace is kept verbatim instead of being collapsed to a single space
			lineWidthOverride = wrap column for this character only; `int.min` means use `width`
	+/
	void sink(dchar item, bool preformatted, int lineWidthOverride = int.min) {

		// indentation is deferred until a line actually receives content
		if(needsIndent && item != '\n') {
			lineLength += doIndent();
			needsIndent = false;
		}

		int limit = lineWidthOverride == int.min ? this.width : lineWidthOverride;
		if(!preformatted && isWhite(item)) {
			if(!justOutputWhitespace) {
				item = ' ';
				justOutputWhitespace = true;
			} else {
				return;
			}
		} else {
			// if it is preformatted, we still need to keep track of if it is whitespace
			// so stuff like <br> is somewhat sane
			justOutputWhitespace = preformatted && isWhite(item);
		}

		s ~= item;

		if(lineLength >= limit) {
			// rewind to the nearest space, if there is one, to break on a word boundary
			// NOTE(review): this scan and the length it computes work in UTF-8 code
			// units, so non-ASCII text is measured wider than it displays.
			int c =  lineLength;
			bool broken;
			foreach_reverse(idx, char ch; s) {
				if(ch == '\n')
					break;
				if(ch == ' ') {
					// replace that space with a newline and re-indent the tail
					auto os = s;
					s = os[0 .. idx];
					s ~= '\n';
					lineLength = cast(int)(os[idx+1..$].length);
					lineLength += doIndent();
					s ~= os[idx + 1 .. $];
					broken = true;
					break;
				}
				c--;
				// never rewind into the first few columns of the line
				if(c < 5)
					break;
			}

			if(!broken) {
				// no usable space: hard break right after the character just written
				s ~= '\n';
				lineLength = 0;
				needsIndent = true;
				justOutputWhitespace = true;
			}

		}


		if(item == '\n') {
			lineLength = 0;
			needsIndent = true;
		} else
			lineLength ++;


		if(!justOutputWhitespace) {
			justOutputBlock = false;
			justOutputMargin = false;
		}
	}

	/++
		Appends the indentation required by every open block to `s`.

		Returns: the number of spaces written
	+/
	int doIndent() {
		int cnt = 0;
		foreach(i; indentStack)
			foreach(lol; 0 .. i) {
				s ~= ' ';
				cnt++;
			}
		return cnt;
	}

	/// Indentation contributed by each currently open block, outermost first.
	int[] indentStack;
	/// Set after a line break; makes the next non-newline character emit the indentation first.
	bool needsIndent = false;

	// Ends the current output line and arranges for the next one to be indented.
	private void breakLine() {
		s ~= "\n";
		lineLength = 0;
		needsIndent = true;
	}

	/++
		Opens a block: pushes its indentation and makes sure it is separated from
		preceding content by a line break and a blank-line margin.

		Params:
			indent = number of extra spaces to indent the block's lines by
	+/
	void startBlock(int indent = 0) {

		indentStack ~= indent;

		if(!justOutputBlock) {
			breakLine();
			justOutputBlock = true;
		}
		if(!justOutputMargin) {
			breakLine();
			justOutputMargin = true;
		}
	}

	/++
		Closes the innermost block: pops its indentation and terminates the
		current line unless a margin was just emitted.
	+/
	void endBlock() {
		if(indentStack.length)
			indentStack = indentStack[0 .. $ - 1];

		if(!justOutputMargin) {
			breakLine();
			justOutputMargin = true;
		}
	}
}
468 
/++
	Convenience wrapper: converts `html` to plain text with a fresh, default-configured
	HtmlConverter and returns the result.

	Params:
		html = markup to convert
		wantWordWrap = forwarded unchanged to `HtmlConverter.convert`
		wrapAmount = forwarded unchanged to `HtmlConverter.convert`
+/
string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	return (new HtmlConverter()).convert(html, wantWordWrap, wrapAmount);
}
474