/// Converts HTML into word-wrapped plain text, optionally decorated with VT color codes.
2 module arsd.htmltotext;
3 
4 import arsd.dom;
5 import arsd.color;
6 import std.string;
7 
8 import std.uni : isWhite;
9 
/++
	Renders an HTML string or an `arsd.dom` element tree as plain text.

	Output is accumulated in [s]. Block-level tags are separated by blank
	lines, inline whitespace is collapsed, and lines are wrapped at [width]
	columns while [wordWrapEnabled] is true. The converter is stateful: call
	[reset] before reusing an instance for an unrelated document.
+/
class HtmlConverter {
	/// Wrap column consulted by [sink]. Overwritten by every [htmlToText] call.
	int width;

	/++
		Will enable color output using VT codes. Determines color through dom.d's css support, which means you need to apply a stylesheet first.

		---
		import arsd.dom;

		auto document = new Document(source_code_for_html);
		auto stylesheet = new StyleSheet(source_code_for_css);
		stylesheet.apply(document);
		---
	+/
	bool enableVtOutput;


	/// Foreground color in effect for the element currently being rendered
	/// (inherited from ancestors; restored as the recursion unwinds).
	string color;
	/// Background color in effect for the element currently being rendered.
	string backgroundColor;

	/++
		When false, [sink] never inserts a line break because of line length.
		[convert] sets this from its `wantWordWrap` argument for the duration
		of the call and then restores the previous value.
	+/
	bool wordWrapEnabled = true;

	/++
		Appends the text rendering of `element` and all of its descendants to [s].

		Params:
			element = node to render. A `TextNode` is fed to [sink] one character at a time; any other element is dispatched on its tag name.
			preformatted = when true, whitespace is passed through to the output instead of being collapsed.
			width = wrap column; stored into `this.width` so [sink] can see it.
	+/
	void htmlToText(Element element, bool preformatted, int width) {
		// These locals deliberately shadow the members: they hold this
		// element's own computed style, the members hold the inherited value.
		string color, backgroundColor;
		if(enableVtOutput) {
			color = element.computedStyle.getValue("color");
			backgroundColor = element.computedStyle.getValue("background-color");
		}

		string originalColor = this.color, originalBackgroundColor = this.backgroundColor;

		this.color = color.length ? color : this.color;
		this.backgroundColor = backgroundColor.length ? backgroundColor : this.backgroundColor;

		scope(exit) {
			// the idea is as we pop working back up the tree, it restores what it was here
			this.color = originalColor;
			this.backgroundColor = originalBackgroundColor;
		}


		this.width = width;
		if(auto tn = cast(TextNode) element) {
			foreach(dchar ch; tn.nodeValue) {
				sink(ch, preformatted);
			}
		} else {
			// Renders every child with the same preformatted flag and width.
			void sinkChildren() {
				foreach(child; element.childNodes)
					htmlToText(child, preformatted, width);
			}
			switch(element.tagName) {
				case "head", "script", "style":
					// intentionally blank
				break;
				// The table stuff is removed right now because while it looks
				// ok for test tables, it isn't working well for the emails I have
				// - it handles data ok but not really nested layouts.
				// ("trlol" never matches a real tag, which is what disables it.)
				case "trlol":
					auto children = element.childElements;

					// A row without cells would divide by zero below; a width
					// of 0 routes it to the plain-block fallback instead.
					immutable cellCount = cast(int) children.length;
					auto tdWidth = cellCount ? (width - cellCount * 3) / cellCount : 0;
					if(tdWidth < 12) {
						// too narrow to be reasonable
						startBlock();
						sinkChildren();
						endBlock();
					} else {
						// Render each cell on its own converter, then stitch
						// the results together line by line.
						string[] tdBlocks;
						int longestBlock;
						foreach(child; children) {
							auto fmt = new HtmlConverter();

							fmt.htmlToText(child, false, tdWidth);
							tdBlocks ~= fmt.s;
							int lineCount = 1;
							foreach(ch; fmt.s)
								if(ch == '\n')
									lineCount++;
							if(lineCount > longestBlock)
								longestBlock = lineCount;
						}

						if(s.length && s[$-1] != '\n')
							s ~= '\n';
						foreach(lineNumber; 0 .. longestBlock) {
							foreach(bidx, ref block; tdBlocks) {
								if(bidx)
									s ~= " | ";

								// length of the line emitted for this cell on this row
								int emitted = 0;
								if(block.length) {
									auto idx = block.indexOf("\n");
									if(idx == -1)
										idx = block.length;

									s ~= block[0 .. idx];
									emitted = cast(int) idx;

									// consume the emitted line (and its newline, if any)
									if(idx == block.length)
										block = block[$..$];
									else
										block = block[idx + 1 .. $];
								}

								// Pad the cell out to tdWidth based on what was
								// just written, not on what is left in the block.
								foreach(a; emitted .. tdWidth)
									s ~= " ";

							}
							s ~= "\n";
						}

						// horizontal rule under the row, with "-+-" at the column joints
						foreach(a; 0 .. children.length) {
							foreach(w; 0 .. tdWidth) {
								s ~= "-";
							}
							if(a +1 != children.length)
								s ~= "-+-";
						}
						s ~= "\n";
					}
				break;
				case "tr":
					startBlock(2);
					sinkChildren();
					endBlock();
				break;
				case "td":
					startBlock(0);
					sinkChildren();
					endBlock();
				break;
				case "a":
					sinkChildren();
					// Only spell out the target when it differs from the link text.
					if(element.href != element.innerText) {
						sink(' ', false);
						sink('<', false);
						// I want the link itself to NOT word wrap
						// to make for easier double-clicking of it in
						// the terminal
						foreach(dchar ch; element.href)
							sink(ch, false, int.max);
						sink('>', false);
					}
				break;
				case "span":
					if(enableVtOutput) {
						auto csc = color; // element.computedStyle.getValue("color");
						if(csc.length) {
							auto c = Color.fromString(csc);
							s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b);
						}

						bool bold = element.computedStyle.getValue("font-weight") == "bold";

						if(bold)
							s ~= "\033[1m";

						sinkChildren();

						// SGR 22 switches bold off without touching colors;
						// SGR 0 would also wipe the color of an enclosing span.
						if(bold)
							s ~= "\033[22m";
						if(csc.length)
							s ~= "\033[39m";
					} else {
						sinkChildren();
					}
				break;
				case "p":
					startBlock();
					sinkChildren();
					endBlock();
				break;
				case "b", "strong":
				case "em", "i":
					if(element.innerText.length == 0)
						break;
					if(enableVtOutput) {
						s ~= "\033[1m";
						sinkChildren();
						// bold off only, so a surrounding span keeps its color
						s ~= "\033[22m";
					} else {
						sink('*', false);
						sinkChildren();
						sink('*', false);
					}
				break;
				case "u":
					if(element.innerText.length == 0)
						break;
					sink('_', false);
					sinkChildren();
					sink('_', false);
				break;
				case "ul":
					ulDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					ulDepth--;
				break;
				case "ol":
					olDepth++;
					startBlock(2);
					sinkChildren();
					endBlock();
					olDepth--;
				break;
				case "li":
					startBlock();

					//sink('\t', true);
					/*
					foreach(cnt; 0 .. olDepth + ulDepth) {
						sink(' ', true);
						sink(' ', true);
					}
					*/
					// one '*' per kind of list currently open, then a space
					if(olDepth)
						sink('*', false);
					if(ulDepth)
						sink('*', false);
					sink(' ', true);

					sinkChildren();

					endBlock();
				break;

				case "dl":
				case "dt":
				case "dd":
					// only the definition body is indented
					startBlock(element.tagName == "dd" ? 2 : 0);
					sinkChildren();
					endBlock();
				break;

				case "h1":
					// rendered as "## text ##"
					startBlock();
					sink('#', true);
					sink('#', true);
					sink(' ', true);
					sinkChildren();
					sink(' ', true);
					sink('#', true);
					sink('#', true);
					endBlock();
				break;
				case "h2", "h3":
					// text followed by an underline ('=' for h2, '-' for h3),
					// one underline character per character of the inner text
					startBlock();
					sinkChildren();
					sink('\n', true);
					foreach(dchar ch; element.innerText)
						sink(element.tagName == "h2" ? '=' : '-', false);
					endBlock();
				break;
				case "hr":
					// a rule half the width wide, offset by a quarter of the width
					startBlock();
					foreach(i; 0 .. width / 4)
						sink(' ', true);
					foreach(i; 0 .. width / 2)
						sink('-', false);
					endBlock();
				break;

				case "br":
					sink('\n', true);
				break;
				case "div":
					startBlock();

					/*
					auto csc = element.computedStyle.getValue("background-color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[48;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					*/

					sinkChildren();

					/*
					if(csc.length)
						s ~= "\033[49m";
					*/

					endBlock();
				break;
				case "pre":
					// children are rendered with preformatted forced on
					startBlock(4);
					foreach(child; element.childNodes)
						htmlToText(child, true, width);
					endBlock();
				break;
				default:
					sinkChildren();
			}
		}
	}

	/// Number of `<ol>` elements currently open around the node being rendered.
	int olDepth;
	/// Number of `<ul>` elements currently open around the node being rendered.
	int ulDepth;

	/++
		Parses `html` and converts it to text.

		The markup is wrapped in a `<roottag>` element before parsing. If the
		parsed document contains a `<body>`, rendering starts there; otherwise
		it starts at the document root.

		Params:
			html = markup to convert.
			wantWordWrap = false disables line wrapping for this call.
			wrapAmount = column at which lines are wrapped.

		Returns: the accumulated output, [s].
	+/
	string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) {
		Document document = new Document;

		document.parse("<roottag>" ~ html ~ "</roottag>");

		Element start;
		auto bod = document.getElementsByTagName("body");
		if(bod.length)
			start = bod[0];
		else
			start = document.root;

		//import std.file;
		//auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css"));
		//stylesheet.apply(document);

		return convert(start, wantWordWrap, wrapAmount);
	}

	/++
		Converts an already-parsed element tree to text.

		Output is appended to whatever [s] already holds; call [reset] first
		to start from a clean slate.

		Params:
			start = element to begin rendering at.
			wantWordWrap = false disables line wrapping for this call.
			wrapAmount = column at which lines are wrapped.

		Returns: the accumulated output, [s].
	+/
	string convert(Element start, bool wantWordWrap = true, int wrapAmount = 74) {
		// Honor the caller's choice for this call only.
		immutable previousWrap = wordWrapEnabled;
		wordWrapEnabled = wantWordWrap;
		scope(exit) wordWrapEnabled = previousWrap;

		htmlToText(start, false, wrapAmount);
		return s;
	}

	/++
		Discards the accumulated output and returns all layout bookkeeping to
		its initial state so the instance can be reused for another document.
	+/
	void reset() {
		s = null;
		justOutputWhitespace = true;
		justOutputBlock = true;
		justOutputMargin = true;
		// Without these, a conversion that ended mid-line would make the
		// first line of the next one wrap early.
		lineLength = 0;
		needsIndent = false;
		indentStack = null;
		olDepth = 0;
		ulDepth = 0;
	}

	/// The accumulated text output.
	string s;
	/// True while the last thing emitted was whitespace; used to collapse runs.
	bool justOutputWhitespace = true;
	/// True until non-whitespace follows a block start; suppresses a repeated line break in [startBlock].
	bool justOutputBlock = true;
	/// True until non-whitespace follows a block margin; suppresses repeated blank lines.
	bool justOutputMargin = true;
	/// Number of characters (including indentation) on the current output line.
	int lineLength;

	/++
		Appends one character to [s], collapsing whitespace, applying pending
		indentation and wrapping the line when it reaches the wrap column.

		Params:
			item = character to emit.
			preformatted = when true, whitespace is emitted verbatim instead of being collapsed to a single space.
			lineWidthOverride = wrap column for this character only; `int.min` means use `this.width`.
	+/
	void sink(dchar item, bool preformatted, int lineWidthOverride = int.min) {
		import std.utf : count;

		if(needsIndent && item != '\n') {
			lineLength += doIndent();
			needsIndent = false;
		}

		int width = lineWidthOverride == int.min ? this.width : lineWidthOverride;
		if(!preformatted && isWhite(item)) {
			if(!justOutputWhitespace) {
				item = ' ';
				justOutputWhitespace = true;
			} else {
				return;
			}
		} else {
			// if it is preformatted, we still need to keep track of if it is whitespace
			// so stuff like <br> is somewhat sane
			justOutputWhitespace = preformatted && isWhite(item);
		}

		s ~= item;

		if(wordWrapEnabled && lineLength >= width) {
			// rewind to the nearest space, if there is one, to break on a word boundary
			int c =  lineLength;
			bool broken;
			foreach_reverse(idx, char ch; s) {
				if(ch == '\n')
					break;
				if(ch == ' ') {
					// replace that space with a newline and re-indent the tail
					auto os = s;
					s = os[0 .. idx];
					s ~= '\n';
					// count code points, matching how lineLength is
					// incremented once per dchar everywhere else
					lineLength = cast(int) count(os[idx + 1 .. $]);
					lineLength += doIndent();
					s ~= os[idx + 1 .. $];
					broken = true;
					break;
				}
				// give up once the countdown from lineLength drops below 5
				c--;
				if(c < 5)
					break;
			}

			if(!broken) {
				// no usable space found: hard break right here
				s ~= '\n';
				lineLength = 0;
				needsIndent = true;
				justOutputWhitespace = true;
			}

		}


		if(item == '\n') {
			lineLength = 0;
			needsIndent = true;
		} else
			lineLength ++;


		if(!justOutputWhitespace) {
			justOutputBlock = false;
			justOutputMargin = false;
		}
	}

	/++
		Appends the indentation implied by every entry of [indentStack] to [s].

		Returns: the number of spaces written.
	+/
	int doIndent() {
		int cnt = 0;
		foreach(i; indentStack)
			foreach(lol; 0 .. i) {
				s ~= ' ';
				cnt++;
			}
		return cnt;
	}

	/// Indent widths of the blocks currently open, outermost first.
	int[] indentStack;
	/// Set after a line break; makes the next non-newline [sink] call indent first.
	bool needsIndent = false;

	/++
		Opens a block: pushes `indent` onto [indentStack] and emits a line
		break and/or a blank-line margin unless one was just emitted.

		Params:
			indent = extra spaces of indentation for lines inside this block.
	+/
	void startBlock(int indent = 0) {

		indentStack ~= indent;

		if(!justOutputBlock) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputBlock = true;
		}
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputMargin = true;
		}
	}

	/++
		Closes the innermost block: pops [indentStack] (if non-empty) and ends
		the current line unless a margin was just emitted.
	+/
	void endBlock() {
		if(indentStack.length)
			indentStack = indentStack[0 .. $ - 1];

		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			needsIndent = true;
			justOutputMargin = true;
		}
	}
}
463 
/++
	Convenience wrapper that converts `html` with a fresh [HtmlConverter].

	Params:
		html = markup to convert.
		wantWordWrap = forwarded to `HtmlConverter.convert`.
		wrapAmount = wrap column, forwarded to `HtmlConverter.convert`.

	Returns: the text produced by the converter.
+/
string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	auto converter = new HtmlConverter();
	// forward the caller's flag rather than a hard-coded `true`
	return converter.convert(html, wantWordWrap, wrapAmount);
}
469