arsd.characterencodings source code

1 // helper program is in ~me/encodings.d to make more tables from wikipedia
2 
3 /**
4 	This is meant to help get data from the wild into utf8 strings
5 	so you can work with them easily inside D.
6 
7 	The main function is convertToUtf8(), which takes a byte array
8 	of your raw data (a byte array because it isn't really a D string
9 	yet until it is utf8), and a runtime string naming its current
10 	encoding.
11 
12 	The current encoding argument is meant to come from the data's
13 	metadata, and is flexible on exact format - it is case insensitive
14 	and takes several variations on the names.
15 
16 	This way, you should be able to send it the encoding string directly
17 	from an XML document, a HTTP header, or whatever you have, and it
18 	ought to just work.
19 
20 	Example:
21 		---
22 		auto data = cast(immutable(ubyte)[])
23 			std.file.read("my-windows-file.txt");
24 		string utf8String = convertToUtf8(data, "windows-1252");
25 		// utf8String can now be used
26 		---
27 
28 
29 	The encodings currently implemented for decoding are:
30 		$(LIST
31 			* UTF-8 (a no-op; it simply casts the array to string)
32 			* UTF-16,
33 			* UTF-32,
34 			* Windows-1252 and Windows-1251,
35 			* ISO 8859 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, and 16.
36 			* KOI8-R
37 		)
38 
39 	It treats ISO 8859-1, Latin-1, and Windows-1252 the same way, since
40 	those labels are pretty much de-facto the same thing in wild documents (people mislabel them a lot and I found it more useful to just deal with it than to be pedantic).
41 
42 	This module currently makes no attempt to look at control characters.
43 */
44 module arsd.characterencodings;
45 
46 import std.string;
47 import std.array;
48 import std.conv;
49 
50 // FIXME: use replacement char here instead
51 
/**
	Best-effort variant of convertToUtf8 that never throws an Exception.

	It first attempts the regular conversion and validates the result as
	UTF-8. If either step throws an Exception (for example because the
	encoding name is unknown), it falls back to a lossy pass over the raw
	bytes: bytes below 128 are kept as-is and every other byte is replaced
	by U+FFFD, the Unicode replacement character.

	Params:
		data = the raw bytes to convert
		dataCharacterEncoding = the encoding name, passed through to convertToUtf8

	Returns:
		The converted text, or the lossy ASCII-plus-U+FFFD fallback.
*/
string convertToUtf8Lossy(immutable(ubyte)[] data, string dataCharacterEncoding) {
	try {
		import std.utf;
		auto converted = convertToUtf8(data, dataCharacterEncoding);
		validate(converted);
		return converted;
	} catch(Exception) {
		// conversion or validation failed; use the lossy path below
	}

	string fallback;
	foreach(octet; data) {
		if(octet >= 128)
			fallback ~= '\uFFFD';
		else
			fallback ~= octet;
	}
	return fallback;
}
69 
/**
	Takes data from a given character encoding and returns it as UTF-8.

	Params:
		data = the raw bytes to decode
		dataCharacterEncoding = the name of the encoding `data` is in. The
			name is lower-cased and has spaces, dashes, and underscores
			removed before matching, so "ISO-8859-1", "iso_8859_1" and
			"ISO 8859 1" are all equivalent.

	Returns:
		The text as a UTF-8 string. For "utf8", "ascii", and "usascii" the
		input is returned as-is (cast to string) without validation.

	Throws:
		Exception if the encoding name is not recognized, or if UTF-16 /
		UTF-32 input has a length that is not a whole number of code
		units. Malformed UTF-16 / UTF-32 sequences are reported by
		std.conv.to.

	UTF-16 and UTF-32 are decoded with an explicit byte order, so the
	result does not depend on the host machine. "utf16" and "utf32" without
	a suffix are treated as little-endian, like "utf16le" and "utf32le".
	The big-endian labels ("UTF-16 BE", "UTF-32 BE") are accepted too, since
	tryToDetermineEncoding can return them. A leading byte order mark is
	not removed.
*/
string convertToUtf8(immutable(ubyte)[] data, string dataCharacterEncoding) {
	// just to normalize the passed string...
	auto encoding = dataCharacterEncoding.toLower();
	encoding = encoding.replace(" ", "");
	encoding = encoding.replace("-", "");
	encoding = encoding.replace("_", "");
	// should be good enough.

	switch(encoding) {
		default:
			throw new Exception("I don't know how to convert " ~ dataCharacterEncoding ~ " to UTF-8");
		// just want to cover all the bases with one runtime function.
		case "utf16":
		case "utf16le":
			return to!string(assembleCodeUnits!wchar(data, false, dataCharacterEncoding));
		case "utf16be":
			return to!string(assembleCodeUnits!wchar(data, true, dataCharacterEncoding));
		case "utf32":
		case "utf32le":
			return to!string(assembleCodeUnits!dchar(data, false, dataCharacterEncoding));
		case "utf32be":
			return to!string(assembleCodeUnits!dchar(data, true, dataCharacterEncoding));
		case "ascii":
		case "usascii": // utf-8 is a superset of ascii
		case "utf8":
			// since the input is immutable, a plain cast is ok here
			return cast(string) data;
		// and now the various 8 bit encodings we support.
		case "windows1252":
			return decodeImpl(data, ISO_8859_1, Windows_1252);
		case "windows1251":
			return decodeImpl(data, Windows_1251, Windows_1251_Lower);
		case "koi8r":
			return decodeImpl(data, KOI8_R, KOI8_R_Lower);
		case "latin1":
		case "iso88591":
			// Why am I putting Windows_1252 here? A lot of
			// stuff in the wild is mislabeled, so this will
			// do some good in the Just Works department.
			// Regardless, I don't handle the
			// control char set in that zone anyway right now.
			return decodeImpl(data, ISO_8859_1, Windows_1252);
		case "iso88592":
			return decodeImpl(data, ISO_8859_2);
		case "iso88593":
			return decodeImpl(data, ISO_8859_3);
		case "iso88594":
			return decodeImpl(data, ISO_8859_4);
		case "iso88595":
			return decodeImpl(data, ISO_8859_5);
		case "iso88596":
			return decodeImpl(data, ISO_8859_6);
		case "iso88597":
			return decodeImpl(data, ISO_8859_7);
		case "iso88598":
			return decodeImpl(data, ISO_8859_8);
		case "iso88599":
			return decodeImpl(data, ISO_8859_9);
		case "iso885910":
			return decodeImpl(data, ISO_8859_10);
		case "iso885911":
			return decodeImpl(data, ISO_8859_11);
		case "iso885913":
			return decodeImpl(data, ISO_8859_13);
		case "iso885914":
			return decodeImpl(data, ISO_8859_14);
		case "iso885915":
			return decodeImpl(data, ISO_8859_15);
		case "iso885916":
			return decodeImpl(data, ISO_8859_16);
	}

	assert(0);
}

// Rebuilds fixed-width code units (wchar or dchar) from raw bytes using an
// explicit byte order. Throws an Exception, rather than tripping a runtime
// array-cast error, when the byte count is not a multiple of the unit size.
private immutable(T)[] assembleCodeUnits(T)(immutable(ubyte)[] bytes, bool bigEndian, string label) {
	if(bytes.length % T.sizeof != 0)
		throw new Exception("The " ~ label ~ " data is truncated: its length is not a multiple of " ~ to!string(T.sizeof) ~ " bytes");

	auto units = new T[](bytes.length / T.sizeof);
	foreach(i, ref unit; units) {
		auto chunk = bytes[i * T.sizeof .. (i + 1) * T.sizeof];
		uint value = 0;
		// fold the most significant byte in first
		if(bigEndian) {
			foreach(b; chunk)
				value = (value << 8) | b;
		} else {
			foreach_reverse(b; chunk)
				value = (value << 8) | b;
		}
		unit = cast(T) value;
	}

	// units was freshly allocated here and is not referenced anywhere else
	return cast(immutable(T)[]) units;
}
142 
/**
	Tries to determine the current encoding based on the content.
	Only really helps with the UTF variants.

	The data is first validated as UTF-8; anything that validates (including
	plain ASCII and empty input) is reported as "UTF-8". Otherwise the
	leading bytes are compared against the UTF-32 and UTF-16 byte order
	marks.

	The UTF-32 marks are tested before the UTF-16 ones because the UTF-32 LE
	mark (FF FE 00 00) begins with the UTF-16 LE mark (FF FE); testing in
	the other order would make UTF-32 LE undetectable. The flip side is that
	UTF-16 LE text whose first character after the mark is U+0000 is
	reported as UTF-32 LE.

	Params:
		rawdata = the bytes to inspect

	Returns:
		"UTF-8", "UTF-16 LE", "UTF-16 BE", "UTF-32 LE", or "UTF-32 BE",
		or null if it can't be reasonably sure.
*/
string tryToDetermineEncoding(in ubyte[] rawdata) {
	import std.utf;
	try {
		validate!string(cast(string) rawdata);
		// the odds of non-UTF-8 data validating as utf-8 are pretty low
		return "UTF-8";
	} catch(UTFException t) {
		// it's definitely not UTF-8! Look for a BOM below.
		// (not checking for the utf8 bom; if it was that, we
		// wouldn't be here.)
	}

	// the four byte marks must be tried first, see the doc comment
	if(rawdata.length >= 4) {
		if(rawdata[0] == 0x00 && rawdata[1] == 0x00
		&& rawdata[2] == 0xfe && rawdata[3] == 0xff)
			return "UTF-32 BE";
		if(rawdata[0] == 0xff && rawdata[1] == 0xfe
		&& rawdata[2] == 0x00 && rawdata[3] == 0x00)
			return "UTF-32 LE";
	}

	if(rawdata.length >= 2) {
		if(rawdata[0] == 0xff && rawdata[1] == 0xfe)
			return "UTF-16 LE";
		if(rawdata[0] == 0xfe && rawdata[1] == 0xff)
			return "UTF-16 BE";
	}

	// we don't know with enough confidence. The app will have to find another way.
	return null;
}
179 
// Table-driven decoder for the single-byte encodings: every input byte is
// looked up in one of three translation tables (the ones defined below) and
// the resulting code point is appended, UTF-8 encoded, to the output.
//
// chars160to255 is required and covers bytes 160 to 255.
// chars128to159 is optional; without it, bytes 128 to 159 become a space.
// chars0to127 is optional; without it, bytes 0 to 127 are copied as ASCII.
//
// The in contract checks the table lengths and the out contract validates
// the result as UTF-8.
string decodeImpl(in ubyte[] data, in dchar[] chars160to255, in dchar[] chars128to159 = null, in dchar[] chars0to127 = null)
	in {
		assert(chars160to255.length == 256 - 160);
		assert(chars128to159 is null || chars128to159.length == 160 - 128);
		assert(chars0to127 is null || chars0to127.length == 128 - 0);
	}
	out(ret) {
		import std.utf;
		validate(ret);
	}
do {
	string result;

	// not the fastest way to build the string, but it is simple
	foreach(b; data) {
		dchar decoded;
		if(b >= 160)
			decoded = chars160to255[b - 160];
		else if(b >= 128)
			decoded = (chars128to159 is null) ? ' ' : chars128to159[b - 128];
		else
			decoded = (chars0to127 is null) ? cast(dchar) b : chars0to127[b]; // ascii is the same
		result ~= decoded;
	}

	return result;
}
215 
216 
217 // Here come the translation tables.
218 
// Windows-1252 characters for bytes 0x80 through 0x9F (decimal 128 to 159).
// Bytes below 128 are the same as ascii and bytes above 159 are the same
// as iso 8859 1, seen below. Bytes with no character hold a plain space.
immutable dchar[] Windows_1252 = [
	'€', ' ', '‚', 'ƒ', '„', '…', '†', '‡', // 0x80
	'ˆ', '‰', 'Š', '‹', 'Œ', ' ', 'Ž', ' ', // 0x88
	' ', '‘', '’', '“', '”', '•', '–', '—', // 0x90
	'˜', '™', 'š', '›', 'œ', ' ', 'ž', 'Ÿ', // 0x98
];
227 
228 // the following tables give the characters from decimal 160 up to 255
229 // in the given encodings.
230 
// ISO 8859-1 (Latin-1) characters for bytes 0xA0 through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
immutable dchar[] ISO_8859_1 = [
	'\u00a0', '¡', '¢', '£', '¤', '¥', '¦', '§', // 0xA0
	'¨', '©', 'ª', '«', '¬', '\u00ad', '®', '¯', // 0xA8
	'°', '±', '²', '³', '´', 'µ', '¶', '·', // 0xB0
	'¸', '¹', 'º', '»', '¼', '½', '¾', '¿', // 0xB8
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', // 0xC0
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8
	'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', // 0xD0
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', // 0xD8
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', // 0xE0
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8
	'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', // 0xF0
	'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ'];
244 
// ISO 8859-2 (Latin-2) characters for bytes 0xA0 through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
immutable dchar[] ISO_8859_2 = [
	'\u00a0', 'Ą', '˘', 'Ł', '¤', 'Ľ', 'Ś', '§', // 0xA0
	'¨', 'Š', 'Ş', 'Ť', 'Ź', '\u00ad', 'Ž', 'Ż', // 0xA8
	'°', 'ą', '˛', 'ł', '´', 'ľ', 'ś', 'ˇ', // 0xB0
	'¸', 'š', 'ş', 'ť', 'ź', '˝', 'ž', 'ż', // 0xB8
	'Ŕ', 'Á', 'Â', 'Ă', 'Ä', 'Ĺ', 'Ć', 'Ç', // 0xC0
	'Č', 'É', 'Ę', 'Ë', 'Ě', 'Í', 'Î', 'Ď', // 0xC8
	'Đ', 'Ń', 'Ň', 'Ó', 'Ô', 'Ő', 'Ö', '×', // 0xD0
	'Ř', 'Ů', 'Ú', 'Ű', 'Ü', 'Ý', 'Ţ', 'ß', // 0xD8
	'ŕ', 'á', 'â', 'ă', 'ä', 'ĺ', 'ć', 'ç', // 0xE0
	'č', 'é', 'ę', 'ë', 'ě', 'í', 'î', 'ď', // 0xE8
	'đ', 'ń', 'ň', 'ó', 'ô', 'ő', 'ö', '÷', // 0xF0
	'ř', 'ů', 'ú', 'ű', 'ü', 'ý', 'ţ', '˙'];
258 
// ISO 8859-3 (Latin-3) characters for bytes 0xA0 through 0xFF.
// Bytes with no character hold a plain space. The two invisible characters
// are spelled as escapes so they cannot get lost in an editor: 0xA0 is the
// no-break space and 0xAD the soft hyphen.
immutable dchar[] ISO_8859_3 = [
	'\u00a0', 'Ħ', '˘', '£', '¤', ' ', 'Ĥ', '§', // 0xA0
	'¨', 'İ', 'Ş', 'Ğ', 'Ĵ', '\u00ad', ' ', 'Ż', // 0xA8
	'°', 'ħ', '²', '³', '´', 'µ', 'ĥ', '·', // 0xB0
	'¸', 'ı', 'ş', 'ğ', 'ĵ', '½', ' ', 'ż', // 0xB8
	'À', 'Á', 'Â', ' ', 'Ä', 'Ċ', 'Ĉ', 'Ç', // 0xC0
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8
	' ', 'Ñ', 'Ò', 'Ó', 'Ô', 'Ġ', 'Ö', '×', // 0xD0
	'Ĝ', 'Ù', 'Ú', 'Û', 'Ü', 'Ŭ', 'Ŝ', 'ß', // 0xD8
	'à', 'á', 'â', ' ', 'ä', 'ċ', 'ĉ', 'ç', // 0xE0
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8
	' ', 'ñ', 'ò', 'ó', 'ô', 'ġ', 'ö', '÷', // 0xF0
	'ĝ', 'ù', 'ú', 'û', 'ü', 'ŭ', 'ŝ', '˙'];
272 
// ISO 8859-4 (Latin-4) characters for bytes 0xA0 through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
immutable dchar[] ISO_8859_4 = [
	'\u00a0', 'Ą', 'ĸ', 'Ŗ', '¤', 'Ĩ', 'Ļ', '§', // 0xA0
	'¨', 'Š', 'Ē', 'Ģ', 'Ŧ', '\u00ad', 'Ž', '¯', // 0xA8
	'°', 'ą', '˛', 'ŗ', '´', 'ĩ', 'ļ', 'ˇ', // 0xB0
	'¸', 'š', 'ē', 'ģ', 'ŧ', 'Ŋ', 'ž', 'ŋ', // 0xB8
	'Ā', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Į', // 0xC0
	'Č', 'É', 'Ę', 'Ë', 'Ė', 'Í', 'Î', 'Ī', // 0xC8
	'Đ', 'Ņ', 'Ō', 'Ķ', 'Ô', 'Õ', 'Ö', '×', // 0xD0
	'Ø', 'Ų', 'Ú', 'Û', 'Ü', 'Ũ', 'Ū', 'ß', // 0xD8
	'ā', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'į', // 0xE0
	'č', 'é', 'ę', 'ë', 'ė', 'í', 'î', 'ī', // 0xE8
	'đ', 'ņ', 'ō', 'ķ', 'ô', 'õ', 'ö', '÷', // 0xF0
	'ø', 'ų', 'ú', 'û', 'ü', 'ũ', 'ū', '˙'];
286 
// ISO 8859-5 (Cyrillic) characters for bytes 0xA0 through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
immutable dchar[] ISO_8859_5 = [
	'\u00a0', 'Ё', 'Ђ', 'Ѓ', 'Є', 'Ѕ', 'І', 'Ї', // 0xA0
	'Ј', 'Љ', 'Њ', 'Ћ', 'Ќ', '\u00ad', 'Ў', 'Џ', // 0xA8
	'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', // 0xB0
	'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', // 0xB8
	'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', // 0xC0
	'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я', // 0xC8
	'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', // 0xD0
	'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', // 0xD8
	'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', // 0xE0
	'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', // 0xE8
	'№', 'ё', 'ђ', 'ѓ', 'є', 'ѕ', 'і', 'ї', // 0xF0
	'ј', 'љ', 'њ', 'ћ', 'ќ', '§', 'ў', 'џ'];
300 
// ISO 8859-6 (Arabic) characters for bytes 0xA0 through 0xFF.
// Bytes with no character hold a plain space. The Arabic characters are
// written as escapes: several of them are combining marks that are hard to
// read inside quotes, and the right-to-left text scrambles the row layout
// in most editors. 0xA0 is the no-break space and 0xAD the soft hyphen.
immutable dchar[] ISO_8859_6 = [
	'\u00a0', ' ', ' ', ' ', '¤', ' ', ' ', ' ', // 0xA0
	' ', ' ', ' ', ' ', '\u060c', '\u00ad', ' ', ' ', // 0xA8 (0xAC is the arabic comma)
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 0xB0
	' ', ' ', ' ', '\u061b', ' ', ' ', ' ', '\u061f', // 0xB8 (arabic semicolon, question mark)
	' ', '\u0621', '\u0622', '\u0623', '\u0624', '\u0625', '\u0626', '\u0627', // 0xC0
	'\u0628', '\u0629', '\u062a', '\u062b', '\u062c', '\u062d', '\u062e', '\u062f', // 0xC8
	'\u0630', '\u0631', '\u0632', '\u0633', '\u0634', '\u0635', '\u0636', '\u0637', // 0xD0
	'\u0638', '\u0639', '\u063a', ' ', ' ', ' ', ' ', ' ', // 0xD8
	'\u0640', '\u0641', '\u0642', '\u0643', '\u0644', '\u0645', '\u0646', '\u0647', // 0xE0
	'\u0648', '\u0649', '\u064a', '\u064b', '\u064c', '\u064d', '\u064e', '\u064f', // 0xE8
	'\u0650', '\u0651', '\u0652', ' ', ' ', ' ', ' ', ' ', // 0xF0
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '];
314 
// ISO 8859-7 (Greek) characters for bytes 0xA0 through 0xFF.
// Bytes with no character hold a plain space. The two invisible characters
// are spelled as escapes so they cannot get lost in an editor: 0xA0 is the
// no-break space and 0xAD the soft hyphen.
immutable dchar[] ISO_8859_7 = [
	'\u00a0', '‘', '’', '£', '€', '₯', '¦', '§', // 0xA0
	'¨', '©', 'ͺ', '«', '¬', '\u00ad', ' ', '―', // 0xA8
	'°', '±', '²', '³', '΄', '΅', 'Ά', '·', // 0xB0
	'Έ', 'Ή', 'Ί', '»', 'Ό', '½', 'Ύ', 'Ώ', // 0xB8
	'ΐ', 'Α', 'Β', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η', // 0xC0
	'Θ', 'Ι', 'Κ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο', // 0xC8
	'Π', 'Ρ', ' ', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ', // 0xD0
	'Ψ', 'Ω', 'Ϊ', 'Ϋ', 'ά', 'έ', 'ή', 'ί', // 0xD8
	'ΰ', 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', // 0xE0
	'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', // 0xE8
	'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', // 0xF0
	'ψ', 'ω', 'ϊ', 'ϋ', 'ό', 'ύ', 'ώ', ' '];
328 
// ISO 8859-8 (Hebrew) characters for bytes 0xA0 through 0xFF.
// Bytes with no character hold a plain space. Invisible characters are
// spelled as escapes so they cannot get lost in an editor: 0xA0 is the
// no-break space, 0xAD the soft hyphen, 0xFD the left-to-right mark and
// 0xFE the right-to-left mark.
immutable dchar[] ISO_8859_8 = [
	'\u00a0', ' ', '¢', '£', '¤', '¥', '¦', '§', // 0xA0
	'¨', '©', '×', '«', '¬', '\u00ad', '®', '¯', // 0xA8
	'°', '±', '²', '³', '´', 'µ', '¶', '·', // 0xB0
	'¸', '¹', '÷', '»', '¼', '½', '¾', ' ', // 0xB8
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 0xC0
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 0xC8
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 0xD0
	' ', ' ', ' ', ' ', ' ', ' ', ' ', '‗', // 0xD8
	'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח', // 0xE0
	'ט', 'י', 'ך', 'כ', 'ל', 'ם', 'מ', 'ן', // 0xE8
	'נ', 'ס', 'ע', 'ף', 'פ', 'ץ', 'צ', 'ק', // 0xF0
	'ר', 'ש', 'ת', ' ', ' ', '\u200e', '\u200f', ' '];
343 
// ISO 8859-9 (Latin-5, Turkish) characters for bytes 0xA0 through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
immutable dchar[] ISO_8859_9 = [
	'\u00a0', '¡', '¢', '£', '¤', '¥', '¦', '§', // 0xA0
	'¨', '©', 'ª', '«', '¬', '\u00ad', '®', '¯', // 0xA8
	'°', '±', '²', '³', '´', 'µ', '¶', '·', // 0xB0
	'¸', '¹', 'º', '»', '¼', '½', '¾', '¿', // 0xB8
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', // 0xC0
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8
	'Ğ', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', // 0xD0
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'İ', 'Ş', 'ß', // 0xD8
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', // 0xE0
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8
	'ğ', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', // 0xF0
	'ø', 'ù', 'ú', 'û', 'ü', 'ı', 'ş', 'ÿ'];
357 
// ISO 8859-10 (Latin-6, Nordic) characters for bytes 0xA0 through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
immutable dchar[] ISO_8859_10 = [
	'\u00a0', 'Ą', 'Ē', 'Ģ', 'Ī', 'Ĩ', 'Ķ', '§', // 0xA0
	'Ļ', 'Đ', 'Š', 'Ŧ', 'Ž', '\u00ad', 'Ū', 'Ŋ', // 0xA8
	'°', 'ą', 'ē', 'ģ', 'ī', 'ĩ', 'ķ', '·', // 0xB0
	'ļ', 'đ', 'š', 'ŧ', 'ž', '―', 'ū', 'ŋ', // 0xB8
	'Ā', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Į', // 0xC0
	'Č', 'É', 'Ę', 'Ë', 'Ė', 'Í', 'Î', 'Ï', // 0xC8
	'Ð', 'Ņ', 'Ō', 'Ó', 'Ô', 'Õ', 'Ö', 'Ũ', // 0xD0
	'Ø', 'Ų', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', // 0xD8
	'ā', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'į', // 0xE0
	'č', 'é', 'ę', 'ë', 'ė', 'í', 'î', 'ï', // 0xE8
	'ð', 'ņ', 'ō', 'ó', 'ô', 'õ', 'ö', 'ũ', // 0xF0
	'ø', 'ų', 'ú', 'û', 'ü', 'ý', 'þ', 'ĸ'];
371 
// ISO 8859-11 (Thai) characters for bytes 0xA0 through 0xFF.
// Bytes with no character hold a plain space. 0xA0 is the no-break space.
// The Thai characters are written as escapes because many of them are
// combining vowel and tone marks that are hard to read inside quotes:
// bytes 0xA1 to 0xDA are U+0E01 to U+0E3A, 0xDF is U+0E3F (baht sign),
// and bytes 0xE0 to 0xFB are U+0E40 to U+0E5B.
immutable dchar[] ISO_8859_11 = [
	'\u00a0', '\u0e01', '\u0e02', '\u0e03', '\u0e04', '\u0e05', '\u0e06', '\u0e07', // 0xA0
	'\u0e08', '\u0e09', '\u0e0a', '\u0e0b', '\u0e0c', '\u0e0d', '\u0e0e', '\u0e0f', // 0xA8
	'\u0e10', '\u0e11', '\u0e12', '\u0e13', '\u0e14', '\u0e15', '\u0e16', '\u0e17', // 0xB0
	'\u0e18', '\u0e19', '\u0e1a', '\u0e1b', '\u0e1c', '\u0e1d', '\u0e1e', '\u0e1f', // 0xB8
	'\u0e20', '\u0e21', '\u0e22', '\u0e23', '\u0e24', '\u0e25', '\u0e26', '\u0e27', // 0xC0
	'\u0e28', '\u0e29', '\u0e2a', '\u0e2b', '\u0e2c', '\u0e2d', '\u0e2e', '\u0e2f', // 0xC8
	'\u0e30', '\u0e31', '\u0e32', '\u0e33', '\u0e34', '\u0e35', '\u0e36', '\u0e37', // 0xD0
	'\u0e38', '\u0e39', '\u0e3a', ' ', ' ', ' ', ' ', '\u0e3f', // 0xD8
	'\u0e40', '\u0e41', '\u0e42', '\u0e43', '\u0e44', '\u0e45', '\u0e46', '\u0e47', // 0xE0
	'\u0e48', '\u0e49', '\u0e4a', '\u0e4b', '\u0e4c', '\u0e4d', '\u0e4e', '\u0e4f', // 0xE8
	'\u0e50', '\u0e51', '\u0e52', '\u0e53', '\u0e54', '\u0e55', '\u0e56', '\u0e57', // 0xF0
	'\u0e58', '\u0e59', '\u0e5a', '\u0e5b', ' ', ' ', ' ', ' '];
385 
// ISO 8859-13 (Latin-7, Baltic) characters for bytes 0xA0 through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
// 0xD5 and 0xF5 are O with tilde (as used in Estonian), not O with double
// acute; they are written as escapes because the two look alike.
immutable dchar[] ISO_8859_13 = [
	'\u00a0', '”', '¢', '£', '¤', '„', '¦', '§', // 0xA0
	'Ø', '©', 'Ŗ', '«', '¬', '\u00ad', '®', 'Æ', // 0xA8
	'°', '±', '²', '³', '“', 'µ', '¶', '·', // 0xB0
	'ø', '¹', 'ŗ', '»', '¼', '½', '¾', 'æ', // 0xB8
	'Ą', 'Į', 'Ā', 'Ć', 'Ä', 'Å', 'Ę', 'Ē', // 0xC0
	'Č', 'É', 'Ź', 'Ė', 'Ģ', 'Ķ', 'Ī', 'Ļ', // 0xC8
	'Š', 'Ń', 'Ņ', 'Ó', 'Ō', '\u00d5', 'Ö', '×', // 0xD0
	'Ų', 'Ł', 'Ś', 'Ū', 'Ü', 'Ż', 'Ž', 'ß', // 0xD8
	'ą', 'į', 'ā', 'ć', 'ä', 'å', 'ę', 'ē', // 0xE0
	'č', 'é', 'ź', 'ė', 'ģ', 'ķ', 'ī', 'ļ', // 0xE8
	'š', 'ń', 'ņ', 'ó', 'ō', '\u00f5', 'ö', '÷', // 0xF0
	'ų', 'ł', 'ś', 'ū', 'ü', 'ż', 'ž', '’'];
399 
// ISO 8859-14 (Latin-8, Celtic) characters for bytes 0xA0 through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
// 0xD5 and 0xF5 are O with tilde, not O with double acute; they are
// written as escapes because the two look alike.
immutable dchar[] ISO_8859_14 = [
	'\u00a0', 'Ḃ', 'ḃ', '£', 'Ċ', 'ċ', 'Ḋ', '§', // 0xA0
	'Ẁ', '©', 'Ẃ', 'ḋ', 'Ỳ', '\u00ad', '®', 'Ÿ', // 0xA8
	'Ḟ', 'ḟ', 'Ġ', 'ġ', 'Ṁ', 'ṁ', '¶', 'Ṗ', // 0xB0
	'ẁ', 'ṗ', 'ẃ', 'Ṡ', 'ỳ', 'Ẅ', 'ẅ', 'ṡ', // 0xB8
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', // 0xC0
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8
	'Ŵ', 'Ñ', 'Ò', 'Ó', 'Ô', '\u00d5', 'Ö', 'Ṫ', // 0xD0
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Ŷ', 'ß', // 0xD8
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', // 0xE0
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8
	'ŵ', 'ñ', 'ò', 'ó', 'ô', '\u00f5', 'ö', 'ṫ', // 0xF0
	'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ŷ', 'ÿ'];
413 
// ISO 8859-15 (Latin-9) characters for bytes 0xA0 through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
// 0xD5 and 0xF5 are O with tilde, same as in ISO 8859-1, not O with double
// acute; they are written as escapes because the two look alike.
immutable dchar[] ISO_8859_15 = [
	'\u00a0', '¡', '¢', '£', '€', '¥', 'Š', '§', // 0xA0
	'š', '©', 'ª', '«', '¬', '\u00ad', '®', '¯', // 0xA8
	'°', '±', '²', '³', 'Ž', 'µ', '¶', '·', // 0xB0
	'ž', '¹', 'º', '»', 'Œ', 'œ', 'Ÿ', '¿', // 0xB8
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', // 0xC0
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8
	'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', '\u00d5', 'Ö', '×', // 0xD0
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', // 0xD8
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', // 0xE0
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8
	'ð', 'ñ', 'ò', 'ó', 'ô', '\u00f5', 'ö', '÷', // 0xF0
	'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ'];
427 
// ISO 8859-16 (Latin-10, South-Eastern European) characters for bytes 0xA0
// through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
// 0xD0 is D with stroke (U+0110), the capital of the U+0111 at 0xF0, not
// the look-alike Eth (U+00D0) of ISO 8859-1; both are written as escapes.
immutable dchar[] ISO_8859_16 = [
	'\u00a0', 'Ą', 'ą', 'Ł', '€', '„', 'Š', '§', // 0xA0
	'š', '©', 'Ș', '«', 'Ź', '\u00ad', 'ź', 'Ż', // 0xA8
	'°', '±', 'Č', 'ł', 'Ž', '”', '¶', '·', // 0xB0
	'ž', 'č', 'ș', '»', 'Œ', 'œ', 'Ÿ', 'ż', // 0xB8
	'À', 'Á', 'Â', 'Ă', 'Ä', 'Ć', 'Æ', 'Ç', // 0xC0
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8
	'\u0110', 'Ń', 'Ò', 'Ó', 'Ô', 'Ő', 'Ö', 'Ś', // 0xD0
	'Ű', 'Ù', 'Ú', 'Û', 'Ü', 'Ę', 'Ț', 'ß', // 0xD8
	'à', 'á', 'â', 'ă', 'ä', 'ć', 'æ', 'ç', // 0xE0
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8
	'\u0111', 'ń', 'ò', 'ó', 'ô', 'ő', 'ö', 'ś', // 0xF0
	'ű', 'ù', 'ú', 'û', 'ü', 'ę', 'ț', 'ÿ'];
441 
// KOI8-R characters for bytes 0x80 through 0x9F (decimal 128 to 159):
// box drawing and block elements plus a few symbols. 0x9A is the no-break
// space, written as an escape because it is invisible.
immutable dchar[] KOI8_R_Lower = [
	'─', '│', '┌', '┐', '└', '┘', '├', '┤', // 0x80
	'┬', '┴', '┼', '▀', '▄', '█', '▌', '▐', // 0x88
	'░', '▒', '▓', '⌠', '■', '∙', '√', '≈', // 0x90
	'≤', '≥', '\u00a0', '⌡', '°', '²', '·', '÷', // 0x98
];
447 
// KOI8-R characters for bytes 0xA0 through 0xFF.
// 0xA3 is the lowercase io and 0xB3 the uppercase one. Bytes 0xC0 to 0xDF
// are the lowercase Cyrillic letters and bytes 0xE0 to 0xFF are the
// uppercase letters in the same order.
immutable dchar[] KOI8_R = [
	'═', '║', '╒', 'ё', '╓', '╔', '╕', '╖', // 0xA0
	'╗', '╘', '╙', '╚', '╛', '╜', '╝', '╞', // 0xA8
	'╟', '╠', '╡', 'Ё', '╢', '╣', '╤', '╥', // 0xB0
	'╦', '╧', '╨', '╩', '╪', '╫', '╬', '©', // 0xB8
	'ю', 'а', 'б', 'ц', 'д', 'е', 'ф', 'г', // 0xC0
	'х', 'и', 'й', 'к', 'л', 'м', 'н', 'о', // 0xC8
	'п', 'я', 'р', 'с', 'т', 'у', 'ж', 'в', // 0xD0
	'ь', 'ы', 'з', 'ш', 'э', 'щ', 'ч', 'ъ', // 0xD8
	'Ю', 'А', 'Б', 'Ц', 'Д', 'Е', 'Ф', 'Г', // 0xE0
	'Х', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', // 0xE8
	'П', 'Я', 'Р', 'С', 'Т', 'У', 'Ж', 'В', // 0xF0
	'Ь', 'Ы', 'З', 'Ш', 'Э', 'Щ', 'Ч', 'Ъ'];
461 
// Windows-1251 characters for bytes 0x80 through 0x9F (decimal 128 to 159).
// 0x98 has no character and holds a plain space.
immutable dchar[] Windows_1251_Lower = [
	'Ђ', 'Ѓ', '‚', 'ѓ', '„', '…', '†', '‡', // 0x80
	'€', '‰', 'Љ', '‹', 'Њ', 'Ќ', 'Ћ', 'Џ', // 0x88
	'ђ', '‘', '’', '“', '”', '•', '–', '—', // 0x90
	' ', '™', 'љ', '›', 'њ', 'ќ', 'ћ', 'џ', // 0x98
];
467 
// Windows-1251 characters for bytes 0xA0 through 0xFF.
// The two invisible characters are spelled as escapes so they cannot get
// lost in an editor: 0xA0 is the no-break space and 0xAD the soft hyphen.
immutable dchar[] Windows_1251 = [
	'\u00a0', 'Ў', 'ў', 'Ј', '¤', 'Ґ', '¦', '§', // 0xA0
	'Ё', '©', 'Є', '«', '¬', '\u00ad', '®', 'Ї', // 0xA8
	'°', '±', 'І', 'і', 'ґ', 'µ', '¶', '·', // 0xB0
	'ё', '№', 'є', '»', 'ј', 'Ѕ', 'ѕ', 'ї', // 0xB8
	'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', // 0xC0
	'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', // 0xC8
	'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', // 0xD0
	'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я', // 0xD8
	'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', // 0xE0
	'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', // 0xE8
	'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', // 0xF0
	'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'];
481