arsd.uri source code

1 /++
2 	Future public interface to the Uri struct and encode/decode component functions.
3 
4 	History:
5 		Added May 26, 2025
6 +/
7 module arsd.uri;
8 
9 import arsd.core;
10 
11 import arsd.conv;
12 import arsd.string;
13 
/// Re-export of the uri component encoder that lives in [arsd.core].
alias encodeUriComponent = arsd.core.encodeUriComponent;
/// Phobos compatibility name for [encodeUriComponent].
alias encodeComponent = encodeUriComponent;

/// Re-export of the uri component decoder that lives in [arsd.core].
alias decodeUriComponent = arsd.core.decodeUriComponent;
/// Phobos compatibility name for [decodeUriComponent].
alias decodeComponent = decodeUriComponent;
20 
21 // FIXME: merge and pull Uri struct from http2 and cgi. maybe via core.
22 
23 // might also put base64 in here....
24 
25 
26 
27 /++
28 	Represents a URI. It offers named access to the components and relative uri resolution, though as a user of the library, you'd mostly just construct it like `Uri("http://example.com/index.html")`.
29 
30 	History:
31 		Moved from duplication in [arsd.cgi] and [arsd.http2] to arsd.uri on November 2, 2025.
32 +/
33 struct Uri {
34 	UriString toUriString() {
35 		return UriString(toString());
36 	}
37 
38 	alias toUriString this; // blargh idk a url really is a string, but should it be implicit?
39 
40 	// scheme://userinfo@host:port/path?query#fragment
41 
42 	string scheme; /// e.g. "http" in "http://example.com/"
43 	string userinfo; /// the username (and possibly a password) in the uri
44 	string host; /// the domain name. note it may be an ip address or have percent encoding too.
45 	int port; /// port number, if given. Will be zero if a port was not explicitly given
46 	string path; /// e.g. "/folder/file.html" in "http://example.com/folder/file.html"
47 	string query; /// the stuff after the ? in a uri
48 	string fragment; /// the stuff after the # in a uri.
49 
50 	// cgi.d specific.......
51 	// idk if i want to keep these, since the functions they wrap are used many, many, many times in existing code, so this is either an unnecessary alias or a gratuitous break of compatibility
52 	// the decode ones need to keep different names anyway because we can't overload on return values...
53 	static string encode(string s) { return encodeUriComponent(s); }
54 	static string encode(string[string] s) { return encodeVariables(s); }
55 	static string encode(string[][string] s) { return encodeVariables(s); }
56 
57 	/++
58 		Parses an existing uri string (which should be pre-validated) into this further detailed structure.
59 
60 		History:
61 			Added November 2, 2025.
62 	+/
63 	this(UriString uriString) {
64 		this(uriString.toString());
65 	}
66 
67 	/++
68 		Transforms an interpolated expression sequence into a uri, encoding as appropriate as it reads.
69 
70 		History:
71 			Added November 2, 2025.
72 	+/
73 	this(Args...)(InterpolationHeader header, Args args, InterpolationFooter footer) {
74 		// will need to use iraw here for some cases. paths may partially encoded but still allow slashes, prolly needs a type.
75 		// so like $(path(x)) or $(queryString(x)) or maybe isemi or something. or make user split it into a string[] then recombine here....
76 		string thing;
77 		foreach(arg; args) {
78 			static if(is(typeof(arg) == InterpolationHeader))
79 				{}
80 			else
81 			static if(is(typeof(arg) == InterpolationFooter))
82 				{}
83 			else
84 			static if(is(typeof(arg) == InterpolatedLiteral!part, string part))
85 				thing ~= part;
86 			else
87 			static if(is(typeof(arg) == InterpolatedExpression!code, string code))
88 				{}
89 			else
90 			static if(is(typeof(arg) == iraw))
91 				thing ~= iraw.s;
92 			else
93 				thing ~= encodeUriComponent(to!string(arg));
94 
95 		}
96 
97 		this(thing);
98 	}
99 
100 	unittest {
101 		string bar = "12/";
102 		string baz = "&omg";
103 		auto uri = Uri(i"http://example.com/foo/$bar?thing=$baz");
104 
105 		assert(uri.toString() == "http://example.com/foo/12%2F?thing=%26omg");
106 	}
107 
108 	/// Breaks down a uri string to its components
109 	this(string uri) {
110 		size_t lastGoodIndex;
111 		foreach(char ch; uri) {
112 			if(ch > 127) {
113 				break;
114 			}
115 			lastGoodIndex++;
116 		}
117 
118 		string replacement = uri[0 .. lastGoodIndex];
119 		foreach(char ch; uri[lastGoodIndex .. $]) {
120 			if(ch > 127) {
121 				// need to percent-encode any non-ascii in it
122 				char[3] buffer;
123 				buffer[0] = '%';
124 
125 				auto first = ch / 16;
126 				auto second = ch % 16;
127 				first += (first >= 10) ? ('A'-10) : '0';
128 				second += (second >= 10) ? ('A'-10) : '0';
129 
130 				buffer[1] = cast(char) first;
131 				buffer[2] = cast(char) second;
132 
133 				replacement ~= buffer[];
134 			} else {
135 				replacement ~= ch;
136 			}
137 		}
138 
139 		reparse(replacement);
140 	}
141 
142 	/// Returns `port` if set, otherwise if scheme is https 443, otherwise always 80
143 	int effectivePort() const @property nothrow pure @safe @nogc {
144 		return port != 0 ? port
145 			: scheme == "https" ? 443 : 80;
146 	}
147 
148 	package string unixSocketPath = null;
149 	/// Indicates it should be accessed through a unix socket instead of regular tcp. Returns new version without modifying this object.
150 	Uri viaUnixSocket(string path) const {
151 		Uri copy = this;
152 		copy.unixSocketPath = path;
153 		return copy;
154 	}
155 
156 	/// Goes through a unix socket in the abstract namespace (linux only). Returns new version without modifying this object.
157 	version(linux)
158 	Uri viaAbstractSocket(string path) const {
159 		Uri copy = this;
160 		copy.unixSocketPath = "\0" ~ path;
161 		return copy;
162 	}
163 
164 	// these are like javascript's location.search and location.hash
165 	string search() const {
166 		return query.length ? ("?" ~ query) : "";
167 	}
168 	string hash() const {
169 		return fragment.length ? ("#" ~ fragment) : "";
170 	}
171 
172 
173 	private void reparse(string uri) {
174 		// from RFC 3986
175 		// the ctRegex triples the compile time and makes ugly errors for no real benefit
176 		// it was a nice experiment but just not worth it.
177 		// enum ctr = ctRegex!r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?";
178 		/*
179 			Captures:
180 				0 = whole url
181 				1 = scheme, with :
182 				2 = scheme, no :
183 				3 = authority, with //
184 				4 = authority, no //
185 				5 = path
186 				6 = query string, with ?
187 				7 = query string, no ?
188 				8 = anchor, with #
189 				9 = anchor, no #
190 		*/
191 		// Yikes, even regular, non-CT regex is also unacceptably slow to compile. 1.9s on my computer!
192 		// instead, I will DIY and cut that down to 0.6s on the same computer.
193 		/*
194 
195 				Note that authority is
196 					user:password@domain:port
197 				where the user:password@ part is optional, and the :port is optional.
198 
199 				Regex translation:
200 
201 				Scheme cannot have :, /, ?, or # in it, and must have one or more chars and end in a :. It is optional, but must be first.
202 				Authority must start with //, but cannot have any other /, ?, or # in it. It is optional.
203 				Path cannot have any ? or # in it. It is optional.
204 				Query must start with ? and must not have # in it. It is optional.
205 				Anchor must start with # and can have anything else in it to end of string. It is optional.
206 		*/
207 
208 		this = Uri.init; // reset all state
209 
210 		// empty uri = nothing special
211 		if(uri.length == 0) {
212 			return;
213 		}
214 
215 		size_t idx;
216 
217 		scheme_loop: foreach(char c; uri[idx .. $]) {
218 			switch(c) {
219 				case ':':
220 				case '/':
221 				case '?':
222 				case '#':
223 					break scheme_loop;
224 				default:
225 			}
226 			idx++;
227 		}
228 
229 		if(idx == 0 && uri[idx] == ':') {
230 			// this is actually a path! we skip way ahead
231 			goto path_loop;
232 		}
233 
234 		if(idx == uri.length) {
235 			// the whole thing is a path, apparently
236 			path = uri;
237 			return;
238 		}
239 
240 		if(idx > 0 && uri[idx] == ':') {
241 			scheme = uri[0 .. idx];
242 			idx++;
243 		} else {
244 			// we need to rewind; it found a / but no :, so the whole thing is prolly a path...
245 			idx = 0;
246 		}
247 
248 		if(idx + 2 < uri.length && uri[idx .. idx + 2] == "//") {
249 			// we have an authority....
250 			idx += 2;
251 
252 			auto authority_start = idx;
253 			authority_loop: foreach(char c; uri[idx .. $]) {
254 				switch(c) {
255 					case '/':
256 					case '?':
257 					case '#':
258 						break authority_loop;
259 					default:
260 				}
261 				idx++;
262 			}
263 
264 			auto authority = uri[authority_start .. idx];
265 
266 			auto idx2 = authority.indexOf("@");
267 			if(idx2 != -1) {
268 				userinfo = authority[0 .. idx2];
269 				authority = authority[idx2 + 1 .. $];
270 			}
271 
272 			if(authority.length && authority[0] == '[') {
273 				// ipv6 address special casing
274 				idx2 = authority.indexOf("]");
275 				if(idx2 != -1) {
276 					auto end = authority[idx2 + 1 .. $];
277 					if(end.length && end[0] == ':')
278 						idx2 = idx2 + 1;
279 					else
280 						idx2 = -1;
281 				}
282 			} else {
283 				idx2 = authority.indexOf(":");
284 			}
285 
286 			if(idx2 == -1) {
287 				port = 0; // 0 means not specified; we should use the default for the scheme
288 				host = authority;
289 			} else {
290 				host = authority[0 .. idx2];
291 				if(idx2 + 1 < authority.length)
292 					port = to!int(authority[idx2 + 1 .. $]);
293 				else
294 					port = 0;
295 			}
296 		}
297 
298 		path_loop:
299 		auto path_start = idx;
300 
301 		foreach(char c; uri[idx .. $]) {
302 			if(c == '?' || c == '#')
303 				break;
304 			idx++;
305 		}
306 
307 		path = uri[path_start .. idx];
308 
309 		if(idx == uri.length)
310 			return; // nothing more to examine...
311 
312 		if(uri[idx] == '?') {
313 			idx++;
314 			auto query_start = idx;
315 			foreach(char c; uri[idx .. $]) {
316 				if(c == '#')
317 					break;
318 				idx++;
319 			}
320 			query = uri[query_start .. idx];
321 		}
322 
323 		if(idx < uri.length && uri[idx] == '#') {
324 			idx++;
325 			fragment = uri[idx .. $];
326 		}
327 
328 		// uriInvalidated = false;
329 	}
330 
331 	private string rebuildUri() const {
332 		string ret;
333 		if(scheme.length)
334 			ret ~= scheme ~ ":";
335 		if(userinfo.length || host.length)
336 			ret ~= "//";
337 		if(userinfo.length)
338 			ret ~= userinfo ~ "@";
339 		if(host.length)
340 			ret ~= host;
341 		if(port)
342 			ret ~= ":" ~ to!string(port);
343 
344 		ret ~= path;
345 
346 		if(query.length)
347 			ret ~= "?" ~ query;
348 
349 		if(fragment.length)
350 			ret ~= "#" ~ fragment;
351 
352 		// uri = ret;
353 		// uriInvalidated = false;
354 		return ret;
355 	}
356 
357 	/// Converts the broken down parts back into a complete string
358 	string toString() const {
359 		// if(uriInvalidated)
360 			return rebuildUri();
361 	}
362 
363 	/// Returns a new absolute Uri given a base. It treats this one as
364 	/// relative where possible, but absolute if not. (If protocol, domain, or
365 	/// other info is not set, the new one inherits it from the base.)
366 	///
367 	/// Browsers use a function like this to figure out links in html.
368 	Uri basedOn(in Uri baseUrl) const {
369 		Uri n = this; // copies
370 		if(n.scheme == "data")
371 			return n;
372 		// n.uriInvalidated = true; // make sure we regenerate...
373 
374 		// userinfo is not inherited... is this wrong?
375 
376 		// if anything is given in the existing url, we don't use the base anymore.
377 		if(n.scheme.length == 0) {
378 			n.scheme = baseUrl.scheme;
379 			if(n.host.length == 0) {
380 				n.host = baseUrl.host;
381 				if(n.port == 0) {
382 					n.port = baseUrl.port;
383 					if(n.path.length > 0 && n.path[0] != '/') {
384 						auto b = baseUrl.path[0 .. baseUrl.path.lastIndexOf("/") + 1];
385 						if(b.length == 0)
386 							b = "/";
387 						n.path = b ~ n.path;
388 					} else if(n.path.length == 0) {
389 						n.path = baseUrl.path;
390 					}
391 				}
392 			}
393 		}
394 
395 		n.removeDots();
396 
397 		// if still basically talking to the same thing, we should inherit the unix path
398 		// too since basically the unix path is saying for this service, always use this override.
399 		if(n.host == baseUrl.host && n.scheme == baseUrl.scheme && n.port == baseUrl.port)
400 			n.unixSocketPath = baseUrl.unixSocketPath;
401 
402 		return n;
403 	}
404 
405 	/++
406 		Resolves ../ and ./ parts of the path. Used in the implementation of [basedOn] and you could also use it to normalize things.
407 	+/
408 	void removeDots() {
409 		auto parts = this.path.split("/");
410 		string[] toKeep;
411 		foreach(part; parts) {
412 			if(part == ".") {
413 				continue;
414 			} else if(part == "..") {
415 				//if(toKeep.length > 1)
416 					toKeep = toKeep[0 .. $-1];
417 				//else
418 					//toKeep = [""];
419 				continue;
420 			} else {
421 				//if(toKeep.length && toKeep[$-1].length == 0 && part.length == 0)
422 					//continue; // skip a `//` situation
423 				toKeep ~= part;
424 			}
425 		}
426 
427 		auto path = toKeep.join("/");
428 		if(path.length && path[0] != '/')
429 			path = "/" ~ path;
430 
431 		this.path = path;
432 	}
433 
434 	unittest {
435 		auto uri = Uri("test.html");
436 		assert(uri.path == "test.html");
437 		uri = Uri("path/1/lol");
438 		assert(uri.path == "path/1/lol");
439 		uri = Uri("http://me@example.com");
440 		assert(uri.scheme == "http");
441 		assert(uri.userinfo == "me");
442 		assert(uri.host == "example.com");
443 		uri = Uri("http://example.com/#a");
444 		assert(uri.scheme == "http");
445 		assert(uri.host == "example.com");
446 		assert(uri.fragment == "a");
447 		uri = Uri("#foo");
448 		assert(uri.fragment == "foo");
449 		uri = Uri("?lol");
450 		assert(uri.query == "lol");
451 		uri = Uri("#foo?lol");
452 		assert(uri.fragment == "foo?lol");
453 		uri = Uri("?lol#foo");
454 		assert(uri.fragment == "foo");
455 		assert(uri.query == "lol");
456 
457 		uri = Uri("http://127.0.0.1/");
458 		assert(uri.host == "127.0.0.1");
459 		assert(uri.port == 0);
460 
461 		uri = Uri("http://127.0.0.1:123/");
462 		assert(uri.host == "127.0.0.1");
463 		assert(uri.port == 123);
464 
465 		uri = Uri("http://[ff:ff::0]/");
466 		assert(uri.host == "[ff:ff::0]");
467 
468 		uri = Uri("http://[ff:ff::0]:123/");
469 		assert(uri.host == "[ff:ff::0]");
470 		assert(uri.port == 123);
471 	}
472 
473 	// This can sometimes be a big pain in the butt for me, so lots of copy/paste here to cover
474 	// the possibilities.
475 	unittest {
476 		auto url = Uri("cool.html"); // checking relative links
477 
478 		assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/what/cool.html");
479 		assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/what/cool.html");
480 		assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/what/cool.html");
481 		assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/cool.html");
482 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/cool.html");
483 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/what/cool.html");
484 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/what/cool.html");
485 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/what/cool.html");
486 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/cool.html");
487 
488 		url = Uri("/something/cool.html"); // same server, different path
489 		assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/something/cool.html");
490 		assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/something/cool.html");
491 		assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/something/cool.html");
492 		assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/something/cool.html");
493 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/something/cool.html");
494 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/something/cool.html");
495 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/something/cool.html");
496 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/something/cool.html");
497 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/something/cool.html");
498 
499 		url = Uri("?query=answer"); // same path. server, protocol, and port, just different query string and fragment
500 		assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/what/test.html?query=answer");
501 		assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/what/test.html?query=answer");
502 		assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/what/?query=answer");
503 		assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/?query=answer");
504 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com?query=answer");
505 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/what/test.html?query=answer");
506 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/what/test.html?query=answer");
507 		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/what/test.html?query=answer");
508 		assert(url.basedOn(Uri("http://test.com")) == "http://test.com?query=answer");
509 
510 		url = Uri("/test/bar");
511 		assert(Uri("./").basedOn(url) == "/test/", Uri("./").basedOn(url));
512 		assert(Uri("../").basedOn(url) == "/");
513 
514 		url = Uri("http://example.com/");
515 		assert(Uri("../foo").basedOn(url) == "http://example.com/foo");
516 
517 		//auto uriBefore = url;
518 		url = Uri("#anchor"); // everything should remain the same except the anchor
519 		//uriBefore.anchor = "anchor");
520 		//assert(url == uriBefore);
521 
522 		url = Uri("//example.com"); // same protocol, but different server. the path here should be blank.
523 
524 		url = Uri("//example.com/example.html"); // same protocol, but different server and path
525 
526 		url = Uri("http://example.com/test.html"); // completely absolute link should never be modified
527 
528 		url = Uri("http://example.com"); // completely absolute link should never be modified, even if it has no path
529 
530 		// FIXME: add something for port too
531 	}
532 }
533 
/// Makes a `data:` uri that can be used as links in most newer browsers (IE8+).
/// The result is "data:", the mime type, ";base64," and then `data` encoded as base64.
string makeDataUrl()(string mimeType, in void[] data) {
	import std.base64; // FIXME then i can remove the () template
	const(ubyte)[] bytes = cast(const(ubyte)[]) data;
	string encoded = cast(string) Base64.encode(bytes);
	return "data:" ~ mimeType ~ ";base64," ~ encoded;
}
540 
/++
	Breaks down a url encoded string into its variables.

	`data` is split on `separator`. Each piece is split at its first `=` into a name and a value; a piece without `=` is a name whose value is the empty string. In both names and values, `+` is replaced by a space before the text goes through [decodeUriComponent].

	Params:
		data = the encoded text, e.g. `a=1&b=2`
		separator = the text between consecutive variables
		namesInOrder = if not null, each decoded name is appended here in the order encountered
		valuesInOrder = if not null, each decoded value is appended here in the order encountered

	Returns:
		every value seen for each name, in order of appearance
+/
string[][string] decodeVariables(string data, string separator = "&", string[]* namesInOrder = null, string[]* valuesInOrder = null) {
	string[][string] _get;
	foreach(var; data.split(separator)) {
		auto equal = var.indexOf("=");
		string name;
		string value;
		if(equal == -1) {
			// a bare name gets the same + -> space treatment as a name followed by =
			name = decodeUriComponent(var.replace("+", " "));
			value = "";
		} else {
			// stupid + -> space conversion.
			name = decodeUriComponent(var[0..equal].replace("+", " "));
			value = decodeUriComponent(var[equal + 1 .. $].replace("+", " "));
		}

		_get[name] ~= value;
		if(namesInOrder)
			(*namesInOrder) ~= name;
		if(valuesInOrder)
			(*valuesInOrder) ~= value;
	}
	return _get;
}
567 
/// Breaks down a url encoded string like [decodeVariables], but keeps only the last value given for each name.
string[string] decodeVariablesSingle(string data) {
	string[string] lastValues;
	foreach(name, values; decodeVariables(data))
		lastValues[name] = values[$ - 1];
	return lastValues;
}
577 
578 
/// Url encodes a whole set of variables as `name=value` pairs joined with `&`.
/// Names and values each go through [encodeUriComponent].
string encodeVariables(in string[string] data) {
	string ret;

	foreach(k, v; data) {
		// every pair contributes at least its "=", so a non-empty result means a pair was already written
		if(ret.length)
			ret ~= "&";

		ret ~= encodeUriComponent(k);
		ret ~= "=";
		ret ~= encodeUriComponent(v);
	}

	return ret;
}
595 
/// Url encodes a whole set of variables where each name can carry several values.
/// Every value produces its own `name=value` pair; pairs are joined with `&`.
string encodeVariables(in string[][string] data) {
	string ret;

	foreach(k, arr; data) {
		foreach(v; arr) {
			// every pair contributes at least its "=", so a non-empty result means a pair was already written
			if(ret.length)
				ret ~= "&";

			ret ~= encodeUriComponent(k);
			ret ~= "=";
			ret ~= encodeUriComponent(v);
		}
	}

	return ret;
}
613 
/// Encodes all but the explicitly unreserved characters per rfc 3986.
/// Ascii letters, digits and -_.~ are copied as they are; every other octet
/// becomes `%` followed by whatever [toHexUpper] returns for it.
/// The name is borrowed from php.
string rawurlencode(in char[] data) {
	string ret;
	ret.reserve(data.length * 2);

	// iterating by char walks the individual octets of the utf8 text
	foreach(char c; data) {
		switch(c) {
			case 'a': .. case 'z':
			case 'A': .. case 'Z':
			case '0': .. case '9':
			case '-', '_', '.', '~':
				ret ~= c;
				break;
			default:
				ret ~= '%';
				ret ~= toHexUpper(c);
		}
	}

	return ret;
}
637 
638 
/// Formats a byte as exactly two upper case hexadecimal digits, high nibble first.
/// For example, `0xAB` gives "AB" and `0x09` gives "09".
char[2] toHexUpper(ubyte num) {
	// a lookup table avoids the 'A' versus 'A' - 10 offset arithmetic entirely
	static immutable char[16] digits = "0123456789ABCDEF";

	char[2] ret;
	ret[0] = digits[num >> 4];
	ret[1] = digits[num & 0x0F];
	return ret;
}
647 
648