/++
	Future public interface to the Uri struct and encode/decode component functions.

	History:
		Added May 26, 2025
+/
module arsd.uri;

import arsd.core;

import arsd.conv;
import arsd.string;

alias encodeUriComponent = arsd.core.encodeUriComponent;
alias decodeUriComponent = arsd.core.decodeUriComponent;

// phobos compatibility names
alias encodeComponent = encodeUriComponent;
alias decodeComponent = decodeUriComponent;

// FIXME: merge and pull Uri struct from http2 and cgi. maybe via core.

// might also put base64 in here....



/++
	Represents a URI. It offers named access to the components and relative uri resolution, though as a user of the library, you'd mostly just construct it like `Uri("http://example.com/index.html")`.

	History:
		Moved from duplication in [arsd.cgi] and [arsd.http2] to arsd.uri on November 2, 2025.
+/
struct Uri {
	/// Rebuilds the uri from its components and wraps the result in a `UriString`.
	UriString toUriString() {
		return UriString(toString());
	}

	alias toUriString this; // blargh idk a url really is a string, but should it be implicit?

	// scheme://userinfo@host:port/path?query#fragment

	string scheme; /// e.g. "http" in "http://example.com/"
	string userinfo; /// the username (and possibly a password) in the uri
	string host; /// the domain name. note it may be an ip address or have percent encoding too.
	int port; /// port number, if given. Will be zero if a port was not explicitly given
	string path; /// e.g. "/folder/file.html" in "http://example.com/folder/file.html"
	string query; /// the stuff after the ? in a uri
	string fragment; /// the stuff after the # in a uri.

	// cgi.d specific.......
	// idk if i want to keep these, since the functions they wrap are used many, many, many times in existing code, so this is either an unnecessary alias or a gratuitous break of compatibility
	// the decode ones need to keep different names anyway because we can't overload on return values...
	static string encode(string s) { return encodeUriComponent(s); }
	static string encode(string[string] s) { return encodeVariables(s); }
	static string encode(string[][string] s) { return encodeVariables(s); }

	/++
		Parses an existing uri string (which should be pre-validated) into this further detailed structure.

		History:
			Added November 2, 2025.
	+/
	this(UriString uriString) {
		this(uriString.toString());
	}

	/++
		Transforms an interpolated expression sequence into a uri, encoding as appropriate as it reads.

		Literal parts of the sequence are appended unchanged, `iraw` values are appended without
		encoding, and every other interpolated value is converted to a string and passed through
		[encodeUriComponent]. The assembled string is then parsed by the string constructor.

		History:
			Added November 2, 2025.
	+/
	this(Args...)(InterpolationHeader header, Args args, InterpolationFooter footer) {
		// will need to use iraw here for some cases. paths may partially encoded but still allow slashes, prolly needs a type.
		// so like $(path(x)) or $(queryString(x)) or maybe isemi or something. or make user split it into a string[] then recombine here....
		string thing;
		foreach(arg; args) {
			static if(is(typeof(arg) == InterpolationHeader))
				{}
			else
			static if(is(typeof(arg) == InterpolationFooter))
				{}
			else
			static if(is(typeof(arg) == InterpolatedLiteral!part, string part))
				thing ~= part;
			else
			static if(is(typeof(arg) == InterpolatedExpression!code, string code))
				{}
			else
			static if(is(typeof(arg) == iraw))
				// read the payload from the value we were handed; `iraw.s` would be a
				// member access through the type and cannot compile
				thing ~= arg.s;
			else
				thing ~= encodeUriComponent(to!string(arg));

		}

		this(thing);
	}

	unittest {
		string bar = "12/";
		string baz = "&omg";
		// uses the `$(expr)` form, which is the interpolation syntax in the D language specification
		auto uri = Uri(i"http://example.com/foo/$(bar)?thing=$(baz)");

		assert(uri.toString() == "http://example.com/foo/12%2F?thing=%26omg");
	}

	/// Breaks down a uri string to its components. Any non-ascii bytes in the input are percent-encoded before parsing.
	this(string uri) {
		// find the ascii-only prefix so it can be reused without copying byte by byte
		size_t lastGoodIndex;
		foreach(char ch; uri) {
			if(ch > 127) {
				break;
			}
			lastGoodIndex++;
		}

		string replacement = uri[0 .. lastGoodIndex];
		foreach(char ch; uri[lastGoodIndex .. $]) {
			if(ch > 127) {
				// need to percent-encode any non-ascii in it
				char[3] buffer;
				buffer[0] = '%';

				auto first = ch / 16;
				auto second = ch % 16;
				first += (first >= 10) ? ('A'-10) : '0';
				second += (second >= 10) ? ('A'-10) : '0';

				buffer[1] = cast(char) first;
				buffer[2] = cast(char) second;

				replacement ~= buffer[];
			} else {
				replacement ~= ch;
			}
		}

		reparse(replacement);
	}

	/// Returns `port` if set, otherwise if scheme is https 443, otherwise always 80
	int effectivePort() const @property nothrow pure @safe @nogc {
		return port != 0 ? port
			: scheme == "https" ? 443 : 80;
	}

	package string unixSocketPath = null;
	/// Indicates it should be accessed through a unix socket instead of regular tcp. Returns new version without modifying this object.
	Uri viaUnixSocket(string path) const {
		Uri copy = this;
		copy.unixSocketPath = path;
		return copy;
	}

	/// Goes through a unix socket in the abstract namespace (linux only). Returns new version without modifying this object.
	version(linux)
	Uri viaAbstractSocket(string path) const {
		Uri copy = this;
		// the stored path is prefixed with a zero byte to mark the abstract namespace
		copy.unixSocketPath = "\0" ~ path;
		return copy;
	}

	// these are like javascript's location.search and location.hash:
	// the component with its leading delimiter, or an empty string if the component is empty
	string search() const {
		return query.length ? ("?" ~ query) : "";
	}
	string hash() const {
		return fragment.length ? ("#" ~ fragment) : "";
	}


	// Resets this object and fills the components in from `uri`.
	private void reparse(string uri) {
		// from RFC 3986
		// the ctRegex triples the compile time and makes ugly errors for no real benefit
		// it was a nice experiment but just not worth it.
		// enum ctr = ctRegex!r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?";
		/*
			Captures:
				0 = whole url
				1 = scheme, with :
				2 = scheme, no :
				3 = authority, with //
				4 = authority, no //
				5 = path
				6 = query string, with ?
				7 = query string, no ?
				8 = anchor, with #
				9 = anchor, no #
		*/
		// Yikes, even regular, non-CT regex is also unacceptably slow to compile. 1.9s on my computer!
		// instead, I will DIY and cut that down to 0.6s on the same computer.
		/*

			Note that authority is
				user:password@domain:port
			where the user:password@ part is optional, and the :port is optional.

			Regex translation:

			Scheme cannot have :, /, ?, or # in it, and must have one or more chars and end in a :. It is optional, but must be first.
			Authority must start with //, but cannot have any other /, ?, or # in it. It is optional.
			Path cannot have any ? or # in it. It is optional.
			Query must start with ? and must not have # in it. It is optional.
			Anchor must start with # and can have anything else in it to end of string. It is optional.
		*/

		this = Uri.init; // reset all state

		// empty uri = nothing special
		if(uri.length == 0) {
			return;
		}

		size_t idx;

		// advance to the first delimiter that could end a scheme
		scheme_loop: foreach(char c; uri[idx .. $]) {
			switch(c) {
				case ':':
				case '/':
				case '?':
				case '#':
					break scheme_loop;
				default:
			}
			idx++;
		}

		if(idx == 0 && uri[idx] == ':') {
			// this is actually a path! we skip way ahead
			goto path_loop;
		}

		if(idx == uri.length) {
			// the whole thing is a path, apparently
			path = uri;
			return;
		}

		if(idx > 0 && uri[idx] == ':') {
			scheme = uri[0 .. idx];
			idx++;
		} else {
			// we need to rewind; it found a / but no :, so the whole thing is prolly a path...
			idx = 0;
		}

		// note the strict `<`: a uri that ends right after the `//` is not given an (empty)
		// authority; the `//` is left to be picked up as the path below
		if(idx + 2 < uri.length && uri[idx .. idx + 2] == "//") {
			// we have an authority....
			idx += 2;

			auto authority_start = idx;
			authority_loop: foreach(char c; uri[idx .. $]) {
				switch(c) {
					case '/':
					case '?':
					case '#':
						break authority_loop;
					default:
				}
				idx++;
			}

			auto authority = uri[authority_start .. idx];

			auto idx2 = authority.indexOf("@");
			if(idx2 != -1) {
				userinfo = authority[0 .. idx2];
				authority = authority[idx2 + 1 .. $];
			}

			if(authority.length && authority[0] == '[') {
				// ipv6 address special casing
				// the colons inside the brackets are part of the address, so a port
				// separator is only recognized immediately after the closing bracket
				idx2 = authority.indexOf("]");
				if(idx2 != -1) {
					auto end = authority[idx2 + 1 .. $];
					if(end.length && end[0] == ':')
						idx2 = idx2 + 1;
					else
						idx2 = -1;
				}
			} else {
				idx2 = authority.indexOf(":");
			}

			if(idx2 == -1) {
				port = 0; // 0 means not specified; we should use the default for the scheme
				host = authority;
			} else {
				host = authority[0 .. idx2];
				if(idx2 + 1 < authority.length)
					port = to!int(authority[idx2 + 1 .. $]);
				else
					port = 0;
			}
		}

		path_loop:
		auto path_start = idx;

		foreach(char c; uri[idx .. $]) {
			if(c == '?' || c == '#')
				break;
			idx++;
		}

		path = uri[path_start .. idx];

		if(idx == uri.length)
			return; // nothing more to examine...

		if(uri[idx] == '?') {
			idx++;
			auto query_start = idx;
			foreach(char c; uri[idx .. $]) {
				if(c == '#')
					break;
				idx++;
			}
			query = uri[query_start .. idx];
		}

		if(idx < uri.length && uri[idx] == '#') {
			idx++;
			fragment = uri[idx .. $];
		}

		// uriInvalidated = false;
	}

	// Concatenates the components back together; empty components (and a zero port) are left out.
	private string rebuildUri() const {
		string ret;
		if(scheme.length)
			ret ~= scheme ~ ":";
		if(userinfo.length || host.length)
			ret ~= "//";
		if(userinfo.length)
			ret ~= userinfo ~ "@";
		if(host.length)
			ret ~= host;
		if(port)
			ret ~= ":" ~ to!string(port);

		ret ~= path;

		if(query.length)
			ret ~= "?" ~ query;

		if(fragment.length)
			ret ~= "#" ~ fragment;

		// uri = ret;
		// uriInvalidated = false;
		return ret;
	}

	/// Converts the broken down parts back into a complete string
	string toString() const {
		// if(uriInvalidated)
		return rebuildUri();
	}

	/// Returns a new absolute Uri given a base. It treats this one as
	/// relative where possible, but absolute if not. (If protocol, domain, or
	/// other info is not set, the new one inherits it from the base.)
	///
	/// Browsers use a function like this to figure out links in html.
	Uri basedOn(in Uri baseUrl) const {
		Uri n = this; // copies
		if(n.scheme == "data")
			return n;
		// n.uriInvalidated = true; // make sure we regenerate...

		// userinfo is not inherited... is this wrong?

		// if anything is given in the existing url, we don't use the base anymore.
		if(n.scheme.length == 0) {
			n.scheme = baseUrl.scheme;
			if(n.host.length == 0) {
				n.host = baseUrl.host;
				if(n.port == 0) {
					n.port = baseUrl.port;
					if(n.path.length > 0 && n.path[0] != '/') {
						// relative path: resolve against the base's directory, i.e. everything
						// up to and including its last slash
						auto b = baseUrl.path[0 .. baseUrl.path.lastIndexOf("/") + 1];
						if(b.length == 0)
							b = "/";
						n.path = b ~ n.path;
					} else if(n.path.length == 0) {
						n.path = baseUrl.path;
					}
				}
			}
		}

		n.removeDots();

		// if still basically talking to the same thing, we should inherit the unix path
		// too since basically the unix path is saying for this service, always use this override.
		if(n.host == baseUrl.host && n.scheme == baseUrl.scheme && n.port == baseUrl.port)
			n.unixSocketPath = baseUrl.unixSocketPath;

		return n;
	}

	/++
		Resolves ../ and ./ parts of the path. Used in the implementation of [basedOn] and you could also use it to normalize things.

		A `..` that has nothing left to remove (more `..` segments than preceding segments) is dropped.
	+/
	void removeDots() {
		auto parts = this.path.split("/");
		string[] toKeep;
		foreach(part; parts) {
			if(part == ".") {
				continue;
			} else if(part == "..") {
				// only pop when there is something to pop; slicing `[0 .. $-1]` on an
				// empty array would be a range violation
				if(toKeep.length)
					toKeep = toKeep[0 .. $-1];
				continue;
			} else {
				//if(toKeep.length && toKeep[$-1].length == 0 && part.length == 0)
					//continue; // skip a `//` situation
				toKeep ~= part;
			}
		}

		auto path = toKeep.join("/");
		if(path.length && path[0] != '/')
			path = "/" ~ path;

		this.path = path;
	}

	unittest {
		// more `..` segments than there are segments to remove must not crash
		Uri u;
		u.path = "/a/../../b";
		u.removeDots();
		assert(u.path == "/b");

		assert(Uri("../../foo").basedOn(Uri("http://example.com/")).toString() == "http://example.com/foo");
	}

	unittest {
		auto uri = Uri("test.html");
		assert(uri.path == "test.html");
		uri = Uri("path/1/lol");
		assert(uri.path == "path/1/lol");
		uri = Uri("http://me@example.com");
		assert(uri.scheme == "http");
		assert(uri.userinfo == "me");
		assert(uri.host == "example.com");
		uri = Uri("http://example.com/#a");
		assert(uri.scheme == "http");
		assert(uri.host == "example.com");
		assert(uri.fragment == "a");
		uri = Uri("#foo");
		assert(uri.fragment == "foo");
		uri = Uri("?lol");
		assert(uri.query == "lol");
		uri = Uri("#foo?lol");
		assert(uri.fragment == "foo?lol");
		uri = Uri("?lol#foo");
		assert(uri.fragment == "foo");
		assert(uri.query == "lol");

		uri = Uri("http://127.0.0.1/");
		assert(uri.host == "127.0.0.1");
		assert(uri.port == 0);

		uri = Uri("http://127.0.0.1:123/");
		assert(uri.host == "127.0.0.1");
		assert(uri.port == 123);

		uri = Uri("http://[ff:ff::0]/");
		assert(uri.host == "[ff:ff::0]");

		uri = Uri("http://[ff:ff::0]:123/");
		assert(uri.host == "[ff:ff::0]");
		assert(uri.port == 123);
	}

	// This can sometimes be a big pain in the butt for me, so lots of copy/paste here to cover
	// the possibilities.
	unittest {
		auto url = Uri("cool.html"); // checking relative links

		assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/what/cool.html");
		assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/what/cool.html");
		assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/what/cool.html");
		assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/cool.html");
		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/cool.html");
		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/what/cool.html");
		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/what/cool.html");
		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/what/cool.html");
		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/cool.html");

		url = Uri("/something/cool.html"); // same server, different path
		assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/something/cool.html");
		assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/something/cool.html");
		assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/something/cool.html");
		assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/something/cool.html");
		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/something/cool.html");
		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/something/cool.html");
		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/something/cool.html");
		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/something/cool.html");
		assert(url.basedOn(Uri("http://test.com")) == "http://test.com/something/cool.html");

		url = Uri("?query=answer"); // same path, server, protocol, and port, just different query string and fragment
		assert(url.basedOn(Uri("http://test.com/what/test.html")) == "http://test.com/what/test.html?query=answer");
		assert(url.basedOn(Uri("https://test.com/what/test.html")) == "https://test.com/what/test.html?query=answer");
		assert(url.basedOn(Uri("http://test.com/what/")) == "http://test.com/what/?query=answer");
		assert(url.basedOn(Uri("http://test.com/")) == "http://test.com/?query=answer");
		assert(url.basedOn(Uri("http://test.com")) == "http://test.com?query=answer");
		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b")) == "http://test.com/what/test.html?query=answer");
		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d")) == "http://test.com/what/test.html?query=answer");
		assert(url.basedOn(Uri("http://test.com/what/test.html?a=b&c=d#what")) == "http://test.com/what/test.html?query=answer");
		assert(url.basedOn(Uri("http://test.com")) == "http://test.com?query=answer");

		url = Uri("/test/bar");
		assert(Uri("./").basedOn(url) == "/test/", Uri("./").basedOn(url));
		assert(Uri("../").basedOn(url) == "/");

		url = Uri("http://example.com/");
		assert(Uri("../foo").basedOn(url) == "http://example.com/foo");

		//auto uriBefore = url;
		url = Uri("#anchor"); // everything should remain the same except the anchor
		//uriBefore.anchor = "anchor");
		//assert(url == uriBefore);

		url = Uri("//example.com"); // same protocol, but different server. the path here should be blank.

		url = Uri("//example.com/example.html"); // same protocol, but different server and path

		url = Uri("http://example.com/test.html"); // completely absolute link should never be modified

		url = Uri("http://example.com"); // completely absolute link should never be modified, even if it has no path

		// FIXME: add something for port too
	}
}

/// Makes a `data:` uri (base64 encoded, with the given mime type) that can be used as links in most newer browsers (IE8+).
string makeDataUrl()(string mimeType, in void[] data) {
	import std.base64; // FIXME then i can remove the () template
	auto data64 = Base64.encode(cast(const(ubyte[])) data);
	return "data:" ~ mimeType ~ ";base64," ~ cast(string)(data64);
}

/++
	Breaks down a url encoded string into a name -> list of values map.

	Params:
		data = the encoded string, e.g. `a=1&b=2&a=3`
		separator = the string that separates one variable from the next
		namesInOrder = if not null, each decoded name is appended here in the order it appears in `data`
		valuesInOrder = if not null, each decoded value is appended here in the order it appears in `data`

	Returns:
		Every value seen for each name. A variable with no `=` gets an empty string as its value.
+/
string[][string] decodeVariables(string data, string separator = "&", string[]* namesInOrder = null, string[]* valuesInOrder = null) {
	auto vars = data.split(separator);
	string[][string] _get;
	foreach(var; vars) {
		auto equal = var.indexOf("=");
		string name;
		string value;
		if(equal == -1) {
			// note: unlike the branch below, no + -> space replacement is applied here
			name = decodeUriComponent(var);
			value = "";
		} else {
			//_get[decodeUriComponent(var[0..equal])] ~= decodeUriComponent(var[equal + 1 .. $].replace("+", " "));
			// stupid + -> space conversion.
			name = decodeUriComponent(var[0..equal].replace("+", " "));
			value = decodeUriComponent(var[equal + 1 .. $].replace("+", " "));
		}

		_get[name] ~= value;
		if(namesInOrder)
			(*namesInOrder) ~= name;
		if(valuesInOrder)
			(*valuesInOrder) ~= value;
	}
	return _get;
}

/// breaks down a url encoded string, but only returns the last value of any array
string[string] decodeVariablesSingle(string data) {
	string[string] va;
	auto varArray = decodeVariables(data);
	foreach(k, v; varArray)
		va[k] = v[$-1];

	return va;
}


/// url encodes the whole set of variables as `name=value` pairs joined by `&`
string encodeVariables(in string[string] data) {
	string ret;

	bool outputted = false;
	foreach(k, v; data) {
		if(outputted)
			ret ~= "&";
		else
			outputted = true;

		ret ~= encodeUriComponent(k) ~ "=" ~ encodeUriComponent(v);
	}

	return ret;
}

/// url encodes the whole set of variables as `name=value` pairs joined by `&`, emitting one pair for each value of a name
string encodeVariables(in string[][string] data) {
	string ret;

	bool outputted = false;
	foreach(k, arr; data) {
		foreach(v; arr) {
			if(outputted)
				ret ~= "&";
			else
				outputted = true;
			ret ~= encodeUriComponent(k) ~ "=" ~ encodeUriComponent(v);
		}
	}

	return ret;
}

/// Encodes all but the explicitly unreserved characters per rfc 3986
/// Alphanumeric and -_.~ are the only ones left unencoded
/// name is borrowed from php
string rawurlencode(in char[] data) {
	string ret;
	ret.reserve(data.length * 2);
	foreach(char c; data) {
		if(
			(c >= 'a' && c <= 'z') ||
			(c >= 'A' && c <= 'Z') ||
			(c >= '0' && c <= '9') ||
			c == '-' || c == '_' || c == '.' || c == '~')
		{
			ret ~= c;
		} else {
			ret ~= '%';
			// since we iterate on char, this should give us the octets of the full utf8 string
			ret ~= toHexUpper(c);
		}
	}

	return ret;
}


/// Returns the two uppercase hexadecimal digits for `num`, high nibble first, e.g. `0x2F` gives `"2F"`.
char[2] toHexUpper(ubyte num) {
	static immutable string digits = "0123456789ABCDEF";
	char[2] ret;
	ret[0] = digits[num >> 4];
	ret[1] = digits[num & 0x0F];
	return ret;
}

unittest {
	auto h = toHexUpper(0x00);
	assert(h[] == "00");
	h = toHexUpper(0x2F);
	assert(h[] == "2F");
	h = toHexUpper(0xAB);
	assert(h[] == "AB");
	h = toHexUpper(0xFF);
	assert(h[] == "FF");

	assert(rawurlencode("a b/c~") == "a%20b%2Fc~");
	assert(rawurlencode("\u00e9") == "%C3%A9");
}