mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-06-06 20:52:29 +00:00
- 10.5% reduction of o//depend dependency graph - 8.8% reduction in latency of make command - Fix issue with temporary file cleanup There's a new -w option in compile.com that turns off the recent Landlock output path workaround for "good commands" which do not unlink() the output file like GNU tooling does. Our new GNU Make unveil sandboxing appears to have zero overhead in the grand scheme of things. Full builds are pretty fast since the only thing that's actually slowed us down is probably libcxx make -j16 MODE=rel RL: took 85,732,063µs wall time RL: ballooned to 323,612kb in size RL: needed 828,560,521µs cpu (11% kernel) RL: caused 39,080,670 page faults (99% memcpy) RL: 350,073 context switches (72% consensual) RL: performed 0 reads and 11,494,960 write i/o operations pledge() and unveil() no longer consider ENOSYS to be an error. These functions have also been added to Python's cosmo module. This change also removes some WIN32 APIs and System Five magnums which we're not using and it's doubtful anyone else would be too
409 lines
13 KiB
C
409 lines
13 KiB
C
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
|
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
│ Copyright 2021 Justine Alexandra Roberts Tunney │
|
|
│ │
|
|
│ Permission to use, copy, modify, and/or distribute this software for │
|
|
│ any purpose with or without fee is hereby granted, provided that the │
|
|
│ above copyright notice and this permission notice appear in all copies. │
|
|
│ │
|
|
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
|
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
|
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
|
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
|
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
|
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
|
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
|
│ PERFORMANCE OF THIS SOFTWARE. │
|
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
#include "libc/bits/likely.h"
|
|
#include "libc/limits.h"
|
|
#include "libc/mem/mem.h"
|
|
#include "libc/str/str.h"
|
|
#include "libc/x/x.h"
|
|
#include "net/http/escape.h"
|
|
#include "net/http/url.h"
|
|
|
|
/**
 * Cursor state shared by the parsing routines in this file.
 *
 * Input offsets are kept as `size_t` so that a length handed in by the
 * public entry points (which take `size_t n`) is never truncated, and so
 * that buffer-size arithmetic derived from `n` is done at full width.
 */
struct UrlParser {
  char *p;        /* output write cursor (next byte to be stored) */
  char *q;        /* start of the field currently being accumulated */
  const char *s;  /* input bytes being parsed */
  unsigned c;     /* most recently consumed input byte (0..255) */
  size_t i;       /* index of the next unread byte in s */
  size_t n;       /* number of bytes in s */
  bool isform;    /* query parsing turns '+' into ' ' when set */
  bool islatin1;  /* bytes >= 0200 are re-encoded as two-byte UTF-8 */
  bool isopaque;  /* set for `scheme:` not followed by '/'; keeps '?' in path */
};
|
|
|
|
/**
 * Writes ISO-8859-1 byte `c` as a two-byte UTF-8 sequence.
 *
 * Only the low eight bits of `c` are used, so a byte that got
 * sign-extended on its way here still produces a valid lead byte
 * rather than 0xFF. The call sites visible in this file only pass
 * values in the range 0200..0377.
 *
 * @param p points to the output cursor, which is advanced by two
 * @param c is the byte value to encode
 */
static void EmitLatin1(char **p, int c) {
  c &= 255;
  (*p)[0] = (char)(0300 | (c >> 6));
  (*p)[1] = (char)(0200 | (c & 077));
  *p += 2;
}
|
|
|
|
static bool EmitKey(struct UrlParser *u, struct UrlParams *h) {
|
|
struct UrlParam *p;
|
|
if ((p = realloc(h->p, ++h->n * sizeof(*h->p)))) {
|
|
p[h->n - 1].key.p = u->q;
|
|
p[h->n - 1].key.n = u->p - u->q;
|
|
u->q = u->p;
|
|
h->p = p;
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
static void EmitVal(struct UrlParser *u, struct UrlParams *h, bool t) {
|
|
if (!t) {
|
|
if (u->p > u->q || u->c != '?') {
|
|
if (EmitKey(u, h)) {
|
|
h->p[h->n - 1].val.p = NULL;
|
|
h->p[h->n - 1].val.n = 0;
|
|
}
|
|
}
|
|
} else {
|
|
h->p[h->n - 1].val.p = u->q;
|
|
h->p[h->n - 1].val.n = u->p - u->q;
|
|
u->q = u->p;
|
|
}
|
|
}
|
|
|
|
static void ParseEscape(struct UrlParser *u) {
|
|
int a, b, c = '%';
|
|
if (u->i + 2 <= u->n && ((a = kHexToInt[u->s[u->i + 0] & 255]) != -1 &&
|
|
(b = kHexToInt[u->s[u->i + 1] & 255]) != -1)) {
|
|
c = a << 4 | b;
|
|
u->i += 2;
|
|
}
|
|
*u->p++ = c;
|
|
}
|
|
|
|
static bool ParseScheme(struct UrlParser *u, struct Url *h) {
|
|
while (u->i < u->n) {
|
|
u->c = u->s[u->i++] & 255;
|
|
if (u->c == '/') {
|
|
if (u->i == 1 && u->i < u->n && u->s[u->i] == '/') {
|
|
++u->i;
|
|
return true;
|
|
} else {
|
|
*u->p++ = '/';
|
|
return false;
|
|
}
|
|
} else if (u->c == ':' && u->i > 1) {
|
|
h->scheme.p = u->q;
|
|
h->scheme.n = u->p - u->q;
|
|
u->q = u->p;
|
|
if (u->i < u->n && u->s[u->i] == '/') {
|
|
if (u->i + 1 < u->n && u->s[u->i + 1] == '/') {
|
|
u->i += 2;
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
} else {
|
|
u->isopaque = true;
|
|
return false;
|
|
}
|
|
} else if (u->c == '#' || u->c == '?') {
|
|
h->path.p = u->q;
|
|
h->path.n = u->p - u->q;
|
|
u->q = u->p;
|
|
return false;
|
|
} else if (u->c == '%') {
|
|
ParseEscape(u);
|
|
return false;
|
|
} else if (u->c >= 0200 && u->islatin1) {
|
|
EmitLatin1(&u->p, u->c);
|
|
return false;
|
|
} else {
|
|
*u->p++ = u->c;
|
|
if (u->i == 1) {
|
|
if (!(('A' <= u->c && u->c <= 'Z') || ('a' <= u->c && u->c <= 'z'))) {
|
|
return false;
|
|
}
|
|
} else {
|
|
if (!(('0' <= u->c && u->c <= '9') || ('A' <= u->c && u->c <= 'Z') ||
|
|
('a' <= u->c && u->c <= 'z') || u->c == '+' || u->c == '-' ||
|
|
u->c == '.')) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static void ParseAuthority(struct UrlParser *u, struct Url *h) {
|
|
unsigned t = 1;
|
|
const char *c = NULL;
|
|
while (u->i < u->n) {
|
|
u->c = u->s[u->i++] & 255;
|
|
if (u->c == '/' || u->c == '#' || u->c == '?') {
|
|
break;
|
|
} else if (u->c == '[') {
|
|
t = 0;
|
|
} else if (u->c == ']') {
|
|
t = 1;
|
|
} else if (u->c == ':' && t > 0) {
|
|
*u->p++ = ':';
|
|
c = u->p;
|
|
++t;
|
|
} else if (u->c == '@') {
|
|
if (c) {
|
|
h->user.p = u->q;
|
|
h->user.n = c - 1 - u->q;
|
|
h->pass.p = c;
|
|
h->pass.n = u->p - c;
|
|
c = NULL;
|
|
t = 1;
|
|
} else {
|
|
h->user.p = u->q;
|
|
h->user.n = u->p - u->q;
|
|
}
|
|
u->q = u->p;
|
|
} else if (u->c == '%') {
|
|
ParseEscape(u);
|
|
} else if (u->c >= 0200 && u->islatin1) {
|
|
EmitLatin1(&u->p, u->c);
|
|
} else {
|
|
*u->p++ = u->c;
|
|
}
|
|
}
|
|
if (t == 2) {
|
|
h->host.p = u->q;
|
|
h->host.n = c - 1 - u->q;
|
|
h->port.p = c;
|
|
h->port.n = u->p - c;
|
|
c = NULL;
|
|
} else {
|
|
h->host.p = u->q;
|
|
h->host.n = u->p - u->q;
|
|
}
|
|
u->q = u->p;
|
|
if (u->c == '/') {
|
|
*u->p++ = u->c;
|
|
}
|
|
}
|
|
|
|
static void ParsePath(struct UrlParser *u, struct UrlView *h) {
|
|
while (u->i < u->n) {
|
|
u->c = u->s[u->i++] & 255;
|
|
if (u->c == '#') {
|
|
break;
|
|
} else if (u->c == '?' && !u->isopaque) {
|
|
break;
|
|
} else if (u->c == '%') {
|
|
ParseEscape(u);
|
|
} else if (u->c >= 0200 && u->islatin1) {
|
|
EmitLatin1(&u->p, u->c);
|
|
} else {
|
|
*u->p++ = u->c;
|
|
}
|
|
}
|
|
h->p = u->q;
|
|
h->n = u->p - u->q;
|
|
u->q = u->p;
|
|
}
|
|
|
|
static void ParseQuery(struct UrlParser *u, struct UrlParams *h) {
|
|
bool t = false;
|
|
if (!h->p) h->p = malloc(0);
|
|
while (u->i < u->n) {
|
|
u->c = u->s[u->i++] & 255;
|
|
if (u->c == '#') {
|
|
break;
|
|
} else if (u->c == '%') {
|
|
ParseEscape(u);
|
|
} else if (u->c == '+') {
|
|
*u->p++ = u->isform ? ' ' : '+';
|
|
} else if (u->c == '&') {
|
|
EmitVal(u, h, t);
|
|
t = false;
|
|
} else if (u->c == '=') {
|
|
if (!t) {
|
|
t = EmitKey(u, h);
|
|
} else {
|
|
*u->p++ = '=';
|
|
}
|
|
} else if (u->c >= 0200 && u->islatin1) {
|
|
EmitLatin1(&u->p, u->c);
|
|
} else {
|
|
*u->p++ = u->c;
|
|
}
|
|
}
|
|
EmitVal(u, h, t);
|
|
}
|
|
|
|
static void ParseFragment(struct UrlParser *u, struct UrlView *h) {
|
|
while (u->i < u->n) {
|
|
u->c = u->s[u->i++] & 255;
|
|
if (u->c == '%') {
|
|
ParseEscape(u);
|
|
} else if (u->c >= 0200 && u->islatin1) {
|
|
EmitLatin1(&u->p, u->c);
|
|
} else {
|
|
*u->p++ = u->c;
|
|
}
|
|
}
|
|
h->p = u->q;
|
|
h->n = u->p - u->q;
|
|
u->q = u->p;
|
|
}
|
|
|
|
static char *ParseUrlImpl(const char *s, size_t n, struct Url *h, bool latin1) {
|
|
char *m;
|
|
struct UrlParser u;
|
|
if (n == -1) n = s ? strlen(s) : 0;
|
|
u.i = 0;
|
|
u.c = 0;
|
|
u.s = s;
|
|
u.n = n;
|
|
u.isform = false;
|
|
u.isopaque = false;
|
|
u.islatin1 = latin1;
|
|
bzero(h, sizeof(*h));
|
|
if ((m = malloc(latin1 ? u.n * 2 : u.n))) {
|
|
u.q = u.p = m;
|
|
if (ParseScheme(&u, h)) ParseAuthority(&u, h);
|
|
if (u.c != '#' && u.c != '?') ParsePath(&u, &h->path);
|
|
if (u.c == '?') ParseQuery(&u, &h->params);
|
|
if (u.c == '#') ParseFragment(&u, &h->fragment);
|
|
}
|
|
return m;
|
|
}
|
|
|
|
/**
 * Parses URL.
 *
 * This parser is charset agnostic. Percent-encoded bytes are decoded
 * in every field except the scheme. Returned values might contain
 * things like NUL characters, spaces, control codes, and non-canonical
 * encodings. An absent field can be told apart from an empty one by
 * checking whether its pointer is set.
 *
 * This routine has no failure condition, since the parser is
 * permissive. Path segments like `.` and `..` aren't normalized, so
 * use IsAcceptablePath() to check for those. No restrictions are
 * imposed beyond what's strictly necessary for parsing. Every byte of
 * `s` that's provided is consumed into one of the fields. Conformance
 * is enforced more strictly on some fields than others, e.g. scheme,
 * since it's the most non-deterministically defined field of them all.
 *
 * Please note this is a URL parser, not a URI parser. That means we
 * support everything the URI spec says we should, except for the
 * things we won't do, like tokenizing path segments into an array and
 * then nesting another array beneath each of those for storing
 * semicolon parameters. So this parser won't make SIP easy. What it
 * can do is parse HTTP URLs and most URIs like s:opaque, better in
 * fact than most things which claim to be URI parsers.
 *
 * @param s is value like `/hi?x=y&z` or `http://a.example/hi#x`
 * @param n is byte length and -1 implies strlen
 * @param h is assumed to be uninitialized
 * @return memory backing UrlView needing free (and h.params.p too)
 * @see URI Generic Syntax RFC3986 RFC2396
 * @see EncodeUrl()
 */
char *ParseUrl(const char *s, size_t n, struct Url *h) {
  const bool latin1 = false; /* input bytes are passed through as-is */
  return ParseUrlImpl(s, n, h, latin1);
}
|
|
|
|
/**
 * Parses HTTP Request-URI.
 *
 * The input is ISO-8859-1, which gets transcoded to UTF-8. We therefore
 * assume percent-encoded bytes are expressed as UTF-8. Returned values
 * might contain things like NUL characters, C0, and C1 control codes.
 * UTF-8 isn't checked for validity and may contain overlong values.
 * An absent field can be told apart from an empty one by checking
 * whether its pointer is set.
 *
 * This routine has no failure condition. It's a permissive parser that
 * doesn't impose character restrictions beyond what is necessary for
 * parsing. Path segments like `.` or `..` aren't normalized. Use
 * IsAcceptablePath() to check for those.
 *
 * @param s is value like `/hi?x=y&z` or `http://a.example/hi#x`
 * @param n is byte length and -1 implies strlen
 * @param h is assumed to be uninitialized
 * @return memory backing UrlView needing free (and h.params.p too)
 */
char *ParseRequestUri(const char *s, size_t n, struct Url *h) {
  const bool latin1 = true; /* transcode high bytes to UTF-8 */
  return ParseUrlImpl(s, n, h, latin1);
}
|
|
|
|
/**
|
|
* Parses HTTP POST key-value params.
|
|
*
|
|
* These are similar to the parameters found in a Request-URI. The main
|
|
* difference is that `+` is translated into space here. The mime type
|
|
* for this is application/x-www-form-urlencoded.
|
|
*
|
|
* This parser is charset agnostic. Returned values might contain things
|
|
* like NUL characters, NUL, control codes, and non-canonical encodings.
|
|
* Absent can be discerned from empty by checking if the pointer is set.
|
|
*
|
|
* There's no failure condition for this routine. This is a permissive
|
|
* parser that doesn't impose character restrictions beyond what is
|
|
* necessary for parsing.
|
|
*
|
|
* @param s is value like `foo=bar&x=y&z`
|
|
* @param n is byte length and -1 implies strlen
|
|
* @param h must be zeroed by caller and this appends if reused
|
|
* @return UrlView memory with same n needing free (h.p needs free too)
|
|
*/
|
|
char *ParseParams(const char *s, size_t n, struct UrlParams *h) {
|
|
char *m;
|
|
struct UrlParser u;
|
|
if (n == -1) n = s ? strlen(s) : 0;
|
|
u.i = 0;
|
|
u.s = s;
|
|
u.n = n;
|
|
u.c = '?';
|
|
u.isform = true;
|
|
u.islatin1 = false;
|
|
u.isopaque = false;
|
|
if ((m = malloc(u.n))) {
|
|
u.q = u.p = m;
|
|
ParseQuery(&u, h);
|
|
}
|
|
return m;
|
|
}
|
|
|
|
/**
|
|
* Parses HTTP Host header.
|
|
*
|
|
* The input is ISO-8859-1 which is transcoded to UTF-8. Therefore we
|
|
* assume percent-encoded bytes are expressed as UTF-8. Returned values
|
|
* might contain things like NUL characters, C0, and C1 control codes.
|
|
* UTF-8 isn't checked for validity and may contain overlong values.
|
|
* Absent can be discerned from empty by checking if the pointer is set.
|
|
*
|
|
* This function turns an HTTP header HOST[:PORT] into two strings, one
|
|
* for host and the other for port. You may then call IsAcceptableHost()
|
|
* and IsAcceptablePort() to see if they are valid values. After that a
|
|
* function like sscanf() can be used to do the thing you likely thought
|
|
* this function would do.
|
|
*
|
|
* This function doesn't initialize h since it's assumed this will be
|
|
* called conditionally after ParseRequestUri() if the host is absent.
|
|
* Fields unrelated to authority won't be impacted by this function.
|
|
*
|
|
* @param s is value like `127.0.0.1` or `foo.example:80`
|
|
* @param n is byte length and -1 implies strlen
|
|
* @param h is needs to be initialized by caller
|
|
* @return memory backing UrlView needing free
|
|
*/
|
|
char *ParseHost(const char *s, size_t n, struct Url *h) {
|
|
char *m;
|
|
struct UrlParser u;
|
|
if (n == -1) n = s ? strlen(s) : 0;
|
|
u.i = 0;
|
|
u.c = 0;
|
|
u.s = s;
|
|
u.n = n;
|
|
u.isform = false;
|
|
u.islatin1 = true;
|
|
u.isopaque = false;
|
|
if ((m = malloc(u.n * 2))) {
|
|
u.q = u.p = m;
|
|
ParseAuthority(&u, h);
|
|
}
|
|
return m;
|
|
}
|