diff --git a/examples/curl.c b/examples/curl.c index 4d3007348..88d1c41e4 100644 --- a/examples/curl.c +++ b/examples/curl.c @@ -183,7 +183,7 @@ int main(int argc, char *argv[]) { struct Url url; char *host, *port; bool usessl = false; - _gc(ParseUrl(urlarg, -1, &url)); + _gc(ParseUrl(urlarg, -1, &url, kUrlPlus)); _gc(url.params.p); if (url.scheme.n) { if (url.scheme.n == 5 && !memcasecmp(url.scheme.p, "https", 5)) { diff --git a/net/http/parseurl.c b/net/http/parseurl.c index 7fa60fd52..d46d0263e 100644 --- a/net/http/parseurl.c +++ b/net/http/parseurl.c @@ -28,8 +28,7 @@ struct UrlParser { char *p, *q; const char *s; - unsigned c, i, n; - char isform, islatin1, isopaque; + unsigned c, i, n, f; }; static void EmitLatin1(char **p, int c) { @@ -99,7 +98,7 @@ static bool ParseScheme(struct UrlParser *u, struct Url *h) { return false; } } else { - u->isopaque = true; + u->f |= kUrlOpaque; return false; } } else if (u->c == '#' || u->c == '?') { @@ -110,7 +109,7 @@ static bool ParseScheme(struct UrlParser *u, struct Url *h) { } else if (u->c == '%') { ParseEscape(u); return false; - } else if (u->c >= 0200 && u->islatin1) { + } else if (u->c >= 0200 && (u->f & kUrlLatin1)) { EmitLatin1(&u->p, u->c); return false; } else { @@ -161,7 +160,7 @@ static void ParseAuthority(struct UrlParser *u, struct Url *h) { u->q = u->p; } else if (u->c == '%') { ParseEscape(u); - } else if (u->c >= 0200 && u->islatin1) { + } else if (u->c >= 0200 && (u->f & kUrlLatin1)) { EmitLatin1(&u->p, u->c); } else { *u->p++ = u->c; @@ -188,11 +187,11 @@ static void ParsePath(struct UrlParser *u, struct UrlView *h) { u->c = u->s[u->i++] & 255; if (u->c == '#') { break; - } else if (u->c == '?' && !u->isopaque) { + } else if (u->c == '?' 
&& !(u->f & kUrlOpaque)) { break; } else if (u->c == '%') { ParseEscape(u); - } else if (u->c >= 0200 && u->islatin1) { + } else if (u->c >= 0200 && (u->f & kUrlLatin1)) { EmitLatin1(&u->p, u->c); } else { *u->p++ = u->c; @@ -213,7 +212,7 @@ static void ParseQuery(struct UrlParser *u, struct UrlParams *h) { } else if (u->c == '%') { ParseEscape(u); } else if (u->c == '+') { - *u->p++ = u->isform ? ' ' : '+'; + *u->p++ = (u->f & kUrlPlus) ? ' ' : '+'; } else if (u->c == '&') { EmitVal(u, h, t); t = false; @@ -223,7 +222,7 @@ static void ParseQuery(struct UrlParser *u, struct UrlParams *h) { } else { *u->p++ = '='; } - } else if (u->c >= 0200 && u->islatin1) { + } else if (u->c >= 0200 && (u->f & kUrlLatin1)) { EmitLatin1(&u->p, u->c); } else { *u->p++ = u->c; @@ -237,7 +236,7 @@ static void ParseFragment(struct UrlParser *u, struct UrlView *h) { u->c = u->s[u->i++] & 255; if (u->c == '%') { ParseEscape(u); - } else if (u->c >= 0200 && u->islatin1) { + } else if (u->c >= 0200 && (u->f & kUrlLatin1)) { EmitLatin1(&u->p, u->c); } else { *u->p++ = u->c; @@ -248,28 +247,6 @@ static void ParseFragment(struct UrlParser *u, struct UrlView *h) { u->q = u->p; } -static char *ParseUrlImpl(const char *s, size_t n, struct Url *h, bool latin1) { - char *m; - struct UrlParser u; - if (n == -1) n = s ? strlen(s) : 0; - u.i = 0; - u.c = 0; - u.s = s; - u.n = n; - u.isform = false; - u.isopaque = false; - u.islatin1 = latin1; - bzero(h, sizeof(*h)); - if ((m = malloc(latin1 ? u.n * 2 : u.n))) { - u.q = u.p = m; - if (ParseScheme(&u, h)) ParseAuthority(&u, h); - if (u.c != '#' && u.c != '?') ParsePath(&u, &h->path); - if (u.c == '?') ParseQuery(&u, &h->params); - if (u.c == '#') ParseFragment(&u, &h->fragment); - } - return m; -} - /** * Parses URL. 
* @@ -298,43 +275,39 @@ static char *ParseUrlImpl(const char *s, size_t n, struct Url *h, bool latin1) { * @param s is value like `/hi?x=y&z` or `http://a.example/hi#x` * @param n is byte length and -1 implies strlen * @param h is assumed to be uninitialized + * @param f is flags which may have: + * - `kUrlPlus` to turn `+` into space in query params + * - `kUrlLatin1` to transcode ISO-8859-1 input into UTF-8 * @return memory backing UrlView needing free (and h.params.p too) * @see URI Generic Syntax RFC3986 RFC2396 * @see EncodeUrl() */ -char *ParseUrl(const char *s, size_t n, struct Url *h) { - return ParseUrlImpl(s, n, h, false); -} - -/** - * Parses HTTP Request-URI. - * - * The input is ISO-8859-1 which is transcoded to UTF-8. Therefore we - * assume percent-encoded bytes are expressed as UTF-8. Returned values - * might contain things like NUL characters, C0, and C1 control codes. - * UTF-8 isn't checked for validity and may contain overlong values. - * Absent can be discerned from empty by checking if the pointer is set. - * - * There's no failure condition for this routine. This is a permissive - * parser that doesn't impose character restrictions beyond what is - * necessary for parsing. This doesn't normalize path segments like `.` - * or `..`. Use IsAcceptablePath() to check for those. - * - * @param s is value like `/hi?x=y&z` or `http://a.example/hi#x` - * @param n is byte length and -1 implies strlen - * @param h is assumed to be uninitialized - * @return memory backing UrlView needing free (and h.params.p too) - */ -char *ParseRequestUri(const char *s, size_t n, struct Url *h) { - return ParseUrlImpl(s, n, h, true); +char *ParseUrl(const char *s, size_t n, struct Url *h, int f) { + char *m; + struct UrlParser u; + if (n == -1) n = s ? strlen(s) : 0; + u.i = 0; + u.c = 0; + u.s = s; + u.n = n; + u.f = f; + bzero(h, sizeof(*h)); + if ((m = malloc((f & kUrlLatin1) ? 
u.n * 2 : u.n))) { + u.q = u.p = m; + if (ParseScheme(&u, h)) ParseAuthority(&u, h); + if (u.c != '#' && u.c != '?') ParsePath(&u, &h->path); + if (u.c == '?') ParseQuery(&u, &h->params); + if (u.c == '#') ParseFragment(&u, &h->fragment); + } + return m; } /** * Parses HTTP POST key-value params. * - * These are similar to the parameters found in a Request-URI. The main - * difference is that `+` is translated into space here. The mime type - * for this is application/x-www-form-urlencoded. + * These are similar to the parameters found in a Request-URI, except + * usually submitted via an HTTP POST request. We translate `+` into + * space. The mime type is application/x-www-form-urlencoded. * * This parser is charset agnostic. Returned values might contain things * like NUL characters, NUL, control codes, and non-canonical encodings. @@ -357,9 +330,7 @@ char *ParseParams(const char *s, size_t n, struct UrlParams *h) { u.s = s; u.n = n; u.c = '?'; - u.isform = true; - u.islatin1 = false; - u.isopaque = false; + u.f = kUrlPlus; if ((m = malloc(u.n))) { u.q = u.p = m; ParseQuery(&u, h); @@ -399,9 +370,7 @@ char *ParseHost(const char *s, size_t n, struct Url *h) { u.c = 0; u.s = s; u.n = n; - u.isform = false; - u.islatin1 = true; - u.isopaque = false; + u.f = kUrlLatin1; if ((m = malloc(u.n * 2))) { u.q = u.p = m; ParseAuthority(&u, h); diff --git a/net/http/url.h b/net/http/url.h index 77c047145..58b545a8f 100644 --- a/net/http/url.h +++ b/net/http/url.h @@ -1,5 +1,10 @@ #ifndef COSMOPOLITAN_NET_HTTP_URL_H_ #define COSMOPOLITAN_NET_HTTP_URL_H_ + +#define kUrlPlus 1 +#define kUrlLatin1 2 +#define kUrlOpaque 4 + #if !(__ASSEMBLER__ + __LINKER__ + 0) COSMOPOLITAN_C_START_ @@ -28,9 +33,8 @@ struct Url { }; char *EncodeUrl(struct Url *, size_t *); -char *ParseUrl(const char *, size_t, struct Url *); +char *ParseUrl(const char *, size_t, struct Url *, int); char *ParseParams(const char *, size_t, struct UrlParams *); -char *ParseRequestUri(const char *, size_t, struct Url 
*); char *ParseHost(const char *, size_t, struct Url *); char *EscapeUrlView(char *, struct UrlView *, const char[256]); diff --git a/test/net/http/parseurl_test.c b/test/net/http/parseurl_test.c index cb8b48720..fb0db51e9 100644 --- a/test/net/http/parseurl_test.c +++ b/test/net/http/parseurl_test.c @@ -16,6 +16,7 @@ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/fmt/internal.h" #include "libc/limits.h" #include "libc/mem/gc.internal.h" #include "libc/mem/mem.h" @@ -29,7 +30,7 @@ TEST(ParseUrl, testEmpty) { struct Url h; - gc(ParseUrl(0, 0, &h)); + gc(ParseUrl(0, 0, &h, 0)); gc(h.params.p); ASSERT_EQ(0, h.params.n); ASSERT_STREQ("", gc(EncodeUrl(&h, 0))); @@ -37,7 +38,7 @@ TEST(ParseUrl, testEmpty) { TEST(ParseUrl, testFragment) { struct Url h; - gc(ParseUrl("#x", -1, &h)); + gc(ParseUrl("#x", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(0, h.path.n); ASSERT_EQ(1, h.fragment.n); @@ -47,7 +48,7 @@ TEST(ParseUrl, testFragment) { TEST(ParseUrl, testFragmentAbsent_isNull) { struct Url h; - gc(ParseUrl("", -1, &h)); + gc(ParseUrl("", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(0, h.fragment.p); ASSERT_EQ(0, h.fragment.n); @@ -56,7 +57,7 @@ TEST(ParseUrl, testFragmentAbsent_isNull) { TEST(ParseUrl, testFragmentEmpty_isNonNull) { struct Url h; - gc(ParseUrl("#", -1, &h)); /* python's uri parser is wrong here */ + gc(ParseUrl("#", -1, &h, 0)); /* python's uri parser is wrong here */ gc(h.params.p); ASSERT_NE(0, h.fragment.p); ASSERT_EQ(0, h.fragment.n); @@ -65,7 +66,7 @@ TEST(ParseUrl, testFragmentEmpty_isNonNull) { TEST(ParseUrl, testPathFragment) { struct Url h; - gc(ParseUrl("x#y", -1, &h)); + gc(ParseUrl("x#y", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.path.n); ASSERT_EQ('x', h.path.p[0]); @@ -76,7 +77,7 @@ TEST(ParseUrl, testPathFragment) { TEST(ParseUrl, testAbsolutePath) { struct Url h; - gc(ParseUrl("/x/y", -1, &h)); + 
gc(ParseUrl("/x/y", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(4, h.path.n); ASSERT_BINEQ(u"/x/y", h.path.p); @@ -85,7 +86,7 @@ TEST(ParseUrl, testAbsolutePath) { TEST(ParseUrl, testRelativePath1) { struct Url h; - gc(ParseUrl("x", -1, &h)); + gc(ParseUrl("x", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.path.n); ASSERT_EQ('x', h.path.p[0]); @@ -94,7 +95,7 @@ TEST(ParseUrl, testRelativePath1) { TEST(ParseUrl, testOptions) { struct Url h; - gc(ParseUrl("*", -1, &h)); + gc(ParseUrl("*", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.path.n); ASSERT_EQ('*', h.path.p[0]); @@ -103,7 +104,7 @@ TEST(ParseUrl, testOptions) { TEST(ParseUrl, testRelativePath2) { struct Url h; - gc(ParseUrl("x/y", -1, &h)); + gc(ParseUrl("x/y", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(3, h.path.n); ASSERT_BINEQ(u"x/y", h.path.p); @@ -112,7 +113,7 @@ TEST(ParseUrl, testRelativePath2) { TEST(ParseUrl, testRoot) { struct Url h; - gc(ParseUrl("/", -1, &h)); + gc(ParseUrl("/", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.path.n); ASSERT_EQ('/', h.path.p[0]); @@ -121,7 +122,7 @@ TEST(ParseUrl, testRoot) { TEST(ParseUrl, testSchemePath) { struct Url h; - gc(ParseUrl("x:y", -1, &h)); + gc(ParseUrl("x:y", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.scheme.n); ASSERT_BINEQ(u"x", h.scheme.p); @@ -132,7 +133,7 @@ TEST(ParseUrl, testSchemePath) { TEST(ParseUrl, testSchemeAuthority) { struct Url h; - gc(ParseUrl("x://y", -1, &h)); + gc(ParseUrl("x://y", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.scheme.n); ASSERT_EQ('x', h.scheme.p[0]); @@ -141,9 +142,37 @@ TEST(ParseUrl, testSchemeAuthority) { ASSERT_STREQ("x://y", gc(EncodeUrl(&h, 0))); } +TEST(ParseUrl, testParamsPlus_maybeYes) { + struct Url h; + gc(ParseUrl("x?q=hi+there", -1, &h, kUrlPlus)); + gc(h.params.p); + ASSERT_EQ(1, h.path.n); + ASSERT_BINEQ(u"x", h.path.p); + ASSERT_EQ(1, h.params.n); + ASSERT_EQ(1, h.params.p[0].key.n); + ASSERT_EQ(8, h.params.p[0].val.n); + ASSERT_BINEQ(u"q", h.params.p[0].key.p); + ASSERT_BINEQ(u"hi there", h.params.p[0].val.p); + 
ASSERT_STREQ("x?q=hi%20there", gc(EncodeUrl(&h, 0))); +} + +TEST(ParseUrl, testParamsPlus_maybeNot) { + struct Url h; + gc(ParseUrl("x?q=hi+there", -1, &h, 0)); + gc(h.params.p); + ASSERT_EQ(1, h.path.n); + ASSERT_BINEQ(u"x", h.path.p); + ASSERT_EQ(1, h.params.n); + ASSERT_EQ(1, h.params.p[0].key.n); + ASSERT_EQ(8, h.params.p[0].val.n); + ASSERT_BINEQ(u"q", h.params.p[0].key.p); + ASSERT_BINEQ(u"hi+there", h.params.p[0].val.p); + ASSERT_STREQ("x?q=hi%2Bthere", gc(EncodeUrl(&h, 0))); +} + TEST(ParseUrl, testParamsQuestion_doesntTurnIntoSpace) { struct Url h; - gc(ParseUrl("x?+", -1, &h)); + gc(ParseUrl("x?+", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.path.n); ASSERT_BINEQ(u"x", h.path.p); @@ -155,7 +184,7 @@ TEST(ParseUrl, testParamsQuestion_doesntTurnIntoSpace) { TEST(ParseUrl, testUrl) { struct Url h; - gc(ParseUrl("a://b:B@c:C/d?e#f", -1, &h)); + gc(ParseUrl("a://b:B@c:C/d?e#f", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.scheme.n); ASSERT_EQ('a', h.scheme.p[0]); @@ -180,7 +209,7 @@ TEST(ParseUrl, testUrl) { TEST(ParseUrl, testEmptyQueryKeyVal_decodesToEmptyStrings) { struct Url h; - gc(ParseUrl("?=", -1, &h)); + gc(ParseUrl("?=", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.params.n); ASSERT_EQ(0, h.params.p[0].key.n); @@ -192,7 +221,7 @@ TEST(ParseUrl, testEmptyQueryKeyVal_decodesToEmptyStrings) { TEST(ParseUrl, testMultipleEquals_goesIntoValue) { struct Url h; - gc(ParseUrl("?==", -1, &h)); + gc(ParseUrl("?==", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.params.n); ASSERT_EQ(0, h.params.p[0].key.n); @@ -204,7 +233,7 @@ TEST(ParseUrl, testMultipleEquals_goesIntoValue) { TEST(ParseUrl, testUrlWithoutScheme) { struct Url h; - gc(ParseUrl("//b@c/d?e#f", -1, &h)); + gc(ParseUrl("//b@c/d?e#f", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(0, h.scheme.n); ASSERT_EQ(1, h.user.n); @@ -225,7 +254,7 @@ TEST(ParseUrl, testUrlWithoutScheme) { TEST(ParseUrl, testUrlWithoutUser) { struct Url h; - gc(ParseUrl("a://c/d?e#f", -1, &h)); + gc(ParseUrl("a://c/d?e#f", -1, &h, 0)); 
gc(h.params.p); ASSERT_EQ(1, h.scheme.n); ASSERT_EQ('a', h.scheme.p[0]); @@ -248,11 +277,11 @@ TEST(ParseUrl, testUrlWithoutUser) { TEST(ParseUrl, testEmptyParams_absentCanBeDiscerned) { struct Url h; - gc(ParseUrl("", -1, &h)); + gc(ParseUrl("", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(0, h.params.n); ASSERT_EQ(NULL, h.params.p); - gc(ParseUrl("?", -1, &h)); /* python's uri parser is wrong here */ + gc(ParseUrl("?", -1, &h, 0)); /* python's uri parser is wrong here */ gc(h.params.p); ASSERT_EQ(0, h.params.n); ASSERT_NE(NULL, h.params.p); @@ -260,7 +289,7 @@ TEST(ParseUrl, testEmptyParams_absentCanBeDiscerned) { TEST(ParseUrl, testWeirdAmps_areReproducible) { struct Url h; - gc(ParseUrl("?&&", -1, &h)); + gc(ParseUrl("?&&", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(3, h.params.n); ASSERT_EQ(0, h.params.p[0].key.n); @@ -280,7 +309,7 @@ TEST(ParseUrl, testWeirdAmps_areReproducible) { TEST(ParseUrl, testOpaquePart_canLetQuestionMarkGoInPath) { struct Url h; /* python's uri parser is wrong here */ - gc(ParseUrl("s:o!$%&'()*+,-./09:;=?@AZ_az#fragged", -1, &h)); + gc(ParseUrl("s:o!$%&'()*+,-./09:;=?@AZ_az#fragged", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(26, h.path.n); ASSERT_EQ(0, memcmp(h.path.p, "o!$%&'()*+,-./09:;=?@AZ_az", 26)); @@ -292,7 +321,7 @@ TEST(ParseUrl, testOpaquePart_canLetQuestionMarkGoInPath) { TEST(ParseUrl, testSchemePathWithoutAuthority_paramsAreAllowed) { struct Url h; - gc(ParseUrl("s:/o!$%&'()*+,-./09:;=?@AZ_az#fragged", -1, &h)); + gc(ParseUrl("s:/o!$%&'()*+,-./09:;=?@AZ_az#fragged", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(20, h.path.n); ASSERT_EQ(0, memcmp(h.path.p, "/o!$%&'()*+,-./09:;=", 20)); @@ -303,7 +332,7 @@ TEST(ParseUrl, testSchemePathWithoutAuthority_paramsAreAllowed) { TEST(ParseUrl, testOpaquePart_permitsPercentEncoding) { struct Url h; - gc(ParseUrl("s:%2Fo!$%&'()*+,-./09:;=?@AZ_az#fragged", -1, &h)); + gc(ParseUrl("s:%2Fo!$%&'()*+,-./09:;=?@AZ_az#fragged", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(27, h.path.n); ASSERT_EQ(0, memcmp(h.path.p, 
"/o!$%&'()*+,-./09:;=?@AZ_az", 27)); @@ -314,7 +343,7 @@ TEST(ParseUrl, testOpaquePart_permitsPercentEncoding) { TEST(ParseUrl, testTelephone) { struct Url h; - gc(ParseUrl("tel:+1-212-867-5309", -1, &h)); + gc(ParseUrl("tel:+1-212-867-5309", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(15, h.path.n); ASSERT_BINEQ(u"+1-212-867-5309", h.path.p); @@ -323,7 +352,7 @@ TEST(ParseUrl, testTelephone) { TEST(ParseUrl, testLolv6) { struct Url h; - gc(ParseUrl("//[::1]:31337", -1, &h)); + gc(ParseUrl("//[::1]:31337", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(3, h.host.n); ASSERT_BINEQ(u"::1", h.host.p); @@ -334,14 +363,14 @@ TEST(ParseUrl, testLolv6) { TEST(ParseUrl, testLolV6_withoutPort) { struct Url h; - gc(ParseUrl("//[::1]", -1, &h)); + gc(ParseUrl("//[::1]", -1, &h, 0)); gc(h.params.p); ASSERT_STREQ("//[::1]", gc(EncodeUrl(&h, 0))); } TEST(ParseUrl, testLolv7) { struct Url h; - gc(ParseUrl("//[vf.::1]", -1, &h)); + gc(ParseUrl("//[vf.::1]", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(6, h.host.n); ASSERT_BINEQ(u"vf.::1", h.host.p); @@ -352,14 +381,14 @@ TEST(ParseUrl, testLolv7) { TEST(ParseUrl, testLolv7WithoutColon_weCantProduceLegalEncodingSadly) { struct Url h; - gc(ParseUrl("//[v7.7.7.7]", -1, &h)); + gc(ParseUrl("//[v7.7.7.7]", -1, &h, 0)); gc(h.params.p); ASSERT_STREQ("//v7.7.7.7", gc(EncodeUrl(&h, 0))); } TEST(ParseUrl, testObviouslyIllegalIpLiteral_getsTreatedAsRegName) { struct Url h; - gc(ParseUrl("//[vf.::1%00]", -1, &h)); + gc(ParseUrl("//[vf.::1%00]", -1, &h, 0)); gc(h.params.p); ASSERT_STREQ("//vf.%3A%3A1%00", gc(EncodeUrl(&h, 0))); } @@ -411,7 +440,7 @@ TEST(EncodeUrl, testHostPortPlacedInHostField_ungoodIdea) { TEST(ParseUrl, testUrlWithoutParams) { struct Url h; - gc(ParseUrl("a://b@c/d#f", -1, &h)); + gc(ParseUrl("a://b@c/d#f", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.scheme.n); ASSERT_EQ('a', h.scheme.p[0]); @@ -430,7 +459,7 @@ TEST(ParseUrl, testUrlWithoutParams) { TEST(ParseUrl, testLatin1_doesNothing) { struct Url h; const char b[1] = {0377}; - 
gc(ParseUrl(b, 1, &h)); + gc(ParseUrl(b, 1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.path.n); ASSERT_EQ(0, memcmp("\377", h.path.p, 1)); @@ -440,7 +469,7 @@ TEST(ParseUrl, testLatin1_doesNothing) { TEST(ParseRequestUri, testLatin1_expandsMemoryToUtf8) { struct Url h; const char b[1] = {0377}; - gc(ParseRequestUri(b, 1, &h)); + gc(ParseUrl(b, 1, &h, kUrlPlus | kUrlLatin1)); gc(h.params.p); ASSERT_EQ(2, h.path.n); ASSERT_EQ(0, memcmp("\303\277", h.path.p, 2)); @@ -448,7 +477,7 @@ TEST(ParseRequestUri, testLatin1_expandsMemoryToUtf8) { TEST(ParseUrl, testPercentShrinkingMemory) { struct Url h; - gc(ParseUrl("%Ff", 3, &h)); + gc(ParseUrl("%Ff", 3, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.path.n); ASSERT_EQ(0, memcmp("\377", h.path.p, 1)); @@ -458,7 +487,7 @@ TEST(ParseUrl, testPercentShrinkingMemory) { TEST(ParseUrl, testEscapingWontOverrun) { struct Url h; char b[1] = {'%'}; - gc(ParseUrl(b, 1, &h)); + gc(ParseUrl(b, 1, &h, 0)); gc(h.params.p); ASSERT_EQ(1, h.path.n); ASSERT_EQ(0, memcmp("%", h.path.p, 1)); @@ -467,7 +496,7 @@ TEST(ParseUrl, testEscapingWontOverrun) { TEST(ParseUrl, testBadPercent_getsIgnored) { struct Url h; - gc(ParseUrl("%FZ", 3, &h)); + gc(ParseUrl("%FZ", 3, &h, 0)); gc(h.params.p); ASSERT_EQ(3, h.path.n); ASSERT_EQ(0, memcmp("%FZ", h.path.p, 3)); @@ -475,7 +504,7 @@ TEST(ParseUrl, testBadPercent_getsIgnored) { TEST(ParseUrl, testFileUrl) { struct Url h; - gc(ParseUrl("file:///etc/passwd", -1, &h)); + gc(ParseUrl("file:///etc/passwd", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(4, h.scheme.n); ASSERT_BINEQ(u"file", h.scheme.p); @@ -491,7 +520,7 @@ TEST(ParseUrl, testFileUrl) { TEST(EncodeUrl, testModifyingParseResultAndReencoding_addsStructure) { size_t n; struct Url h; - gc(ParseUrl("rel", -1, &h)); + gc(ParseUrl("rel", -1, &h, 0)); gc(h.params.p); h.host.n = 7; h.host.p = "justine"; @@ -580,14 +609,14 @@ TEST(EncodeUrl, testEmptyRegName_isLegal) { TEST(ParseUrl, testEmptyScheme_isNotPossible) { struct Url h; - gc(ParseUrl(":", -1, &h)); + 
gc(ParseUrl(":", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(0, h.scheme.n); ASSERT_EQ(0, h.scheme.p); ASSERT_EQ(1, h.path.n); ASSERT_EQ(':', h.path.p[0]); ASSERT_STREQ(":", gc(EncodeUrl(&h, 0))); - gc(ParseUrl("://hi", -1, &h)); + gc(ParseUrl("://hi", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(0, h.scheme.n); ASSERT_EQ(0, h.scheme.p); @@ -598,7 +627,7 @@ TEST(ParseUrl, testEmptyScheme_isNotPossible) { TEST(ParseUrl, testDataUri) { struct Url h; - gc(ParseUrl("data:image/png;base64,09AZaz+/==", -1, &h)); + gc(ParseUrl("data:image/png;base64,09AZaz+/==", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(0, h.host.n); ASSERT_EQ(0, h.host.p); @@ -611,7 +640,7 @@ TEST(ParseUrl, testDataUri) { TEST(ParseUrl, testBadSchemeCharacter_parserAssumesItsPath) { struct Url h; - gc(ParseUrl("fil\e://hi", -1, &h)); + gc(ParseUrl("fil\e://hi", -1, &h, 0)); gc(h.params.p); ASSERT_EQ(0, h.scheme.n); ASSERT_EQ(0, h.scheme.p); @@ -673,7 +702,7 @@ TEST(ParseRequestUri, fuzz) { for (j = 0; j < sizeof(B); ++j) { B[j] = C[rand() % sizeof(C)]; } - free(ParseRequestUri(B, 8, &h)); + free(ParseUrl(B, 8, &h, kUrlPlus | kUrlLatin1)); free(h.params.p); } } @@ -687,11 +716,11 @@ void A(void) { BENCH(ParseUrl, bench) { struct Url h; EZBENCH2("ParseParams hyperion", donothing, A()); - EZBENCH2("ParseUrl a", donothing, free(ParseUrl("a", -1, &h))); + EZBENCH2("ParseUrl a", donothing, free(ParseUrl("a", -1, &h, false))); EZBENCH2("ParseUrl a://b@c/d#f", donothing, - free(ParseUrl("a://b@c/d#f", -1, &h))); + free(ParseUrl("a://b@c/d#f", -1, &h, false))); EZBENCH2("ParseUrl a://b@c/d?z#f", donothing, ({ - free(ParseUrl("a://b@c/?zd#f", -1, &h)); + free(ParseUrl("a://b@c/?zd#f", -1, &h, 0)); free(h.params.p); })); EZBENCH2("ParseHost", donothing, free(ParseHost("127.0.0.1:34832", 15, &h))); @@ -700,14 +729,14 @@ BENCH(ParseUrl, bench) { BENCH(EncodeUrl, bench) { struct Url h; - gc(ParseUrl("a", -1, &h)); + gc(ParseUrl("a", -1, &h, 0)); EZBENCH2("EncodeUrl a", donothing, free(EncodeUrl(&h, 0))); - 
gc(ParseUrl("a://b@c/d#f", -1, &h)); + gc(ParseUrl("a://b@c/d#f", -1, &h, 0)); EZBENCH2("EncodeUrl a://b@c/d#f", donothing, free(EncodeUrl(&h, 0))); - gc(ParseUrl("a://b@c/?zd#f", -1, &h)); + gc(ParseUrl("a://b@c/?zd#f", -1, &h, 0)); gc(h.params.p); EZBENCH2("EncodeUrl a://b@c/d?z#f", donothing, free(EncodeUrl(&h, 0))); - gc(ParseUrl(kHyperion, kHyperionSize, &h)); + gc(ParseUrl(kHyperion, kHyperionSize, &h, 0)); gc(h.params.p); EZBENCH2("EncodeUrl hyperion", donothing, free(EncodeUrl(&h, 0))); } diff --git a/third_party/lua/luaparseurl.c b/third_party/lua/luaparseurl.c index a3310669d..98dbec592 100644 --- a/third_party/lua/luaparseurl.c +++ b/third_party/lua/luaparseurl.c @@ -36,12 +36,14 @@ static void LuaSetUrlView(lua_State *L, struct UrlView *v, const char *k) { } int LuaParseUrl(lua_State *L) { + int f; void *m; size_t n; struct Url h; const char *p; p = luaL_checklstring(L, 1, &n); - m = ParseUrl(p, n, &h); + f = luaL_optinteger(L, 2, 0); + m = ParseUrl(p, n, &h, f); lua_newtable(L); LuaSetUrlView(L, &h.scheme, "scheme"); LuaSetUrlView(L, &h.user, "user"); diff --git a/tool/net/fetch.inc b/tool/net/fetch.inc index 46f0e4a6a..cff031720 100644 --- a/tool/net/fetch.inc +++ b/tool/net/fetch.inc @@ -107,7 +107,7 @@ static int LuaFetch(lua_State *L) { /* * Parse URL. */ - _gc(ParseUrl(urlarg, urlarglen, &url)); + _gc(ParseUrl(urlarg, urlarglen, &url, true)); _gc(url.params.p); usingssl = false; if (url.scheme.n) { diff --git a/tool/net/help.txt b/tool/net/help.txt index b59f05097..d65e8badd 100644 --- a/tool/net/help.txt +++ b/tool/net/help.txt @@ -1335,28 +1335,50 @@ FUNCTIONS Converts RFC1123 string that looks like this: Mon, 29 Mar 2021 15:37:13 GMT to a UNIX timestamp. See parsehttpdatetime.c. - ParseUrl(str) → URL - Parses URL, returning object having the following fields: scheme, - user, pass, host, port, path, params, fragment. This parser is - charset agnostic. Percent encoded bytes are decoded for all - fields. 
Returned values might contain things like NUL characters, - spaces, control codes, and non-canonical encodings. Absent can be - discerned from empty by checking if the pointer is set. There's no - failure condition for this routine. This is a permissive parser. - This doesn't normalize path segments like `.` or `..` so use - IsAcceptablePath() to check for those. No restrictions are imposed - beyond that which is strictly necessary for parsing. All the data - that is provided will be consumed to the one of the fields. Strict - conformance is enforced on some fields more than others, like - scheme, since it's the most non-deterministically defined field of - them all. Please note this is a URL parser, not a URI parser. - Which means we support everything everything the URI spec says we - should do except for the things we won't do, like tokenizing path - segments into an array and then nesting another array beneath each - of those for storing semicolon parameters. So this parser won't - make SIP easy. What it can do is parse HTTP URLs and most URIs - like data:opaque, better in fact than most things which claim to - be URI parsers. + ParseUrl(url:str[, flags:int]) → URL + + Parses URL. + + An object containing the following fields is returned: + + - `scheme` is a string, e.g. `"http"` + - `user` is the username string, or nil if absent + - `pass` is the password string, or nil if absent + - `host` is the hostname string, or nil if `url` was a path + - `port` is the port string, or nil if absent + - `path` is the path string, or nil if absent + - `params` is the URL parameters, e.g. `/?a=b&c` would be + represented as the data structure `{{"a", "b"}, {"c"}, ...}` + - `fragment` is the stuff after the `#` character + + `flags` may have: + + - `kUrlPlus` to turn `+` into space + - `kUrlLatin1` to transcode ISO-8859-1 input into UTF-8 + + This parser is charset agnostic. Percent encoded bytes are + decoded for all fields. 
Returned values might contain things + like NUL characters, spaces, control codes, and non-canonical + encodings. Absent can be discerned from empty by checking if + the pointer is set. + + There's no failure condition for this routine. This is a + permissive parser. This doesn't normalize path segments like + `.` or `..` so use IsAcceptablePath() to check for those. No + restrictions are imposed beyond that which is strictly + necessary for parsing. All the data that is provided will be + consumed into one of the fields. Strict conformance is + enforced on some fields more than others, like scheme, since + it's the most non-deterministically defined field of them all. + + Please note this is a URL parser, not a URI parser. Which + means we support everything the URI spec says we + should do except for the things we won't do, like tokenizing + path segments into an array and then nesting another array + beneath each of those for storing semicolon parameters. So + this parser won't make SIP easy. What it can do is parse HTTP + URLs and most URIs like data:opaque, better in fact than most + things which claim to be URI parsers. IsAcceptablePath(str) → bool Returns true if path doesn't contain ".", ".." 
or "//" segments diff --git a/tool/net/redbean.c b/tool/net/redbean.c index 976a0bd05..ef8dd48c3 100644 --- a/tool/net/redbean.c +++ b/tool/net/redbean.c @@ -99,6 +99,7 @@ #include "net/http/escape.h" #include "net/http/http.h" #include "net/http/ip.h" +#include "net/http/url.h" #include "net/https/https.h" #include "third_party/getopt/getopt.h" #include "third_party/lua/cosmo.h" @@ -5123,6 +5124,8 @@ static void LuaStart(void) { LuaSetConstant(L, "kLogWarn", kLogWarn); LuaSetConstant(L, "kLogError", kLogError); LuaSetConstant(L, "kLogFatal", kLogFatal); + LuaSetConstant(L, "kUrlPlus", kUrlPlus); + LuaSetConstant(L, "kUrlLatin1", kUrlLatin1); // create a list of custom content types lua_pushlightuserdata(L, (void *)&ctIdx); // push address as unique key lua_newtable(L); @@ -5673,8 +5676,8 @@ static char *SynchronizeStream(void) { static void ParseRequestParameters(void) { uint32_t ip; - FreeLater(ParseRequestUri(inbuf.p + cpm.msg.uri.a, - cpm.msg.uri.b - cpm.msg.uri.a, &url)); + FreeLater(ParseUrl(inbuf.p + cpm.msg.uri.a, cpm.msg.uri.b - cpm.msg.uri.a, + &url, kUrlPlus | kUrlLatin1)); if (!url.host.p) { if (HasHeader(kHttpXForwardedHost) && // !GetRemoteAddr(&ip, 0) && IsTrustedProxy(ip)) { diff --git a/tool/net/wb.c b/tool/net/wb.c index 25e6c0a7a..2c0a01b00 100644 --- a/tool/net/wb.c +++ b/tool/net/wb.c @@ -401,7 +401,7 @@ int main(int argc, char *argv[]) { /* * Parse URL. */ - _gc(ParseUrl(urlarg, -1, &url)); + _gc(ParseUrl(urlarg, -1, &url, kUrlPlus)); _gc(url.params.p); usessl = false; if (url.scheme.n) {