implement a valid RFC3986 (URI) parser

Up until now I used a "poor man" approach: the uri parser is barely a
parser, it tries to extract the path from the request, with some minor
checking, and that's all.  This obviously is not RFC3986-compliant.

The new RFC3986 (URI) parser should be fully compliant.  It may accept
some invalid URIs, but it shouldn't reject or mis-parse valid ones.  (In
particular, the rule for the path is way more relaxed in this parser
than it is in the RFC text.)

A difference with RFC3986 is that we don't even try to parse the
(optional) userinfo part of a URI: following the Gemini spec we treat
it as an error.

A further caveat is that %2F in the path part of the URI is
indistinguishable from a literal '/': this is NOT conforming, but due
to the scope and use of gmid, I don't see how else to treat a %2F
sequence in the path (reject the URI?).
This commit is contained in:
Omar Polo 2020-12-25 13:13:12 +01:00
parent d5aba4c791
commit 33d32d1fd6
No known key found for this signature in database
GPG Key ID: 35F98C96A1786F0D
6 changed files with 659 additions and 154 deletions

1
.gitignore vendored
View File

@ -2,5 +2,6 @@ cert.pem
key.pem
TAGS
gmid
uri_test
*.o
docs

View File

@ -2,18 +2,24 @@ CC = cc
CFLAGS = -Wall -Wextra -g
LDFLAGS = -ltls
.PHONY: all clean
.PHONY: all clean test
all: gmid TAGS README.md
gmid: gmid.o
${CC} gmid.o -o gmid ${LDFLAGS}
gmid: gmid.o uri.o
${CC} gmid.o uri.o -o gmid ${LDFLAGS}
TAGS: gmid.c
-etags gmid.c || true
TAGS: gmid.c uri.c
-etags gmid.c uri.c || true
README.md: gmid.1
mandoc -Tmarkdown gmid.1 | sed -e '1d' -e '$$d' > README.md
clean:
rm -f gmid.o gmid
rm -f *.o gmid
uri_test: uri_test.o uri.o
${CC} uri_test.o uri.o -o uri_test ${LDFLAGS}
test: uri_test
./uri_test

155
gmid.c
View File

@ -34,6 +34,28 @@ int port;
int foreground;
int connected_clients;
/*
 * Table mapping a file name extension (without the leading dot) to the
 * MIME type string associated with it.  Several extensions may share
 * one MIME type.  The list is terminated by a {NULL, NULL} sentinel
 * entry.
 */
struct etm { /* file extension to mime */
const char *mime;
const char *ext;
} filetypes[] = {
{"application/pdf", "pdf"},
{"image/gif", "gif"},
{"image/jpeg", "jpg"},
{"image/jpeg", "jpeg"},
{"image/png", "png"},
{"image/svg+xml", "svg"},
{"text/gemini", "gemini"},
{"text/gemini", "gmi"},
{"text/markdown", "markdown"},
{"text/markdown", "md"},
{"text/plain", "txt"},
{"text/xml", "xml"},
/* sentinel: marks the end of the table */
{NULL, NULL}
};
void
siginfo_handler(int sig)
{
@ -51,102 +73,6 @@ starts_with(const char *str, const char *prefix)
return 1;
}
/*
 * Return a pointer to what follows the "//" marker of url.
 *
 * - no "//" at all: the URL is relative and is returned untouched;
 * - "//" at the very beginning: gemini is implied (e.g. //example.com);
 * - otherwise whatever precedes "//" must be exactly "gemini:", or
 *   NULL is returned.
 */
char *
url_after_proto(char *url)
{
	static const char proto[] = "gemini:";
	static const char marker[] = "//";
	char *sep;

	sep = strstr(url, marker);

	/* a relative URL */
	if (sep == NULL)
		return url;

	/* protocol omitted: gemini is implied */
	if (sep == url)
		return sep + strlen(marker);

	/* the prefix must have the length of "gemini:" ... */
	if ((size_t)(sep - url) != strlen(proto))
		return NULL;

	/* ... and its content too */
	if (!starts_with(url, proto))
		return NULL;

	return sep + strlen(marker);
}
/*
 * Return a pointer to the request part of url: for a relative URL that
 * is url itself, otherwise it is the first '/' found after the
 * protocol marker, or the end of the string when there is none.
 * Return NULL when url_after_proto() rejects the URL.
 */
char *
url_start_of_request(char *url)
{
	char *rest, *slash;

	rest = url_after_proto(url);

	/* either invalid (NULL) or non-absolute (unchanged) */
	if (rest == NULL || rest == url)
		return rest;

	slash = strchr(rest, '/');
	if (slash != NULL)
		return slash;
	return rest + strlen(rest);
}
/*
 * Cut url at the first CRLF by overwriting both bytes with NUL.
 * Return 1 on success; return 0 when no CRLF is present or when the
 * byte right after it is not NUL (in which case an error is logged for
 * client c).
 */
int
url_trim(struct client *c, char *url)
{
	char *crlf;

	crlf = strstr(url, "\r\n");
	if (crlf == NULL)
		return 0;

	crlf[0] = '\0';
	crlf[1] = '\0';

	if (crlf[2] == '\0')
		return 1;

	LOGE(c, "%s", "request longer than 1024 bytes");
	return 0;
}
/*
 * Split the query off path and crudely neutralise ".." components, all
 * in place.
 *
 * - the first '?' is replaced by NUL; the text after it is the query;
 * - a trailing "/.." is reduced to "/";
 * - a path that is exactly ".." becomes the empty string;
 * - every "../" substring is removed.
 *
 * Return a pointer to the query, or NULL when there is no '?'.
 */
char *
adjust_path(char *path)
{
	char *query, *dots;
	size_t len;

	query = strchr(path, '?');
	if (query != NULL)
		*query++ = '\0';

	/* /.. -> / */
	len = strlen(path);
	if (len >= 3 && strcmp(path + len - 3, "/..") == 0)
		path[len - 2] = '\0';

	/* if the path is only `..` trim out and exit */
	if (strcmp(path, "..") == 0) {
		path[0] = '\0';
		return query;
	}

	/*
	 * Remove every ../ in the path.  Only the tail that follows the
	 * "../" (terminator included) is moved: measuring the length
	 * from the start of the match would read three bytes past the
	 * end of the string.
	 */
	while ((dots = strstr(path, "../")) != NULL)
		memmove(dots, dots + 3, strlen(dots + 3) + 1);

	return query;
}
int
start_reply(struct pollfd *pfd, struct client *client, int code, const char *reason)
{
@ -224,7 +150,7 @@ check_path(struct client *c, const char *path, int *fd)
struct stat sb;
assert(path != NULL);
if ((*fd = openat(dirfd, path,
if ((*fd = openat(dirfd, *path ? path : ".",
O_RDONLY | O_NOFOLLOW | O_CLOEXEC)) == -1) {
return FILE_MISSING;
}
@ -288,16 +214,8 @@ err:
int
open_file(char *path, char *query, struct pollfd *fds, struct client *c)
open_file(char *fpath, char *query, struct pollfd *fds, struct client *c)
{
char fpath[PATHBUF];
bzero(fpath, sizeof(fpath));
if (*path != '.')
fpath[0] = '.';
strlcat(fpath, path, PATHBUF);
switch (check_path(c, fpath, &c->fd)) {
case FILE_EXECUTABLE:
/* +2 to skip the ./ */
@ -578,8 +496,8 @@ void
handle(struct pollfd *fds, struct client *client)
{
char buf[GEMINI_URL_LEN];
char *path;
char *query;
const char *parse_err;
struct uri uri;
switch (client->state) {
case S_OPEN:
@ -599,26 +517,19 @@ handle(struct pollfd *fds, struct client *client)
return;
}
if (!url_trim(client, buf)) {
if (!start_reply(fds, client, BAD_REQUEST, "bad request"))
if (!trim_req_uri(buf) || !parse_uri(buf, &uri, &parse_err)) {
if (!start_reply(fds, client, BAD_REQUEST, parse_err))
return;
goodbye(fds, client);
return;
}
if ((path = url_start_of_request(buf)) == NULL) {
if (!start_reply(fds, client, BAD_REQUEST, "bad request"))
return;
goodbye(fds, client);
return;
}
LOGI(client, "GET %s%s%s",
*uri.path ? uri.path : "/",
*uri.query ? "?" : "",
*uri.query ? uri.query : "");
query = adjust_path(path);
LOGI(client, "GET %s%s%s", path,
query ? "?" : "",
query ? query : "");
send_file(path, query, fds, client);
send_file(uri.path, uri.query, fds, client);
break;
case S_INITIALIZING:

42
gmid.h
View File

@ -107,6 +107,17 @@ struct client {
struct in_addr addr;
};
/*
 * The components of a parsed URI.  All the strings are NUL-terminated
 * C strings.
 */
struct uri {
/* scheme, i.e. what precedes "://" */
char *schema;
/* host part of the authority */
char *host;
/* textual port number (possibly empty) */
char *port;
/* numeric port */
uint16_t port_no;
/* path component */
char *path;
/* what follows '?' (possibly empty) */
char *query;
/* what follows '#' (possibly empty) */
char *fragment;
};
enum {
FILE_EXISTS,
FILE_EXECUTABLE,
@ -114,35 +125,10 @@ enum {
FILE_MISSING,
};
/*
 * Table mapping a file name extension (without the leading dot) to its
 * MIME type, terminated by a {NULL, NULL} sentinel entry.
 */
struct etm { /* file extension to mime */
const char *mime;
const char *ext;
} filetypes[] = {
{"application/pdf", "pdf"},
{"image/gif", "gif"},
{"image/jpeg", "jpg"},
{"image/jpeg", "jpeg"},
{"image/png", "png"},
{"image/svg+xml", "svg"},
{"text/gemini", "gemini"},
{"text/gemini", "gmi"},
{"text/markdown", "markdown"},
{"text/markdown", "md"},
{"text/plain", "txt"},
{"text/xml", "xml"},
/* sentinel: marks the end of the table */
{NULL, NULL}
};
/* gmid.c */
void siginfo_handler(int);
int starts_with(const char*, const char*);
char *url_after_proto(char*);
char *url_start_of_request(char*);
int url_trim(struct client*, char*);
char *adjust_path(char*);
ssize_t filesize(int);
int start_reply(struct pollfd*, struct client*, int, const char*);
@ -167,4 +153,8 @@ void loop(struct tls*, int);
void usage(const char*);
/* uri.c */
int parse_uri(char*, struct uri*, const char**);
int trim_req_uri(char*);
#endif

413
uri.c Normal file
View File

@ -0,0 +1,413 @@
/*
* Copyright (c) 2020 Omar Polo <op@omarpolo.com>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <ctype.h>
#include <string.h>
#include "gmid.h"
/*
* Notes from RFC3986
*
* => gemini://tanso.net/rfc/rfc3986.txt
*
*
* ABNF
* ====
*
* pct-encoded = "%" HEXDIG HEXDIG
*
* reserved = gen-delims / sub-delims
* gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
* sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
* / "*" / "+" / "," / ";" / "="
*
* unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
*
* URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
*
* hier-part = "//" authority path-abempty
* / path-absolute
* / path-rootless
* / path-empty
*
* scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
*
* authority = [ userinfo "@" ] host [ ":" port ]
*
* (note that userinfo isn't used for Gemini URL)
*
* host = IP-literal / IPv4address / reg-name
* reg-name = *( unreserved / pct-encoded / sub-delims )
*
* port = *DIGIT
*
* path = path-abempty ; begins with "/" or is empty
* / path-absolute ; begins with "/" but not "//"
* / path-noscheme ; begins with a non-colon segment
* / path-rootless ; begins with a segment
* / path-empty ; zero characters
*
* path-abempty = *( "/" segment )
* path-absolute = "/" [ segment-nz *( "/" segment ) ]
* path-noscheme = ; not used
* path-rootless = ; not used
* path-empty = ; not used
*
* segment = *pchar
* segment-nz = 1*pchar
* segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
* pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
*
* query = *( pchar / "/" / "?" )
*
* fragment = *( pchar / "/" / "?" )
*
*
* EXAMPLE
* =======
*
* foo://example.com:8042/over/there?name=ferret#nose
* \_/ \______________/\_________/ \_________/ \__/
* | | | | |
* scheme authority path query fragment
*
*/
/*
 * State shared by the parse_* routines.  The members are initialised
 * positionally by parse_uri(), so their order matters.
 */
struct parser {
/* cursor into the string being parsed; the routines advance it and
 * overwrite the delimiters they consume with NUL */
char *uri;
/* where the components found so far are stored */
struct uri *parsed;
/* message describing the failure; starts out as NULL */
const char *err;
};
/*
 * Character classes from the RFC3986 ABNF.
 *
 * The classification is done by functions, so the argument of
 * UNRESERVED() and SUB_DELIMITERS() is evaluated exactly once and may
 * be an arbitrary expression.  The value is converted to unsigned char
 * first: handing a negative char to the <ctype.h> functions is
 * undefined behaviour.  Both yield 1 on a match and 0 otherwise.
 */

/* unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" */
static inline int
uri_is_unreserved(unsigned char c)
{
	return isalnum(c)
	    || c == '-'
	    || c == '.'
	    || c == '_'
	    || c == '~';
}

/*
 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
 *            / "*" / "+" / "," / ";" / "="
 */
static inline int
uri_is_sub_delimiter(unsigned char c)
{
	switch (c) {
	case '!':
	case '$':
	case '&':
	case '\'':
	case '(':
	case ')':
	case '*':
	case '+':
	case ',':
	case ';':
	case '=':
		return 1;
	default:
		return 0;
	}
}

#define UNRESERVED(p)		uri_is_unreserved((unsigned char)(p))
#define SUB_DELIMITERS(p)	uri_is_sub_delimiter((unsigned char)(p))
/* Numeric value of the hexadecimal digit c; c must satisfy isxdigit(). */
static int
pct_hex_value(unsigned char c)
{
	if (isdigit(c))
		return c - '0';
	return tolower(c) - 'a' + 10;
}

/*
 * pct-encoded = "%" HEXDIG HEXDIG
 *
 * If the cursor is on a valid percent-encoded triplet, decode it in
 * place: the '%' is replaced by the decoded byte and the rest of the
 * string is shifted left by two.  The cursor is left ON the decoded
 * byte, it's up to the caller to step over it.
 *
 * Return 1 if a triplet was decoded, 0 otherwise.  When the cursor is
 * on a '%' that doesn't start a valid triplet, or on one that decodes
 * to NUL, p->err is set too.  A decoded NUL would silently truncate
 * the component it appears in, hence it's refused.
 */
static int
parse_pct_encoded(struct parser *p)
{
	unsigned char hi, lo;
	int byte;

	if (*p->uri != '%')
		return 0;

	/*
	 * isxdigit() is false for the terminator, so lo is never read
	 * past the end of the string thanks to the short-circuit.
	 */
	hi = (unsigned char)p->uri[1];
	if (!isxdigit(hi) || !isxdigit((unsigned char)p->uri[2])) {
		p->err = "illegal percent-encoding";
		return 0;
	}
	lo = (unsigned char)p->uri[2];

	byte = (pct_hex_value(hi) << 4) | pct_hex_value(lo);
	if (byte == 0) {
		p->err = "illegal percent-encoding";
		return 0;
	}

	*p->uri = (char)byte;
	/* +1: move the terminator too */
	memmove(p->uri+1, p->uri+3, strlen(p->uri+3)+1);

	return 1;
}
/*
 * ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) "://"
 *
 * Parse the scheme and the "//" marker that must follow it.  The ':'
 * is overwritten with NUL so that p->parsed->schema is a proper
 * string, and on success the cursor is left right after the "//".
 *
 * Return 1 on success, 0 (with p->err set) otherwise.
 */
static int
parse_scheme(struct parser *p)
{
	p->parsed->schema = p->uri;

	/* the <ctype.h> functions want an unsigned char value */
	if (!isalpha((unsigned char)*p->uri)) {
		p->err = "illegal character in scheme";
		return 0;
	}

	do {
		p->uri++;
	} while (isalnum((unsigned char)*p->uri)
	    || *p->uri == '+'
	    || *p->uri == '-'
	    || *p->uri == '.');

	if (*p->uri != ':') {
		p->err = "illegal character in scheme";
		return 0;
	}
	*p->uri = '\0';

	/* the second byte is looked at only if the first is a slash */
	if (p->uri[1] != '/' || p->uri[2] != '/') {
		p->err = "invalid marker after scheme";
		return 0;
	}
	p->uri += 3;

	return 1;
}
/*
 * port = *DIGIT
 *
 * Parse the (possibly empty) port number; the cursor is expected to be
 * right after the ':' of the authority.  The textual port is saved in
 * p->parsed->port and its value, which must fit 16 bits, in
 * p->parsed->port_no.
 *
 * The port ends at a '/', at a '?', at a '#' or at the end of the
 * string.  A '/' is overwritten with NUL and skipped.  A '?' or a '#'
 * is left under the cursor: parse_path() sees an empty path followed
 * by that delimiter and overwrites it with NUL, which terminates the
 * port string as well.
 *
 * Return 1 on success, 0 (with p->err set) otherwise.
 */
static int
parse_port(struct parser *p)
{
	uint32_t n = 0;

	p->parsed->port = p->uri;

	/* the <ctype.h> functions want an unsigned char value */
	for (; isdigit((unsigned char)*p->uri); p->uri++) {
		n = n * 10 + (uint32_t)(*p->uri - '0');
		/* checked at every digit, so n can't overflow */
		if (n > UINT16_MAX) {
			p->err = "port number too large";
			return 0;
		}
	}

	switch (*p->uri) {
	case '/':
		*p->uri = '\0';
		p->uri++;
		break;
	case '\0':
	case '?':
	case '#':
		break;
	default:
		p->err = "illegal character in port number";
		return 0;
	}

	p->parsed->port_no = (uint16_t)n;
	return 1;
}
/* TODO: add support for ip-literal and ipv4addr ? */
/*
 * reg-name = *( unreserved / sub-delims / pct-encoded )
 *
 * Parse the host and, if present, the port.  The userinfo part is not
 * supported: a '@' is reported as an illegal character.
 *
 * The authority ends at a '/', at a '?', at a '#' or at the end of the
 * string.  A '/' is overwritten with NUL and skipped.  A '?' or a '#'
 * is left under the cursor: parse_path() sees an empty path followed
 * by that delimiter and overwrites it with NUL, which terminates the
 * host string as well.
 *
 * Return 1 on success, 0 (with p->err set) otherwise.
 */
static int
parse_authority(struct parser *p)
{
	p->parsed->host = p->uri;

	while (UNRESERVED(*p->uri)
	    || SUB_DELIMITERS(*p->uri)
	    || parse_pct_encoded(p))
		p->uri++;

	/*
	 * The loop was stopped by a malformed percent-encoding: keep
	 * the message set by parse_pct_encoded.
	 */
	if (p->err != NULL)
		return 0;

	switch (*p->uri) {
	case ':':
		*p->uri = '\0';
		p->uri++;
		return parse_port(p);
	case '/':
		*p->uri = '\0';
		p->uri++;
		return 1;
	case '\0':
	case '?':
	case '#':
		return 1;
	default:
		break;
	}

	p->err = "illegal character in authority section";
	return 0;
}
/*
 * Clean up path in place, with an algorithm similar to the one
 * implemented in go' path.Clean:
 *
 * 1. Replace multiple slashes with a single slash
 * 2. Eliminate each . path name element
 * 3. Eliminate each .. path name element along with the non-..
 *    element that precedes it; it's an error if there's nothing left
 *    to eliminate, i.e. if the path would escape the root (go would
 *    only discard the ..)
 *
 * Unlike path.Clean:
 *
 * - leading slashes are dropped, so the result is always a relative
 *   path and it's the empty string if the original path is equivalent
 *   to "/";
 * - a trailing slash is preserved ("foo/" stays "foo/").
 *
 * The path is walked one element at a time: "." and ".." are special
 * only when they are a whole element, so names that merely contain
 * dots ("foo..bar", "a.", "...") are left alone.  The cleaned path is
 * never longer than the original, hence it's built on top of it.
 *
 * Return 1 on success, 0 if the path would escape the root (the
 * content of path is unspecified in that case).
 */
static int
path_clean(char *path)
{
	char	*src = path;	/* next byte to read */
	char	*dst = path;	/* end of the cleaned path; dst <= src */
	size_t	 len;
	int	 slash;

	while (*src != '\0') {
		/*
		 * Skip the separators: the one that followed an
		 * element has already been emitted along with it, so
		 * this collapses the runs of slashes and drops the
		 * leading ones.
		 */
		if (*src == '/') {
			src++;
			continue;
		}

		len = strcspn(src, "/");
		slash = src[len] == '/';

		if (len == 1 && src[0] == '.') {
			/* nothing to emit */
		} else if (len == 2 && src[0] == '.' && src[1] == '.') {
			if (dst == path)
				return 0;

			/*
			 * Every element emitted so far is followed by
			 * its slash: step over it and then back to the
			 * beginning of the element.
			 */
			dst--;
			while (dst != path && dst[-1] != '/')
				dst--;
		} else {
			memmove(dst, src, len);
			dst += len;
			if (slash)
				*dst++ = '/';
		}

		src += len;
	}

	*dst = '\0';
	return 1;
}
/*
 * query = *( pchar / "/" / "?" )
 * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
 *
 * Parse the query; the cursor is expected to be right after the '?'.
 * The percent-encoded sequences are decoded in place.  The query ends
 * at the '#', which is overwritten with NUL and skipped, or at the end
 * of the string.
 *
 * Return 1 on success, 0 (with p->err set) otherwise.
 */
static int
parse_query(struct parser *p)
{
	p->parsed->query = p->uri;
	if (*p->uri == '\0')
		return 1;

	while (UNRESERVED(*p->uri)
	    || SUB_DELIMITERS(*p->uri)
	    || *p->uri == ':'
	    || *p->uri == '@'
	    || *p->uri == '/'
	    || *p->uri == '?'
	    || parse_pct_encoded(p))
		p->uri++;

	/*
	 * The loop was stopped by a malformed percent-encoding: keep
	 * the message set by parse_pct_encoded.
	 */
	if (p->err != NULL)
		return 0;

	if (*p->uri != '\0' && *p->uri != '#') {
		p->err = "illegal character in query";
		return 0;
	}

	if (*p->uri == '#') {
		*p->uri = '\0';
		p->uri++;
	}

	return 1;
}
/* don't even bother */
static int
parse_fragment(struct parser *p)
{
p->parsed->fragment = p->uri;
return 1;
}
/*
 * path  = *( pchar / "/" )
 * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
 *
 * This is more relaxed than the rules of the RFC, since no structure
 * is imposed on the segments.
 *
 * Parse the path and then, if present, the query and the fragment.
 * The percent-encoded sequences are decoded in place and, as last
 * step, the decoded path is normalized with path_clean.  The '?' or
 * '#' that ends the path is overwritten with NUL.
 *
 * Return 1 on success, 0 (with p->err set) otherwise.
 */
static int
parse_path(struct parser *p)
{
	char delim;

	p->parsed->path = p->uri;
	if (*p->uri == '\0') {
		p->parsed->query = p->parsed->fragment = p->uri;
		return 1;
	}

	while (UNRESERVED(*p->uri)
	    || SUB_DELIMITERS(*p->uri)
	    || *p->uri == ':'
	    || *p->uri == '@'
	    || *p->uri == '/'
	    || parse_pct_encoded(p))
		p->uri++;

	/*
	 * The loop was stopped by a malformed percent-encoding: keep
	 * the message set by parse_pct_encoded.
	 */
	if (p->err != NULL)
		return 0;

	if (*p->uri != '\0' && *p->uri != '?' && *p->uri != '#') {
		p->err = "illegal character in path";
		return 0;
	}

	if (*p->uri != '\0') {
		delim = *p->uri;
		*p->uri = '\0';
		p->uri++;

		/* a '#' right after the path means there's no query */
		if (delim == '?' && !parse_query(p))
			return 0;
		if (!parse_fragment(p))
			return 0;
	}

	if (!path_clean(p->parsed->path)) {
		p->err = "illegal path";
		return 0;
	}

	return 1;
}
int
parse_uri(char *uri, struct uri *ret, const char **err_ret)
{
char *end;
struct parser p = {uri, ret, NULL};
bzero(ret, sizeof(*ret));
/* initialize optional stuff to the empty string */
end = uri + strlen(uri);
p.parsed->port = end;
p.parsed->path = end;
p.parsed->query = end;
p.parsed->fragment = end;
if (!parse_scheme(&p) || !parse_authority(&p) || !parse_path(&p)) {
*err_ret = p.err;
return 0;
}
*err_ret = NULL;
return 1;
}
/*
 * Terminate uri at its first CRLF.  Return 1 if a CRLF was found, 0
 * (leaving uri untouched) otherwise.  Whatever follows the CRLF is
 * ignored.
 */
int
trim_req_uri(char *uri)
{
	char *crlf = strstr(uri, "\r\n");

	if (crlf != NULL)
		*crlf = '\0';
	return crlf != NULL;
}

184
uri_test.c Normal file
View File

@ -0,0 +1,184 @@
/*
* Copyright (c) 2020 Omar Polo <op@omarpolo.com>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <err.h>
#include <stdio.h>
#include <string.h>
#include "gmid.h"
/*
 * Run one test case and exit with an error, reporting the location of
 * the test, if it doesn't behave as expected.
 *
 * This is on purpose an if/else and not the usual do { } while (0):
 * an invocation must compile even if it isn't followed by a semicolon.
 * The empty branch is there so that an `else' following the macro
 * can't bind to its `if'.
 */
#define TEST(uri, fail, exp, descr)					\
	if (run_test((uri), (fail), (exp))) {				\
	} else {							\
		fprintf(stderr, "%s:%d: error: %s\n",			\
		    __FILE__, __LINE__, (descr));			\
		exit(1);						\
	}

/* Build a struct uri (port_no is always 0) out of its textual fields. */
#define URI(schema, host, port, path, query, frag)			\
	((struct uri){(schema), (host), (port), 0, (path), (query), (frag)})

/*
 * Compare the string `field' of the two structures pointed by wanted
 * and got.  If they differ, or if either is NULL, print both and make
 * the ENCLOSING FUNCTION return 0.  NULL is printed as "(null)" instead
 * of being handed to %s.
 */
#define DIFF(wanted, got, field)					\
	do {								\
		const char *diff_wanted_ = (wanted)->field;		\
		const char *diff_got_ = (got)->field;			\
		if (diff_wanted_ == NULL || diff_got_ == NULL ||	\
		    strcmp(diff_wanted_, diff_got_)) {			\
			fprintf(stderr,					\
			    #field ":\n\tgot: %s\n\twanted: %s\n",	\
			    diff_got_ ? diff_got_ : "(null)",		\
			    diff_wanted_ ? diff_wanted_ : "(null)");	\
			return 0;					\
		}							\
	} while (0)

/* values for the `fail' argument of TEST */
#define PASS 0
#define FAIL 1
/*
 * Compare the textual fields of two struct uri; port_no is not
 * compared.  Return 1 if they are all equal.  At the first field that
 * differs (or that is NULL in either structure) DIFF prints it and
 * makes this function return 0.
 *
 * p is passed as the "wanted" argument of DIFF and exp as the "got"
 * one: run_test calls this function as diff_uri(&expected, &parsed),
 * so despite the names p is the expected URI and exp the parsed one.
 */
int
diff_uri(struct uri *p, struct uri *exp)
{
DIFF(p, exp, schema);
DIFF(p, exp, host);
DIFF(p, exp, port);
DIFF(p, exp, path);
DIFF(p, exp, query);
DIFF(p, exp, fragment);
return 1;
}
/*
 * Parse a private copy of uri and check the outcome.
 *
 * Return 1 if the parsing failed and should_fail is set, or if it
 * succeeded, should_fail is unset and the result matches expected.
 * Return 0 in every other case.  The URI under test, and the parser's
 * error for an unexpected failure, are printed to stderr.
 */
int
run_test(const char *uri, int should_fail, struct uri expected)
{
	struct uri	 parsed;
	const char	*error;
	char		*uri_copy;
	int		 failed, ok;

	/* parse_uri scribbles over its input */
	uri_copy = strdup(uri);
	if (uri_copy == NULL)
		err(1, "strdup");

	fprintf(stderr, "=> %s\n", uri);
	failed = !parse_uri(uri_copy, &parsed, &error);

	if (failed && should_fail) {
		ok = 1;
	} else {
		if (error != NULL)
			fprintf(stderr, "> %s\n", error);
		ok = !failed && !should_fail && diff_uri(&expected, &parsed);
	}

	/* parsed points inside uri_copy: don't use it past this point */
	free(uri_copy);
	return ok;
}
/*
 * Run the test suite.  TEST exits with status 1 at the first failure,
 * so 0 is returned only if every test passes.
 */
int
main(void)
{
	/* placeholder for the tests that are expected to fail */
	struct uri empty = {"", "", "", 0, "", "", ""};

	TEST("http://omarpolo.com",
	    PASS,
	    URI("http", "omarpolo.com", "", "", "", ""),
	    "can parse uri with empty path");

	/* schema */
	TEST("omarpolo.com", FAIL, empty, "FAIL when the schema is missing");
	TEST("gemini:/omarpolo.com", FAIL, empty, "FAIL with invalid marker");
	TEST("gemini//omarpolo.com", FAIL, empty, "FAIL with invalid marker");
	TEST("h!!p://omarpolo.com", FAIL, empty, "FAIL with invalid schema");

	/* authority */
	TEST("gemini://omarpolo.com",
	    PASS,
	    URI("gemini", "omarpolo.com", "", "", "", ""),
	    "can parse authority with empty path");
	TEST("gemini://omarpolo.com/",
	    PASS,
	    URI("gemini", "omarpolo.com", "", "", "", ""),
	    "can parse authority with empty path (alt)");
	TEST("gemini://omarpolo.com:1965",
	    PASS,
	    URI("gemini", "omarpolo.com", "1965", "", "", ""),
	    "can parse with port and empty path");
	TEST("gemini://omarpolo.com:1965/",
	    PASS,
	    URI("gemini", "omarpolo.com", "1965", "", "", ""),
	    "can parse with port and empty path");
	TEST("gemini://omarpolo.com:196s",
	    FAIL,
	    empty,
	    "FAIL with invalid port number");

	/* path */
	TEST("gemini://omarpolo.com/foo/bar/baz",
	    PASS,
	    URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""),
	    "parse simple paths");
	TEST("gemini://omarpolo.com/foo//bar///baz",
	    PASS,
	    URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""),
	    "parse paths with multiple slashes");
	TEST("gemini://omarpolo.com/foo/./bar/./././baz",
	    PASS,
	    URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""),
	    "parse paths with . elements");
	TEST("gemini://omarpolo.com/foo/bar/../bar/baz",
	    PASS,
	    URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""),
	    "parse paths with .. elements");
	TEST("gemini://omarpolo.com/foo/../foo/bar/../bar/baz/../baz",
	    PASS,
	    URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""),
	    "parse paths with multiple .. elements");
	TEST("gemini://omarpolo.com/foo/..",
	    PASS,
	    URI("gemini", "omarpolo.com", "", "", "", ""),
	    "parse paths with a trailing ..");
	TEST("gemini://omarpolo.com/foo/../",
	    PASS,
	    URI("gemini", "omarpolo.com", "", "", "", ""),
	    "parse paths with a trailing ..");
	TEST("gemini://omarpolo.com/foo/../..",
	    FAIL,
	    empty,
	    "reject paths that would escape the root");

	/* query */
	TEST("foo://example.com/foo/?gne",
	    PASS,
	    URI("foo", "example.com", "", "foo/", "gne", ""),
	    "parse query strings");
	TEST("foo://example.com/foo/?gne&foo",
	    PASS,
	    URI("foo", "example.com", "", "foo/", "gne&foo", ""),
	    "parse query strings");
	TEST("foo://example.com/foo/?gne%2F",
	    PASS,
	    URI("foo", "example.com", "", "foo/", "gne/", ""),
	    "parse query strings");

	/* fragment */
	TEST("foo://bar.co/#foo",
	    PASS,
	    URI("foo", "bar.co", "", "", "", "foo"),
	    "can recognize fragments");

	/* percent encoding */
	TEST("foo://bar.com/caf%C3%A8.gmi",
	    PASS,
	    URI("foo", "bar.com", "", "cafè.gmi", "", ""),
	    "can decode");

	return 0;
}