IRI support

This extends the URI parser so it supports full IRI (Internationalized
Resource Identifiers, RFC3987).  Some areas of it can/may be improved,
but here's a start.

Note: we assume UTF-8 encoded IRI.
This commit is contained in:
Omar Polo 2020-12-26 00:33:11 +01:00
parent 043acc97b1
commit df6ca41da3
No known key found for this signature in database
GPG Key ID: 35F98C96A1786F0D
4 changed files with 108 additions and 12 deletions

View File

@ -20,11 +20,8 @@ is a very simple and minimal gemini server that can serve static files
and execute CGI scripts.
**gmid**
will strip any sequence of
*../*
or trailing
*..*
in the requests made by clients and will refuse to follow symlinks.
won't serve files outside the given directory and won't follow
symlinks.
Furthermore, on
OpenBSD,
pledge(2)
@ -35,6 +32,10 @@ are used to ensure that
doesn't do anything else than read files from the given directory,
accept network connections and, optionally, execute CGI scripts.
**gmid**
fully supports IRIs (Internationalized Resource Identifiers, see
RFC3987).
It should be noted that
**gmid**
is very simple in its implementation, and so it may not be appropriate

11
gmid.1
View File

@ -33,11 +33,8 @@ is a very simple and minimal gemini server that can serve static files
and execute CGI scripts.
.Pp
.Nm
will strip any sequence of
.Pa ../
or trailing
.Pa ..
in the requests made by clients and will refuse to follow symlinks.
won't serve files outside the given directory and won't follow
symlinks.
Furthermore, on
.Ox ,
.Xr pledge 2
@ -48,6 +45,10 @@ are used to ensure that
doesn't do anything else than read files from the given directory,
accept network connections and, optionally, execute CGI scripts.
.Pp
.Nm
fully supports IRIs (Internationalized Resource Identifiers, see
RFC3987).
.Pp
It should be noted that
.Nm
is very simple in its implementation, and so it may not be appropriate

50
uri.c
View File

@ -93,6 +93,8 @@ struct parser {
const char *err;
};
/* True when b is a UTF-8 continuation byte, i.e. has the form 10xxxxxx. */
#define CONT_BYTE(b) (((b) & 0xC0) == 0x80)
/* XXX: these macros will expand multiple times their argument */
#define UNRESERVED(p) \
@ -115,6 +117,48 @@ struct parser {
|| p == ';' \
|| p == '=')
/*
 * Validate the multibyte UTF-8 sequence that starts at p->uri.
 *
 * Returns 1 when the bytes at p->uri form a well-formed 2-, 3- or
 * 4-byte sequence, 0 otherwise.  A sequence is rejected when:
 *  - the leading byte doesn't announce a 2-, 3- or 4-byte sequence;
 *  - one of the expected continuation bytes is missing;
 *  - the code point is encoded with more bytes than needed (overlong);
 *  - the code point is a UTF-16 surrogate (U+D800..U+DFFF);
 *  - the code point is above U+10FFFF.
 *
 * NOTE: on success p->uri is advanced by one byte less than the length
 * of the sequence, because the caller will add one byte after we
 * return.  On failure p->uri is left untouched.
 */
static int
valid_multibyte_utf8(struct parser *p)
{
	const unsigned char *s = (const unsigned char *)p->uri;
	uint32_t c, min;
	int i, extra;

	/*
	 * From the leading byte deduce how many continuation bytes
	 * follow, the payload bits it carries and the smallest code
	 * point that legitimately needs a sequence of that length.
	 */
	if ((s[0] & 0xE0) == 0xC0) {
		extra = 1;
		min = 0x80;
		c = s[0] & 0x1F;
	} else if ((s[0] & 0xF0) == 0xE0) {
		extra = 2;
		min = 0x800;
		c = s[0] & 0x0F;
	} else if ((s[0] & 0xF8) == 0xF0) {
		extra = 3;
		min = 0x10000;
		c = s[0] & 0x07;
	} else
		return 0;

	for (i = 1; i <= extra; ++i) {
		/*
		 * A NUL byte is not a continuation byte, so a truncated
		 * sequence makes us bail out here before looking at the
		 * bytes after it.
		 */
		if (!CONT_BYTE(s[i]))
			return 0;
		c = (c << 6) | (s[i] & 0x3F);
	}

	/* overlong encoding: a shorter sequence would have been enough */
	if (c < min)
		return 0;

	/* surrogates are reserved for UTF-16 and aren't valid in UTF-8 */
	if (0xD800 <= c && c <= 0xDFFF)
		return 0;

	/* past the end of the Unicode code space */
	if (c > 0x10FFFF)
		return 0;

	p->uri += extra;
	return 1;
}
static int
parse_pct_encoded(struct parser *p)
{
@ -308,7 +352,8 @@ parse_query(struct parser *p)
|| SUB_DELIMITERS(*p->uri)
|| *p->uri == '/'
|| *p->uri == '?'
|| parse_pct_encoded(p))
|| parse_pct_encoded(p)
|| valid_multibyte_utf8(p))
p->uri++;
if (*p->uri != '\0' && *p->uri != '#') {
@ -348,7 +393,8 @@ parse_path(struct parser *p)
while (UNRESERVED(*p->uri)
|| SUB_DELIMITERS(*p->uri)
|| *p->uri == '/'
|| parse_pct_encoded(p))
|| parse_pct_encoded(p)
|| valid_multibyte_utf8(p))
p->uri++;
if (*p->uri != '\0' && *p->uri != '?' && *p->uri != '#') {

View File

@ -87,6 +87,12 @@ main(void)
{
struct uri empty = {"", "", "", PASS, "", "", ""};
TEST("foo://bar.com/foo%00?baz",
FAIL,
empty,
"rejects %00");
return 0;
TEST("http://omarpolo.com",
PASS,
URI("http", "omarpolo.com", "", "", "", ""),
@ -153,6 +159,10 @@ main(void)
FAIL,
empty,
"reject paths that would escape the root");
TEST("gemini://omarpolo.com/foo/../foo/../././/bar/baz/.././.././/",
PASS,
URI("gemini", "omarpolo.com", "", "", "", ""),
"parse path with lots of cleaning available");
/* query */
TEST("foo://example.com/foo/?gne",
@ -179,6 +189,44 @@ main(void)
PASS,
URI("foo", "bar.com", "", "cafè.gmi", "", ""),
"can decode");
TEST("foo://bar.com/caff%C3%A8%20macchiato.gmi",
PASS,
URI("foo", "bar.com", "", "caffè macchiato.gmi", "", ""),
"can decode");
TEST("foo://bar.com/caff%C3%A8+macchiato.gmi",
PASS,
URI("foo", "bar.com", "", "caffè+macchiato.gmi", "", ""),
"can decode");
TEST("foo://bar.com/foo%2F..%2F..",
FAIL,
empty,
"conversion and checking are done in the correct order");
TEST("foo://bar.com/foo%00?baz",
FAIL,
empty,
"rejects %00");
/* IRI */
TEST("foo://bar.com/cafè.gmi",
PASS,
URI("foo", "bar.com", "", "cafè.gmi", "" , ""),
"decode IRI (with a 2-byte utf8 seq)");
TEST("foo://bar.com/世界.gmi",
PASS,
URI("foo", "bar.com", "", "世界.gmi", "" , ""),
"decode IRI");
TEST("foo://bar.com/😼.gmi",
PASS,
URI("foo", "bar.com", "", "😼.gmi", "" , ""),
"decode IRI (with a 3-byte utf8 seq)");
TEST("foo://bar.com/😼/𤭢.gmi",
PASS,
URI("foo", "bar.com", "", "😼/𤭢.gmi", "" , ""),
"decode IRI (with a 3-byte and a 4-byte utf8 seq)");
TEST("foo://bar.com/世界/\xC0\x80",
FAIL,
empty,
"reject invalid sequence (overlong NUL)");
return 0;
}