mirror of https://github.com/omar-polo/gmid.git
IRI support
This extends the URI parser so it supports full IRI (Internationalized Resource Identifiers, RFC3987). Some areas of it can/may be improved, but here's a start. Note: we assume UTF-8 encoded IRI.
This commit is contained in:
parent
043acc97b1
commit
df6ca41da3
11
README.md
11
README.md
|
@ -20,11 +20,8 @@ is a very simple and minimal gemini server that can serve static files
|
|||
and execute CGI scripts.
|
||||
|
||||
**gmid**
|
||||
will strip any sequence of
|
||||
*../*
|
||||
or trailing
|
||||
*..*
|
||||
in the requests made by clients and will refuse to follow symlinks.
|
||||
won't serve files outside the given directory and won't follow
|
||||
symlinks.
|
||||
Furthermore, on
|
||||
OpenBSD,
|
||||
pledge(2)
|
||||
|
@ -35,6 +32,10 @@ are used to ensure that
|
|||
doesn't do anything else than read files from the given directory,
|
||||
accept network connections and, optionally, execute CGI scripts.
|
||||
|
||||
**gmid**
|
||||
fully supports IRIs (Internationalized Resource Identifiers, see
|
||||
RFC3987).
|
||||
|
||||
It should be noted that
|
||||
**gmid**
|
||||
is very simple in its implementation, and so it may not be appropriate
|
||||
|
|
11
gmid.1
11
gmid.1
|
@ -33,11 +33,8 @@ is a very simple and minimal gemini server that can serve static files
|
|||
and execute CGI scripts.
|
||||
.Pp
|
||||
.Nm
|
||||
will strip any sequence of
|
||||
.Pa ../
|
||||
or trailing
|
||||
.Pa ..
|
||||
in the requests made by clients and will refuse to follow symlinks.
|
||||
won't serve files outside the given directory and won't follow
|
||||
symlinks.
|
||||
Furthermore, on
|
||||
.Ox ,
|
||||
.Xr pledge 2
|
||||
|
@ -48,6 +45,10 @@ are used to ensure that
|
|||
doesn't do anything else than read files from the given directory,
|
||||
accept network connections and, optionally, execute CGI scripts.
|
||||
.Pp
|
||||
.Nm
|
||||
fully supports IRIs (Internationalized Resource Identifiers, see
|
||||
RFC3987).
|
||||
.Pp
|
||||
It should be noted that
|
||||
.Nm
|
||||
is very simple in its implementation, and so it may not be appropriate
|
||||
|
|
50
uri.c
50
uri.c
|
@ -93,6 +93,8 @@ struct parser {
|
|||
const char *err;
|
||||
};
|
||||
|
||||
/* True when b is a UTF-8 continuation byte, i.e. has the form 10xxxxxx.
 * The argument is parenthesized so that compound expressions bind as
 * expected; it is evaluated exactly once. */
#define CONT_BYTE(b) (((b) & 0xC0) == 0x80)
|
||||
|
||||
/* XXX: these macros evaluate their argument multiple times */
|
||||
|
||||
#define UNRESERVED(p) \
|
||||
|
@ -115,6 +117,48 @@ struct parser {
|
|||
|| p == ';' \
|
||||
|| p == '=')
|
||||
|
||||
/* NOTE: the increment are one less what it should be, because the
|
||||
* caller will add one byte after we return. */
|
||||
static int
|
||||
valid_multibyte_utf8(struct parser *p)
|
||||
{
|
||||
uint32_t c;
|
||||
uint8_t s;
|
||||
|
||||
c = 0;
|
||||
s = *p->uri;
|
||||
|
||||
if ((s & 0xE0) == 0xC0) {
|
||||
if (!CONT_BYTE(*(p->uri+1)))
|
||||
return 0;
|
||||
c = ((s & 0x1F) << 6) | (*(p->uri+1) & 0x3F);
|
||||
p->uri += 1;
|
||||
} else if ((s & 0xF0) == 0xE0) {
|
||||
if (!CONT_BYTE(*(p->uri+1)) ||
|
||||
!CONT_BYTE(*(p->uri+2)))
|
||||
return 0;
|
||||
c = (s & 0x0F) << 12
|
||||
| ((*(p->uri+1) & 0x3F) << 6)
|
||||
| ((*(p->uri+2) & 0x3F));
|
||||
p->uri += 2;
|
||||
} else if ((s & 0xF8) == 0xF0) {
|
||||
if (!CONT_BYTE(*(p->uri+1)) ||
|
||||
!CONT_BYTE(*(p->uri+2)) ||
|
||||
!CONT_BYTE(*(p->uri+3)))
|
||||
return 0;
|
||||
c = (s & 0x07) << 18
|
||||
| ((*(p->uri+1) & 0x3F) << 12)
|
||||
| ((*(p->uri+2) & 0x3F) << 6)
|
||||
| ((*(p->uri+3) & 0x3F));
|
||||
p->uri += 3;
|
||||
} else
|
||||
return 0;
|
||||
|
||||
return (((0x080 <= c) && (c <= 0x7FF))
|
||||
|| (((0x800 <= c) && (c <= 0xFFFF)))
|
||||
|| (((0x10000 <= c) && (c <= 0x10FFFF))));
|
||||
}
|
||||
|
||||
static int
|
||||
parse_pct_encoded(struct parser *p)
|
||||
{
|
||||
|
@ -308,7 +352,8 @@ parse_query(struct parser *p)
|
|||
|| SUB_DELIMITERS(*p->uri)
|
||||
|| *p->uri == '/'
|
||||
|| *p->uri == '?'
|
||||
|| parse_pct_encoded(p))
|
||||
|| parse_pct_encoded(p)
|
||||
|| valid_multibyte_utf8(p))
|
||||
p->uri++;
|
||||
|
||||
if (*p->uri != '\0' && *p->uri != '#') {
|
||||
|
@ -348,7 +393,8 @@ parse_path(struct parser *p)
|
|||
while (UNRESERVED(*p->uri)
|
||||
|| SUB_DELIMITERS(*p->uri)
|
||||
|| *p->uri == '/'
|
||||
|| parse_pct_encoded(p))
|
||||
|| parse_pct_encoded(p)
|
||||
|| valid_multibyte_utf8(p))
|
||||
p->uri++;
|
||||
|
||||
if (*p->uri != '\0' && *p->uri != '?' && *p->uri != '#') {
|
||||
|
|
48
uri_test.c
48
uri_test.c
|
@ -87,6 +87,12 @@ main(void)
|
|||
{
|
||||
struct uri empty = {"", "", "", PASS, "", "", ""};
|
||||
|
||||
TEST("foo://bar.com/foo%00?baz",
|
||||
FAIL,
|
||||
empty,
|
||||
"rejects %00");
|
||||
return 0;
|
||||
|
||||
TEST("http://omarpolo.com",
|
||||
PASS,
|
||||
URI("http", "omarpolo.com", "", "", "", ""),
|
||||
|
@ -153,6 +159,10 @@ main(void)
|
|||
FAIL,
|
||||
empty,
|
||||
"reject paths that would escape the root");
|
||||
TEST("gemini://omarpolo.com/foo/../foo/../././/bar/baz/.././.././/",
|
||||
PASS,
|
||||
URI("gemini", "omarpolo.com", "", "", "", ""),
|
||||
"parse path with lots of cleaning available");
|
||||
|
||||
/* query */
|
||||
TEST("foo://example.com/foo/?gne",
|
||||
|
@ -179,6 +189,44 @@ main(void)
|
|||
PASS,
|
||||
URI("foo", "bar.com", "", "cafè.gmi", "", ""),
|
||||
"can decode");
|
||||
TEST("foo://bar.com/caff%C3%A8%20macchiato.gmi",
|
||||
PASS,
|
||||
URI("foo", "bar.com", "", "caffè macchiato.gmi", "", ""),
|
||||
"can decode");
|
||||
TEST("foo://bar.com/caff%C3%A8+macchiato.gmi",
|
||||
PASS,
|
||||
URI("foo", "bar.com", "", "caffè+macchiato.gmi", "", ""),
|
||||
"can decode");
|
||||
TEST("foo://bar.com/foo%2F..%2F..",
|
||||
FAIL,
|
||||
empty,
|
||||
"conversion and checking are done in the correct order");
|
||||
TEST("foo://bar.com/foo%00?baz",
|
||||
FAIL,
|
||||
empty,
|
||||
"rejects %00");
|
||||
|
||||
/* IRI */
|
||||
TEST("foo://bar.com/cafè.gmi",
|
||||
PASS,
|
||||
URI("foo", "bar.com", "", "cafè.gmi", "" , ""),
|
||||
"decode IRI (with a 2-byte utf8 seq)");
|
||||
TEST("foo://bar.com/世界.gmi",
|
||||
PASS,
|
||||
URI("foo", "bar.com", "", "世界.gmi", "" , ""),
|
||||
"decode IRI");
|
||||
TEST("foo://bar.com/😼.gmi",
|
||||
PASS,
|
||||
URI("foo", "bar.com", "", "😼.gmi", "" , ""),
|
||||
"decode IRI (with a 3-byte utf8 seq)");
|
||||
TEST("foo://bar.com/😼/𤭢.gmi",
|
||||
PASS,
|
||||
URI("foo", "bar.com", "", "😼/𤭢.gmi", "" , ""),
|
||||
"decode IRI (with a 3-byte and a 4-byte utf8 seq)");
|
||||
TEST("foo://bar.com/世界/\xC0\x80",
|
||||
FAIL,
|
||||
empty,
|
||||
"reject invalid sequence (overlong NUL)");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue