switch to Bjoern Hoehrmann UTF-8 decoder

It's correct, whereas my hand-rolled valid_multibyte_utf8 accepted byte
sequences that aren't technically valid UTF-8.
This commit is contained in:
Omar Polo 2021-01-09 20:32:23 +00:00
parent 578ba2d81b
commit ef04b55160
6 changed files with 104 additions and 56 deletions

View File

@ -6,11 +6,11 @@ LDFLAGS = -ltls
all: gmid TAGS README.md
gmid: gmid.o uri.o
${CC} gmid.o uri.o -o gmid ${LDFLAGS}
gmid: gmid.o uri.o utf8.o
${CC} gmid.o uri.o utf8.o -o gmid ${LDFLAGS}
TAGS: gmid.c uri.c
-etags gmid.c uri.c || true
TAGS: gmid.c uri.c utf8.c
-etags gmid.c uri.c utf8.c || true
README.md: gmid.1
mandoc -Tmarkdown gmid.1 | sed -e '1d' -e '$$d' > README.md
@ -18,8 +18,8 @@ README.md: gmid.1
clean:
rm -f *.o gmid
uri_test: uri_test.o uri.o
${CC} uri_test.o uri.o -o uri_test ${LDFLAGS}
uri_test: uri_test.o uri.o utf8.o
${CC} uri_test.o uri.o utf8.o -o uri_test ${LDFLAGS}
test: uri_test
./uri_test

View File

@ -212,6 +212,12 @@ and not
*docs/cgi-bin*,
since it's relative to the document root.
# ACKNOWLEDGEMENTS
**gmid**
uses the "Flexible and Economical" UTF-8 decoder written by
Bjoern Hoehrmann.
# CAVEATS
* it doesn't support virtual hosts: the host part of the request URL is

4
gmid.1
View File

@ -184,6 +184,10 @@ option is
and not
.Pa docs/cgi-bin ,
since it's relative to the document root.
.Sh ACKNOWLEDGEMENTS
.Nm
uses the "Flexible and Economical" UTF-8 decoder written by
.An Bjoern Hoehrmann .
.Sh CAVEATS
.Bl -bullet
.It

9
gmid.h
View File

@ -117,6 +117,12 @@ struct uri {
char *fragment;
};
/* Parsing state handed to the parser helpers (e.g. valid_multibyte_utf8). */
struct parser {
/* Current position in the string being parsed; helpers advance it. */
char *uri;
/* NOTE(review): presumably the struct uri being filled in -- confirm in uri.c. */
struct uri *parsed;
/* Human-readable error message, set by a helper when it rejects the input. */
const char *err;
};
enum {
FILE_EXISTS,
FILE_EXECUTABLE,
@ -151,6 +157,9 @@ void loop(struct tls*, int);
void usage(const char*);
/* utf8.c */
int valid_multibyte_utf8(struct parser*);
/* uri.c */
int parse_uri(char*, struct uri*, const char**);
int trim_req_uri(char*);

50
uri.c
View File

@ -87,14 +87,6 @@
*
*/
/* Parsing state handed to the parser helpers (e.g. valid_multibyte_utf8). */
struct parser {
/* Current position in the string being parsed; helpers advance it. */
char *uri;
/* NOTE(review): presumably the struct uri being filled in -- confirm against parse_uri. */
struct uri *parsed;
/* Human-readable error message, set by a helper when it rejects the input. */
const char *err;
};
/* True when b has the bit pattern 10xxxxxx, i.e. is a UTF-8 continuation byte.
 * The argument is parenthesized so that expressions such as `a | b` are
 * masked as a whole. */
#define CONT_BYTE(b) (((b) & 0xC0) == 0x80)
/* XXX: these macros evaluate their argument multiple times */
#define UNRESERVED(p) \
@ -117,48 +109,6 @@ struct parser {
|| p == ';' \
|| p == '=')
/*
 * Validate the multibyte UTF-8 sequence that starts at p->uri.
 *
 * Returns 1 when the bytes form a well-formed 2-, 3- or 4-byte sequence
 * that encodes a code point in its shortest form, is not a UTF-16
 * surrogate (U+D800..U+DFFF) and does not exceed U+10FFFF.  Returns 0
 * otherwise (including for ASCII or stray continuation bytes).
 *
 * NOTE: p->uri is advanced by one less than the sequence length, because
 * the caller will add one byte after we return.  The pointer is left
 * untouched when the lead byte or a continuation byte is invalid; it is
 * advanced before the final code point checks are made.
 */
static int
valid_multibyte_utf8(struct parser *p)
{
	const unsigned char *s = (const unsigned char *)p->uri;
	uint32_t c, min;
	int extra, i;

	/* Classify the lead byte: number of continuation bytes, payload
	 * bits, and the smallest code point allowed for that length. */
	if ((s[0] & 0xE0) == 0xC0) {
		extra = 1;
		min = 0x80;
		c = s[0] & 0x1F;
	} else if ((s[0] & 0xF0) == 0xE0) {
		extra = 2;
		min = 0x800;
		c = s[0] & 0x0F;
	} else if ((s[0] & 0xF8) == 0xF0) {
		extra = 3;
		min = 0x10000;
		c = s[0] & 0x07;
	} else
		return 0;

	/* A NUL terminator is not a continuation byte, so this loop
	 * never reads past the end of the string. */
	for (i = 1; i <= extra; i++) {
		if (!CONT_BYTE(s[i]))
			return 0;
		c = (c << 6) | (s[i] & 0x3F);
	}

	p->uri += extra;

	/* Reject overlong encodings, surrogates and out-of-range values. */
	if (c < min || c > 0x10FFFF)
		return 0;
	if (c >= 0xD800 && c <= 0xDFFF)
		return 0;
	return 1;
}
static int
parse_pct_encoded(struct parser *p)
{

79
utf8.c Normal file
View File

@ -0,0 +1,79 @@
/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <stddef.h>
#include <stdint.h>
#include "gmid.h"
/*
 * Decoder states.  UTF8_ACCEPT (0) is the state the decoder starts in and
 * returns to after a complete sequence; once in UTF8_REJECT (1) every
 * transition in the table below leads back to UTF8_REJECT.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1
/*
 * Lookup table used by utf8_decode().
 *
 * The first 256 entries are indexed by the input byte and yield its
 * character class ("type").  The entries after that form the state
 * transition table: the next state is utf8d[256 + state*16 + type].
 */
static const uint8_t utf8d[] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
/*
 * Feed one input byte to the table-driven decoder.
 *
 * state: in/out decoder state; must be UTF8_ACCEPT before the first byte
 *        of a sequence.
 * codep: in/out accumulator for the code point being assembled.
 * byte:  the next input byte.
 *
 * Returns the new state, which is also stored in *state.
 */
static inline uint32_t
utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
{
	const uint32_t kind = utf8d[byte];

	if (*state == UTF8_ACCEPT) {
		/* Lead byte: drop the length-marker bits, keep the payload. */
		*codep = (0xff >> kind) & (byte);
	} else {
		/* Continuation byte: append its low six bits. */
		*codep = (byte & 0x3fu) | (*codep << 6);
	}

	*state = utf8d[256 + *state * 16 + kind];
	return *state;
}
/* for the iri parser. Modelled after printCodePoints */
int
valid_multibyte_utf8(struct parser *p)
{
uint32_t cp = 0, state = 0;
for (; *p->uri; p->uri++)
if (!utf8_decode(&state, &cp, *p->uri))
break;
/* reject the ASCII range */
if (state || cp <= 0x7F) {
/* XXX: do some error recovery? */
if (state)
p->err = "invalid UTF-8 character";
return 0;
}
return 1;
}