mirror of https://github.com/omar-polo/gmid.git
switch to Bjoern Hoehrmann UTF-8 decoder
It's correct, while my hacked valid_multibyte_utf8 would allow things that aren't technically UTF8.
This commit is contained in:
parent
578ba2d81b
commit
ef04b55160
12
Makefile
12
Makefile
|
@ -6,11 +6,11 @@ LDFLAGS = -ltls
|
|||
|
||||
all: gmid TAGS README.md
|
||||
|
||||
gmid: gmid.o uri.o
|
||||
${CC} gmid.o uri.o -o gmid ${LDFLAGS}
|
||||
gmid: gmid.o uri.o utf8.o
|
||||
${CC} gmid.o uri.o utf8.o -o gmid ${LDFLAGS}
|
||||
|
||||
TAGS: gmid.c uri.c
|
||||
-etags gmid.c uri.c || true
|
||||
TAGS: gmid.c uri.c utf8.c
|
||||
-etags gmid.c uri.c utf8.c || true
|
||||
|
||||
README.md: gmid.1
|
||||
mandoc -Tmarkdown gmid.1 | sed -e '1d' -e '$$d' > README.md
|
||||
|
@ -18,8 +18,8 @@ README.md: gmid.1
|
|||
clean:
|
||||
rm -f *.o gmid
|
||||
|
||||
uri_test: uri_test.o uri.o
|
||||
${CC} uri_test.o uri.o -o uri_test ${LDFLAGS}
|
||||
uri_test: uri_test.o uri.o utf8.o
|
||||
${CC} uri_test.o uri.o utf8.o -o uri_test ${LDFLAGS}
|
||||
|
||||
test: uri_test
|
||||
./uri_test
|
||||
|
|
|
@ -212,6 +212,12 @@ and not
|
|||
*docs/cgi-bin*,
|
||||
since it's relative to the document root.
|
||||
|
||||
# ACKNOWLEDGEMENTS
|
||||
|
||||
**gmid**
|
||||
uses the "Flexible and Economical" UTF-8 decoder written by
|
||||
Bjoern Hoehrmann.
|
||||
|
||||
# CAVEATS
|
||||
|
||||
* it doesn't support virtual hosts: the host part of the request URL is
|
||||
|
|
4
gmid.1
4
gmid.1
|
@ -184,6 +184,10 @@ option is
|
|||
and not
|
||||
.Pa docs/cgi-bin ,
|
||||
since it's relative to the document root.
|
||||
.Sh ACKNOWLEDGEMENTS
|
||||
.Nm
|
||||
uses the "Flexible and Economical" UTF-8 decoder written by
|
||||
.An Bjoern Hoehrmann .
|
||||
.Sh CAVEATS
|
||||
.Bl -bullet
|
||||
.It
|
||||
|
|
9
gmid.h
9
gmid.h
|
@ -117,6 +117,12 @@ struct uri {
|
|||
char *fragment;
|
||||
};
|
||||
|
||||
/*
 * Parsing state handed to the URI/IRI helper routines
 * (e.g. valid_multibyte_utf8).
 */
struct parser {
|
||||
/* cursor into the string being parsed; helpers advance it as they consume bytes */
char *uri;
|
||||
/* NOTE(review): presumably the struct uri being filled in by parse_uri -- confirm */
struct uri *parsed;
|
||||
/* static description of the failure, set by helpers when they reject the input */
const char *err;
|
||||
};
|
||||
|
||||
enum {
|
||||
FILE_EXISTS,
|
||||
FILE_EXECUTABLE,
|
||||
|
@ -151,6 +157,9 @@ void loop(struct tls*, int);
|
|||
|
||||
void usage(const char*);
|
||||
|
||||
/* utf8.c */
|
||||
int valid_multibyte_utf8(struct parser*);
|
||||
|
||||
/* uri.c */
|
||||
int parse_uri(char*, struct uri*, const char**);
|
||||
int trim_req_uri(char*);
|
||||
|
|
50
uri.c
50
uri.c
|
@ -87,14 +87,6 @@
|
|||
*
|
||||
*/
|
||||
|
||||
/* Parsing state shared by the helper routines in this file. */
struct parser {
|
||||
/* cursor into the string being parsed; helpers advance it as they consume bytes */
char *uri;
|
||||
/* NOTE(review): presumably the struct uri being filled in -- confirm against parse_uri */
struct uri *parsed;
|
||||
/* static description of the failure, if any */
const char *err;
|
||||
};
|
||||
|
||||
/* True when b is a UTF-8 continuation byte, i.e. its top two bits are 10. */
#define CONT_BYTE(b) (((b) & 0xC0) == 0x80)
|
||||
|
||||
/* XXX: these macros evaluate their argument multiple times */
|
||||
|
||||
#define UNRESERVED(p) \
|
||||
|
@ -117,48 +109,6 @@ struct parser {
|
|||
|| p == ';' \
|
||||
|| p == '=')
|
||||
|
||||
/* NOTE: the increments are one less what they should be, because the
|
||||
* caller will add one byte after we return. */
|
||||
static int
|
||||
valid_multibyte_utf8(struct parser *p)
|
||||
{
|
||||
uint32_t c;
|
||||
uint8_t s;
|
||||
|
||||
c = 0;
|
||||
s = *p->uri;
|
||||
|
||||
if ((s & 0xE0) == 0xC0) {
|
||||
if (!CONT_BYTE(*(p->uri+1)))
|
||||
return 0;
|
||||
c = ((s & 0x1F) << 6) | (*(p->uri+1) & 0x3F);
|
||||
p->uri += 1;
|
||||
} else if ((s & 0xF0) == 0xE0) {
|
||||
if (!CONT_BYTE(*(p->uri+1)) ||
|
||||
!CONT_BYTE(*(p->uri+2)))
|
||||
return 0;
|
||||
c = (s & 0x0F) << 12
|
||||
| ((*(p->uri+1) & 0x3F) << 6)
|
||||
| ((*(p->uri+2) & 0x3F));
|
||||
p->uri += 2;
|
||||
} else if ((s & 0xF8) == 0xF0) {
|
||||
if (!CONT_BYTE(*(p->uri+1)) ||
|
||||
!CONT_BYTE(*(p->uri+2)) ||
|
||||
!CONT_BYTE(*(p->uri+3)))
|
||||
return 0;
|
||||
c = (s & 0x07) << 18
|
||||
| ((*(p->uri+1) & 0x3F) << 12)
|
||||
| ((*(p->uri+2) & 0x3F) << 6)
|
||||
| ((*(p->uri+3) & 0x3F));
|
||||
p->uri += 3;
|
||||
} else
|
||||
return 0;
|
||||
|
||||
return (((0x080 <= c) && (c <= 0x7FF))
|
||||
|| (((0x800 <= c) && (c <= 0xFFFF)))
|
||||
|| (((0x10000 <= c) && (c <= 0x10FFFF))));
|
||||
}
|
||||
|
||||
static int
|
||||
parse_pct_encoded(struct parser *p)
|
||||
{
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use, copy,
|
||||
* modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
* of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
* SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "gmid.h"
|
||||
|
||||
/* Decoder states: a complete code point was read / the input is invalid. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup table for the decoder.  The first 256 entries map an input
 * byte to its character class; the entries from index 256 onward are
 * the transition table, 16 entries per state, indexed by
 * state * 16 + class.
 */
static const uint8_t utf8d[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Feed one byte to the decoder.
 *
 * state: decoder state, updated in place; start from UTF8_ACCEPT.
 * codep: code point accumulator, updated in place; only meaningful
 *        once the state is back to UTF8_ACCEPT.
 * byte:  the next input byte.
 *
 * Returns the new state: UTF8_ACCEPT when a whole code point has been
 * decoded into *codep, UTF8_REJECT when the input is not valid UTF-8
 * (every transition out of that state leads back to it), and any other
 * value when more bytes are needed.
 */
static inline uint32_t
utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
{
	uint32_t type = utf8d[byte];

	/*
	 * Mid-sequence: shift in the six payload bits of a continuation
	 * byte.  At the start of a sequence: mask off the length marker
	 * of the lead byte, whose width is given by the character class.
	 */
	*codep = (*state != UTF8_ACCEPT) ?
	    (byte & 0x3fu) | (*codep << 6) :
	    (0xff >> type) & (byte);

	*state = utf8d[256 + *state * 16 + type];
	return *state;
}
|
||||
|
||||
/* for the iri parser. Modelled after printCodePoints */
|
||||
int
|
||||
valid_multibyte_utf8(struct parser *p)
|
||||
{
|
||||
uint32_t cp = 0, state = 0;
|
||||
|
||||
for (; *p->uri; p->uri++)
|
||||
if (!utf8_decode(&state, &cp, *p->uri))
|
||||
break;
|
||||
|
||||
/* reject the ASCII range */
|
||||
if (state || cp <= 0x7F) {
|
||||
/* XXX: do some error recovery? */
|
||||
if (state)
|
||||
p->err = "invalid UTF-8 character";
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
Loading…
Reference in New Issue