diff --git a/contrib/unaccent/expected/unaccent.out b/contrib/unaccent/expected/unaccent.out index f080707c4a..d03374c799 100644 --- a/contrib/unaccent/expected/unaccent.out +++ b/contrib/unaccent/expected/unaccent.out @@ -51,6 +51,18 @@ SELECT unaccent('℗'); -- sound recording copyright (P) (1 row) +SELECT unaccent('1½'); -- math expression with whitespace + unaccent +---------- + 1 1/2 +(1 row) + +SELECT unaccent('〝'); -- quote + unaccent +---------- + " +(1 row) + SELECT unaccent('unaccent', 'foobar'); unaccent ---------- @@ -93,6 +105,18 @@ SELECT unaccent('unaccent', '℗'); (P) (1 row) +SELECT unaccent('unaccent', '1½'); + unaccent +---------- + 1 1/2 +(1 row) + +SELECT unaccent('unaccent', '〝'); + unaccent +---------- + " +(1 row) + SELECT ts_lexize('unaccent', 'foobar'); ts_lexize ----------- @@ -135,6 +159,18 @@ SELECT ts_lexize('unaccent', '℗'); {(P)} (1 row) +SELECT ts_lexize('unaccent', '1½'); + ts_lexize +----------- + {"1 1/2"} +(1 row) + +SELECT ts_lexize('unaccent', '〝'); + ts_lexize +----------- + {"\""} +(1 row) + -- Controversial case. Black-Letter Capital H (U+210C) is translated by -- Latin-ASCII.xml as 'x', but it should be 'H'. SELECT unaccent('ℌ'); diff --git a/contrib/unaccent/generate_unaccent_rules.py b/contrib/unaccent/generate_unaccent_rules.py index b4b4c38beb..cffb7db7ce 100644 --- a/contrib/unaccent/generate_unaccent_rules.py +++ b/contrib/unaccent/generate_unaccent_rules.py @@ -58,6 +58,10 @@ COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA def print_record(codepoint, letter): if letter: + # If the letter has whitespace or double quotes, escape double + # quotes and apply more quotes around it. + if (' ' in letter) or ('"' in letter): + letter = '"' + letter.replace('"', '""') + '"' output = chr(codepoint) + "\t" + letter else: output = chr(codepoint) diff --git a/contrib/unaccent/sql/unaccent.sql b/contrib/unaccent/sql/unaccent.sql index 663646c1ac..70c7f1c0a0 100644 --- a/contrib/unaccent/sql/unaccent.sql +++ b/contrib/unaccent/sql/unaccent.sql @@ -20,6 +20,8 @@ SELECT unaccent('˃˖˗˜'); SELECT unaccent('À'); -- Remove combining diacritical 0x0300 SELECT unaccent('℃℉'); -- degree signs SELECT unaccent('℗'); -- sound recording copyright +SELECT unaccent('1½'); -- math expression with whitespace +SELECT unaccent('〝'); -- quote SELECT unaccent('unaccent', 'foobar'); SELECT unaccent('unaccent', 'ёлка'); @@ -28,6 +30,8 @@ SELECT unaccent('unaccent', '˃˖˗˜'); SELECT unaccent('unaccent', 'À'); SELECT unaccent('unaccent', '℃℉'); SELECT unaccent('unaccent', '℗'); +SELECT unaccent('unaccent', '1½'); +SELECT unaccent('unaccent', '〝'); SELECT ts_lexize('unaccent', 'foobar'); SELECT ts_lexize('unaccent', 'ёлка'); @@ -36,6 +40,8 @@ SELECT ts_lexize('unaccent', '˃˖˗˜'); SELECT ts_lexize('unaccent', 'À'); SELECT ts_lexize('unaccent', '℃℉'); SELECT ts_lexize('unaccent', '℗'); +SELECT ts_lexize('unaccent', '1½'); +SELECT ts_lexize('unaccent', '〝'); -- Controversial case. Black-Letter Capital H (U+210C) is translated by -- Latin-ASCII.xml as 'x', but it should be 'H'. diff --git a/contrib/unaccent/unaccent.c b/contrib/unaccent/unaccent.c index 64c879e547..5635f04214 100644 --- a/contrib/unaccent/unaccent.c +++ b/contrib/unaccent/unaccent.c @@ -127,24 +127,30 @@ initTrie(const char *filename) * src and trg are sequences of one or more non-whitespace * characters, separated by whitespace. Whitespace at start * or end of line is ignored. If trg is omitted, an empty - * string is used as the replacement. + * string is used as the replacement. trg can be optionally + * quoted, in which case whitespaces are included in it. * * We use a simple state machine, with states * 0 initial (before src) * 1 in src * 2 in whitespace after src - * 3 in trg - * 4 in whitespace after trg - * -1 syntax error detected + * 3 in trg (non-quoted) + * 4 in trg (quoted) + * 5 in whitespace after trg + * -1 syntax error detected (two strings) + * -2 syntax error detected (unfinished quoted string) *---------- */ int state; char *ptr; char *src = NULL; char *trg = NULL; + char *trgstore = NULL; int ptrlen; int srclen = 0; int trglen = 0; + int trgstorelen = 0; + bool trgquoted = false; state = 0; for (ptr = line; *ptr; ptr += ptrlen) @@ -156,8 +162,10 @@ initTrie(const char *filename) if (state == 1) state = 2; else if (state == 3) - state = 4; - continue; + state = 5; + /* whitespaces are OK in quoted area */ + if (state != 4) + continue; } switch (state) { @@ -173,14 +181,41 @@ initTrie(const char *filename) break; case 2: /* start of trg */ + if (*ptr == '"') + { + trgquoted = true; + state = 4; + } + else + state = 3; + trg = ptr; trglen = ptrlen; - state = 3; break; case 3: - /* continue trg */ + /* continue non-quoted trg */ trglen += ptrlen; break; + case 4: + /* continue quoted trg */ + trglen += ptrlen; + + /* + * If this is a quote, consider it as the end of + * trg except if the follow-up character is itself + * a quote. + */ + if (*ptr == '"') + { + if (*(ptr + 1) == '"') + { + ptr++; + trglen += 1; + } + else + state = 5; + } + break; default: /* bogus line format */ state = -1; @@ -195,15 +230,46 @@ initTrie(const char *filename) trglen = 0; } + /* If still in a quoted area, fallback to an error */ + if (state == 4) + state = -2; + + /* If trg was quoted, remove its quotes and unescape it */ + if (trgquoted && state > 0) + { + /* Ignore first and end quotes */ + trgstore = palloc0(sizeof(char *) * trglen - 2); + trgstorelen = 0; + for (int i = 1; i < trglen - 1; i++) + { + trgstore[trgstorelen] = trg[i]; + trgstorelen++; + /* skip second double quotes */ + if (trg[i] == '"' && trg[i + 1] == '"') + i++; + } + } + else + { + trgstore = palloc0(sizeof(char *) * trglen); + trgstorelen = trglen; + memcpy(trgstore, trg, trgstorelen); + } + if (state > 0) rootTrie = placeChar(rootTrie, (unsigned char *) src, srclen, - trg, trglen); - else if (state < 0) + trgstore, trgstorelen); + else if (state == -1) ereport(WARNING, (errcode(ERRCODE_CONFIG_FILE_ERROR), errmsg("invalid syntax: more than two strings in unaccent rule"))); + else if (state == -2) + ereport(WARNING, + (errcode(ERRCODE_CONFIG_FILE_ERROR), + errmsg("invalid syntax: unfinished quoted string in unaccent rule"))); + pfree(trgstore); pfree(line); } skip = false; diff --git a/contrib/unaccent/unaccent.rules b/contrib/unaccent/unaccent.rules index 3030166ed6..ca6caa51f5 100644 --- a/contrib/unaccent/unaccent.rules +++ b/contrib/unaccent/unaccent.rules @@ -5,9 +5,9 @@ ® (R) ± +/- » >> -¼ 1/4 -½ 1/2 -¾ 3/4 +¼ " 1/4" +½ " 1/2" +¾ " 3/4" ¿ ? À A Á A @@ -403,7 +403,7 @@ ʪ ls ʫ lz ʹ ' -ʺ " +ʺ """" ʻ ' ʼ ' ʽ ' @@ -1058,15 +1058,15 @@ ’ ' ‚ , ‛ ' -“ " -” " +“ """" +” """" „ ,, -‟ " +‟ """" ․ . ‥ .. … ... ′ ' -″ " +″ """" ‹ < › > ‼ !! @@ -1134,22 +1134,22 @@ ⅇ e ⅈ i ⅉ j -⅐ 1/7 -⅑ 1/9 -⅒ 1/10 -⅓ 1/3 -⅔ 2/3 -⅕ 1/5 -⅖ 2/5 -⅗ 3/5 -⅘ 4/5 -⅙ 1/6 -⅚ 5/6 -⅛ 1/8 -⅜ 3/8 -⅝ 5/8 -⅞ 7/8 -⅟ 1/ +⅐ " 1/7" +⅑ " 1/9" +⅒ " 1/10" +⅓ " 1/3" +⅔ " 2/3" +⅕ " 1/5" +⅖ " 2/5" +⅗ " 3/5" +⅘ " 4/5" +⅙ " 1/6" +⅚ " 5/6" +⅛ " 1/8" +⅜ " 3/8" +⅝ " 5/8" +⅞ " 7/8" +⅟ " 1/" Ⅰ I Ⅱ II Ⅲ III @@ -1182,7 +1182,7 @@ ⅽ c ⅾ d ⅿ m -↉ 0/3 +↉ " 0/3" − - ∕ / ∖ \ @@ -1296,8 +1296,8 @@ 〙 ] 〚 [ 〛 ] -〝 " -〞 " +〝 """" +〞 """" ㍱ hPa ㍲ da ㍳ AU @@ -1512,7 +1512,7 @@ ﹪ % ﹫ @ ! ! -" " +" """" # # $ $ % % diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml index f3ddc64bbc..94100ed260 100644 --- a/doc/src/sgml/unaccent.sgml +++ b/doc/src/sgml/unaccent.sgml @@ -84,6 +84,22 @@ + + + Some characters, like numeric symbols, may require whitespaces in their + translation rule. It is possible to use double quotes around the translated + characters in this case. A double quote needs to be escaped with a second + double quote when including one in the translated character. For example: + +¼ " 1/4" +½ " 1/2" +¾ " 3/4" +“ """" +” """" + + + + As with other PostgreSQL text search configuration files,