summaryrefslogtreecommitdiff
path: root/src/include/mb
diff options
context:
space:
mode:
author Tom Lane <tgl@sss.pgh.pa.us> 2015-05-14 22:27:07 -0400
committer Tom Lane <tgl@sss.pgh.pa.us> 2015-05-14 22:27:12 -0400
commit 7730f48ede0d222e7f750541d3d5f0f74d75d99b (patch)
tree 472b56a394d55b08d31fcbaa1015d2475c788795 /src/include/mb
parent 83e176ec18d2a91dbea1d0d1bd94c38dc47cd77c (diff)
download postgresql-7730f48ede0d222e7f750541d3d5f0f74d75d99b.tar.gz
Teach UtfToLocal/LocalToUtf to support algorithmic encoding conversions.
Until now, these functions have only supported encoding conversions using lookup tables, which is fine as long as there aren't too many code points to convert. However, GB18030 expects all 1.1 million Unicode code points to be convertible, which would require a ridiculously-sized lookup table. Fortunately, a large fraction of those conversions can be expressed through arithmetic, i.e., the conversions are one-to-one in certain defined ranges. To support that, provide a callback function that is used after consulting the lookup tables. (This patch doesn't actually change anything about the GB18030 conversion behavior; it just provides infrastructure for fixing it.) Since this requires changing the APIs of UtfToLocal/LocalToUtf anyway, take the opportunity to rearrange their argument lists into what seems to me a saner order. And beautify the call sites by using lengthof() instead of error-prone sizeof() arithmetic. In passing, also mark all the lookup tables used by these calls "const". This moves an impressive amount of stuff into the text segment, at least on my machine, and is safer anyhow.
Diffstat (limited to 'src/include/mb')
-rw-r--r-- src/include/mb/pg_wchar.h 41
1 file changed, 30 insertions, 11 deletions
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index f7222fc177..f8b0edc678 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -366,9 +366,16 @@ typedef struct
extern const pg_wchar_tbl pg_wchar_table[];
/*
+ * Data structures for conversions between UTF-8 and other encodings
+ * (UtfToLocal() and LocalToUtf()). In these data structures, characters of
+ * either encoding are represented by uint32 words; hence we can only support
+ * characters up to 4 bytes long. For example, the byte sequence 0xC2 0x89
+ * would be represented by 0x0000C289, and 0xE8 0xA2 0xB4 by 0x00E8A2B4.
+ *
+ * Maps are arrays of these structs, which must be in order by the lookup key
+ * (so that bsearch() can be used).
+ *
* UTF-8 to local code conversion map
- * Note that we limit the max length of UTF-8 to 4 bytes,
- * which is UCS-4 00010000-001FFFFF range.
*/
typedef struct
{
@@ -386,7 +393,7 @@ typedef struct
} pg_local_to_utf;
/*
- * UTF-8 to local code conversion map(combined characters)
+ * UTF-8 to local code conversion map (for combined characters)
*/
typedef struct
{
@@ -396,7 +403,7 @@ typedef struct
} pg_utf_to_local_combined;
/*
- * local code to UTF-8 conversion map(combined characters)
+ * local code to UTF-8 conversion map (for combined characters)
*/
typedef struct
{
@@ -406,6 +413,13 @@ typedef struct
} pg_local_to_utf_combined;
/*
+ * callback function for algorithmic encoding conversions (in either direction)
+ *
+ * if function returns zero, it does not know how to convert the code
+ */
+typedef uint32 (*utf_local_conversion_func) (uint32 code);
+
+/*
* Support macro for encoding conversion functions to validate their
* arguments. (This could be made more compact if we included fmgr.h
* here, but we don't want to do that because this header file is also
@@ -494,13 +508,18 @@ extern char *pg_server_to_any(const char *s, int len, int encoding);
extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
-extern void LocalToUtf(const unsigned char *iso, unsigned char *utf,
- const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap,
- int size1, int size2, int encoding, int len);
-
-extern void UtfToLocal(const unsigned char *utf, unsigned char *iso,
- const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
- int size1, int size2, int encoding, int len);
+extern void UtfToLocal(const unsigned char *utf, int len,
+ unsigned char *iso,
+ const pg_utf_to_local *map, int mapsize,
+ const pg_utf_to_local_combined *cmap, int cmapsize,
+ utf_local_conversion_func conv_func,
+ int encoding);
+extern void LocalToUtf(const unsigned char *iso, int len,
+ unsigned char *utf,
+ const pg_local_to_utf *map, int mapsize,
+ const pg_local_to_utf_combined *cmap, int cmapsize,
+ utf_local_conversion_func conv_func,
+ int encoding);
extern bool pg_verifymbstr(const char *mbstr, int len, bool noError);
extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len,