diff options
Diffstat (limited to 'ext/json/utf8_decode.c')
-rw-r--r-- | ext/json/utf8_decode.c | 179 |
1 files changed, 179 insertions, 0 deletions
diff --git a/ext/json/utf8_decode.c b/ext/json/utf8_decode.c new file mode 100644 index 0000000..2d0422b --- /dev/null +++ b/ext/json/utf8_decode.c @@ -0,0 +1,179 @@ +/* utf8_decode.c */ + +/* 2005-12-25 */ + +/* +Copyright (c) 2005 JSON.org + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +The Software shall be used for Good, not Evil. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "utf8_decode.h" + +/* + Very Strict UTF-8 Decoder + + UTF-8 is a multibyte character encoding of Unicode. A character can be + represented by 1-4 bytes. The bit pattern of the first byte indicates the + number of continuation bytes. + + Most UTF-8 decoders tend to be lenient, attempting to recover as much + information as possible, even from badly encoded input. This UTF-8 + decoder is not lenient. It will reject input which does not include + proper continuation bytes. It will reject aliases (or suboptimal + codings). It will reject surrogates. (Surrogate encoding should only be + used with UTF-16.) + + Code Contination Minimum Maximum + 0xxxxxxx 0 0 127 + 10xxxxxx error + 110xxxxx 1 128 2047 + 1110xxxx 2 2048 65535 excluding 55296 - 57343 + 11110xxx 3 65536 1114111 + 11111xxx error +*/ + + +/* + Get the next byte. It returns UTF8_END if there are no more bytes. +*/ +static int +get(json_utf8_decode *utf8) +{ + int c; + if (utf8->the_index >= utf8->the_length) { + return UTF8_END; + } + c = utf8->the_input[utf8->the_index] & 0xFF; + utf8->the_index += 1; + return c; +} + + +/* + Get the 6-bit payload of the next continuation byte. + Return UTF8_ERROR if it is not a contination byte. +*/ +static int +cont(json_utf8_decode *utf8) +{ + int c = get(utf8); + return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR; +} + + +/* + Initialize the UTF-8 decoder. The decoder is not reentrant, +*/ +void +utf8_decode_init(json_utf8_decode *utf8, char p[], int length) +{ + utf8->the_index = 0; + utf8->the_input = p; + utf8->the_length = length; + utf8->the_char = 0; + utf8->the_byte = 0; +} + + +/* + Get the current byte offset. This is generally used in error reporting. +*/ +int +utf8_decode_at_byte(json_utf8_decode *utf8) +{ + return utf8->the_byte; +} + + +/* + Get the current character offset. This is generally used in error reporting. + The character offset matches the byte offset if the text is strictly ASCII. +*/ +int +utf8_decode_at_character(json_utf8_decode *utf8) +{ + return utf8->the_char > 0 ? utf8->the_char - 1 : 0; +} + + +/* + Extract the next character. + Returns: the character (between 0 and 1114111) + or UTF8_END (the end) + or UTF8_ERROR (error) +*/ +int +utf8_decode_next(json_utf8_decode *utf8) +{ + int c; /* the first byte of the character */ + int r; /* the result */ + + if (utf8->the_index >= utf8->the_length) { + return utf8->the_index == utf8->the_length ? UTF8_END : UTF8_ERROR; + } + utf8->the_byte = utf8->the_index; + utf8->the_char += 1; + c = get(utf8); +/* + Zero continuation (0 to 127) +*/ + if ((c & 0x80) == 0) { + return c; + } +/* + One contination (128 to 2047) +*/ + if ((c & 0xE0) == 0xC0) { + int c1 = cont(utf8); + if (c1 < 0) { + return UTF8_ERROR; + } + r = ((c & 0x1F) << 6) | c1; + return r >= 128 ? r : UTF8_ERROR; + } +/* + Two continuation (2048 to 55295 and 57344 to 65535) +*/ + if ((c & 0xF0) == 0xE0) { + int c1 = cont(utf8); + int c2 = cont(utf8); + if (c1 < 0 || c2 < 0) { + return UTF8_ERROR; + } + r = ((c & 0x0F) << 12) | (c1 << 6) | c2; + return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR; + } +/* + Three continuation (65536 to 1114111) +*/ + if ((c & 0xF8) == 0xF0) { + int c1 = cont(utf8); + int c2 = cont(utf8); + int c3 = cont(utf8); + if (c1 < 0 || c2 < 0 || c3 < 0) { + return UTF8_ERROR; + } + r = ((c & 0x0F) << 18) | (c1 << 12) | (c2 << 6) | c3; + return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR; + } + return UTF8_ERROR; +} |