summaryrefslogtreecommitdiff
path: root/ext/json/utf8_decode.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/json/utf8_decode.c')
-rw-r--r--ext/json/utf8_decode.c179
1 files changed, 179 insertions, 0 deletions
diff --git a/ext/json/utf8_decode.c b/ext/json/utf8_decode.c
new file mode 100644
index 0000000..2d0422b
--- /dev/null
+++ b/ext/json/utf8_decode.c
@@ -0,0 +1,179 @@
+/* utf8_decode.c */
+
+/* 2005-12-25 */
+
+/*
+Copyright (c) 2005 JSON.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+The Software shall be used for Good, not Evil.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "utf8_decode.h"
+
+/*
+ Very Strict UTF-8 Decoder
+
+ UTF-8 is a multibyte character encoding of Unicode. A character can be
+ represented by 1-4 bytes. The bit pattern of the first byte indicates the
+ number of continuation bytes.
+
+ Most UTF-8 decoders tend to be lenient, attempting to recover as much
+ information as possible, even from badly encoded input. This UTF-8
+ decoder is not lenient. It will reject input which does not include
+ proper continuation bytes. It will reject aliases (or suboptimal
+ codings). It will reject surrogates. (Surrogate encoding should only be
+ used with UTF-16.)
+
+ Code Contination Minimum Maximum
+ 0xxxxxxx 0 0 127
+ 10xxxxxx error
+ 110xxxxx 1 128 2047
+ 1110xxxx 2 2048 65535 excluding 55296 - 57343
+ 11110xxx 3 65536 1114111
+ 11111xxx error
+*/
+
+
+/*
+ Get the next byte. It returns UTF8_END if there are no more bytes.
+*/
+static int
+get(json_utf8_decode *utf8)
+{
+ int c;
+ if (utf8->the_index >= utf8->the_length) {
+ return UTF8_END;
+ }
+ c = utf8->the_input[utf8->the_index] & 0xFF;
+ utf8->the_index += 1;
+ return c;
+}
+
+
+/*
+ Get the 6-bit payload of the next continuation byte.
+ Return UTF8_ERROR if it is not a contination byte.
+*/
+static int
+cont(json_utf8_decode *utf8)
+{
+ int c = get(utf8);
+ return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR;
+}
+
+
+/*
+ Initialize the UTF-8 decoder. The decoder is not reentrant,
+*/
+void
+utf8_decode_init(json_utf8_decode *utf8, char p[], int length)
+{
+ utf8->the_index = 0;
+ utf8->the_input = p;
+ utf8->the_length = length;
+ utf8->the_char = 0;
+ utf8->the_byte = 0;
+}
+
+
+/*
+ Get the current byte offset. This is generally used in error reporting.
+*/
+int
+utf8_decode_at_byte(json_utf8_decode *utf8)
+{
+ return utf8->the_byte;
+}
+
+
+/*
+ Get the current character offset. This is generally used in error reporting.
+ The character offset matches the byte offset if the text is strictly ASCII.
+*/
+int
+utf8_decode_at_character(json_utf8_decode *utf8)
+{
+ return utf8->the_char > 0 ? utf8->the_char - 1 : 0;
+}
+
+
+/*
+ Extract the next character.
+ Returns: the character (between 0 and 1114111)
+ or UTF8_END (the end)
+ or UTF8_ERROR (error)
+*/
+int
+utf8_decode_next(json_utf8_decode *utf8)
+{
+ int c; /* the first byte of the character */
+ int r; /* the result */
+
+ if (utf8->the_index >= utf8->the_length) {
+ return utf8->the_index == utf8->the_length ? UTF8_END : UTF8_ERROR;
+ }
+ utf8->the_byte = utf8->the_index;
+ utf8->the_char += 1;
+ c = get(utf8);
+/*
+ Zero continuation (0 to 127)
+*/
+ if ((c & 0x80) == 0) {
+ return c;
+ }
+/*
+ One contination (128 to 2047)
+*/
+ if ((c & 0xE0) == 0xC0) {
+ int c1 = cont(utf8);
+ if (c1 < 0) {
+ return UTF8_ERROR;
+ }
+ r = ((c & 0x1F) << 6) | c1;
+ return r >= 128 ? r : UTF8_ERROR;
+ }
+/*
+ Two continuation (2048 to 55295 and 57344 to 65535)
+*/
+ if ((c & 0xF0) == 0xE0) {
+ int c1 = cont(utf8);
+ int c2 = cont(utf8);
+ if (c1 < 0 || c2 < 0) {
+ return UTF8_ERROR;
+ }
+ r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
+ return r >= 2048 && (r < 55296 || r > 57343) ? r : UTF8_ERROR;
+ }
+/*
+ Three continuation (65536 to 1114111)
+*/
+ if ((c & 0xF8) == 0xF0) {
+ int c1 = cont(utf8);
+ int c2 = cont(utf8);
+ int c3 = cont(utf8);
+ if (c1 < 0 || c2 < 0 || c3 < 0) {
+ return UTF8_ERROR;
+ }
+ r = ((c & 0x0F) << 18) | (c1 << 12) | (c2 << 6) | c3;
+ return r >= 65536 && r <= 1114111 ? r : UTF8_ERROR;
+ }
+ return UTF8_ERROR;
+}