// Patch summary (JsonStreamingParser.cpp / JsonStreamingParser.h), based only on the hunks below:
//  - reset() additionally zeroes the new members utf8Length and utf8Pos (declared as `char` in the header).
//  - parse() and the private helpers change their character parameters/locals from `char` to
//    `unsigned char` (presumably so comparisons such as `c >= 0xc2` work where plain char is signed).
//  - While inside a string, a lead byte selects a sequence length and switches to the new state
//    STATE_UNESCAPED_UTF8 (defined as 14): 0xc2-0xdf -> 2 bytes, 0xe0-0xef -> 3 bytes,
//    0xf0-0xf4 -> 4 bytes. The lead byte is appended to buffer and utf8Pos is set to 1.
//  - In STATE_UNESCAPED_UTF8 every incoming byte is appended to buffer and utf8Pos is incremented;
//    once utf8Pos == utf8Length the state goes back to STATE_IN_STRING.
//  - The branch whose throw is commented out (the byte is not buffered) now also matches
//    0x80-0xa0 and 0xad, in addition to `c < 0x1f` and `c == 0x7f`.
//
// NOTE(review): items to confirm before merging
//  - STATE_UNESCAPED_UTF8 appends each byte without checking that it lies in 0x80-0xbf, so a
//    truncated sequence appears to consume the following byte(s) (e.g. a closing quote) as payload.
//  - `c < 0x1f` does not match 0x1f itself; it seems to fall through to the final else and be
//    buffered -- `c < 0x20` may have been intended; verify against the full if-chain.
//  - Bytes 0xa1-0xc1 (other than 0xad) and 0xf5-0xff match none of the branches visible in this
//    hunk and appear to reach the final else, which buffers them -- confirm this is intended.
//  - The header context still shows `char unicodeEscapeBuffer[10]` and `char unicodeBuffer[10]`,
//    while getHexArrayAsDecimal / doesCharArrayContain / convertDecimalBufferToInt now take
//    `unsigned char[]`; the call sites are not part of this patch -- verify they still compile.
//  - convertDecimalBufferToInt (unchanged logic) adds `(digit) * 10` for every position with no
//    positional weighting -- looks unintended; verify against its callers.
diff --git a/JsonStreamingParser.cpp b/JsonStreamingParser.cpp index cfc71d1..23fdbae 100644 --- a/JsonStreamingParser.cpp +++ b/JsonStreamingParser.cpp @@ -35,13 +35,15 @@ void JsonStreamingParser::reset() { unicodeEscapeBufferPos = 0; unicodeBufferPos = 0; characterCounter = 0; + utf8Length = 0; + utf8Pos = 0; } void JsonStreamingParser::setListener(JsonListener* listener) { myListener = listener; } -void JsonStreamingParser::parse(char c) { +void JsonStreamingParser::parse(unsigned char c) { //System.out.print(c); // valid whitespace characters in JSON (from RFC4627 for JSON) include: // space, horizontal tab, line feed or new line, and carriage return. @@ -58,13 +60,39 @@ void JsonStreamingParser::parse(char c) { endString(); } else if (c == '\\') { state = STATE_START_ESCAPE; - } else if ((c < 0x1f) || (c == 0x7f)) { + } else if (c >= 0xc2 && c <= 0xdf) { + state = STATE_UNESCAPED_UTF8; + utf8Pos = 1; + utf8Length = 2; + buffer[bufferPos] = c; + increaseBufferPointer(); + } else if (c >= 0xe0 && c <= 0xef) { + state = STATE_UNESCAPED_UTF8; + utf8Pos = 1; + utf8Length = 3; + buffer[bufferPos] = c; + increaseBufferPointer(); + } else if (c >= 0xf0 && c <= 0xf4) { + state = STATE_UNESCAPED_UTF8; + utf8Pos = 1; + utf8Length = 4; + buffer[bufferPos] = c; + increaseBufferPointer(); + } else if ((c < 0x1f) || (c == 0x7f) || (c >= 0x80 && c <= 0xa0) || (c == 0xad)) { //throw new RuntimeException("Unescaped control character encountered: " + c + " at position" + characterCounter); } else { buffer[bufferPos] = c; increaseBufferPointer(); } break; + case STATE_UNESCAPED_UTF8: + utf8Pos++; + buffer[bufferPos] = c; + increaseBufferPointer(); + if (utf8Pos == utf8Length) { + state = STATE_IN_STRING; + } + break; case STATE_IN_ARRAY: if (c == ']') { endArray(); @@ -146,7 +174,7 @@ void JsonStreamingParser::parse(char c) { buffer[bufferPos] = c; increaseBufferPointer(); } else if (c == '+' || c == '-') { - char last = buffer[bufferPos - 1]; + unsigned char last = 
buffer[bufferPos - 1]; if (!(last == 'e' || last == 'E')) { //throw new RuntimeException("Can only have '+' or '-' after the 'e' or 'E' in a number." + characterCounter); } @@ -222,7 +250,7 @@ void JsonStreamingParser::endString() { } bufferPos = 0; } -void JsonStreamingParser::startValue(char c) { +void JsonStreamingParser::startValue(unsigned char c) { if (c == '[') { startArray(); } else if (c == '{') { @@ -249,7 +277,7 @@ void JsonStreamingParser::startValue(char c) { } } -boolean JsonStreamingParser::isDigit(char c) { +boolean JsonStreamingParser::isDigit(unsigned char c) { // Only concerned with the first character in a number. return (c >= '0' && c <= '9') || c == '-'; } @@ -288,7 +316,7 @@ void JsonStreamingParser::endObject() { } } -void JsonStreamingParser::processEscapeCharacters(char c) { +void JsonStreamingParser::processEscapeCharacters(unsigned char c) { if (c == '"') { buffer[bufferPos] = '"'; increaseBufferPointer(); @@ -324,7 +352,7 @@ void JsonStreamingParser::processEscapeCharacters(char c) { } } -void JsonStreamingParser::processUnicodeCharacter(char c) { +void JsonStreamingParser::processUnicodeCharacter(unsigned char c) { if (!isHexCharacter(c)) { // throw new ParsingError($this->_line_number, $this->_char_number, // "Expected hex character for escaped Unicode character. 
Unicode parsed: " @@ -360,14 +388,14 @@ void JsonStreamingParser::processUnicodeCharacter(char c) { }*/ } } -boolean JsonStreamingParser::isHexCharacter(char c) { +boolean JsonStreamingParser::isHexCharacter(unsigned char c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } -int JsonStreamingParser::getHexArrayAsDecimal(char hexArray[], int length) { +int JsonStreamingParser::getHexArrayAsDecimal(unsigned char hexArray[], int length) { int result = 0; for (int i = 0; i < length; i++) { - char current = hexArray[length - i - 1]; + unsigned char current = hexArray[length - i - 1]; int value = 0; if (current >= 'a' && current <= 'f') { value = current - 'a' + 10; @@ -381,7 +409,7 @@ int JsonStreamingParser::getHexArrayAsDecimal(char hexArray[], int length) { return result; } -boolean JsonStreamingParser::doesCharArrayContain(char myArray[], int length, char c) { +boolean JsonStreamingParser::doesCharArrayContain(unsigned char myArray[], int length, unsigned char c) { for (int i = 0; i < length; i++) { if (myArray[i] == c) { return true; @@ -417,10 +445,10 @@ void JsonStreamingParser::endNumber() { state = STATE_AFTER_VALUE; } -int JsonStreamingParser::convertDecimalBufferToInt(char myArray[], int length) { +int JsonStreamingParser::convertDecimalBufferToInt(unsigned char myArray[], int length) { int result = 0; for (int i = 0; i < length; i++) { - char current = myArray[length - i - 1]; + unsigned char current = myArray[length - i - 1]; result += (current - '0') * 10; } return result; @@ -490,7 +518,7 @@ void JsonStreamingParser::startString() { state = STATE_IN_STRING; } -void JsonStreamingParser::startNumber(char c) { +void JsonStreamingParser::startNumber(unsigned char c) { state = STATE_IN_NUMBER; buffer[bufferPos] = c; increaseBufferPointer(); @@ -504,9 +532,9 @@ void JsonStreamingParser::endUnicodeCharacter(int codepoint) { state = STATE_IN_STRING; } -char JsonStreamingParser::convertCodepointToCharacter(int num) { +unsigned 
char JsonStreamingParser::convertCodepointToCharacter(int num) { if (num <= 0x7F) - return (char) (num); + return (unsigned char) (num); // if(num<=0x7FF) return (char)((num>>6)+192) + (char)((num&63)+128); // if(num<=0xFFFF) return // chr((num>>12)+224).chr(((num>>6)&63)+128).chr((num&63)+128); diff --git a/JsonStreamingParser.h b/JsonStreamingParser.h index 377e846..7f14d64 100644 --- a/JsonStreamingParser.h +++ b/JsonStreamingParser.h @@ -43,6 +43,7 @@ See more at http://blog.squix.ch and https://github.com/squix78/json-streaming-p #define STATE_IN_NULL 11 #define STATE_AFTER_VALUE 12 #define STATE_UNICODE_SURROGATE 13 +#define STATE_UNESCAPED_UTF8 14 #define STACK_OBJECT 0 #define STACK_ARRAY 1 @@ -67,6 +68,9 @@ class JsonStreamingParser { char unicodeEscapeBuffer[10]; int unicodeEscapeBufferPos = 0; + + char utf8Length = 0; + char utf8Pos = 0; char unicodeBuffer[10]; int unicodeBufferPos = 0; @@ -81,21 +85,21 @@ class JsonStreamingParser { void endArray(); - void startValue(char c); + void startValue(unsigned char c); void startKey(); - void processEscapeCharacters(char c); + void processEscapeCharacters(unsigned char c); - boolean isDigit(char c); + boolean isDigit(unsigned char c); - boolean isHexCharacter(char c); + boolean isHexCharacter(unsigned char c); - char convertCodepointToCharacter(int num); + unsigned char convertCodepointToCharacter(int num); void endUnicodeCharacter(int codepoint); - void startNumber(char c); + void startNumber(unsigned char c); void startString(); @@ -111,17 +115,17 @@ class JsonStreamingParser { void endDocument(); - int convertDecimalBufferToInt(char myArray[], int length); + int convertDecimalBufferToInt(unsigned char myArray[], int length); void endNumber(); void endUnicodeSurrogateInterstitial(); - boolean doesCharArrayContain(char myArray[], int length, char c); + boolean doesCharArrayContain(unsigned char myArray[], int length, unsigned char c); - int getHexArrayAsDecimal(char hexArray[], int length); + int 
getHexArrayAsDecimal(unsigned char hexArray[], int length); - void processUnicodeCharacter(char c); + void processUnicodeCharacter(unsigned char c); void endObject(); @@ -129,7 +133,7 @@ class JsonStreamingParser { public: JsonStreamingParser(); - void parse(char c); + void parse(unsigned char c); void setListener(JsonListener* listener); void reset(); };