Skip to content

Commit

Permalink
Fix #276
Browse files Browse the repository at this point in the history
  • Loading branch information
cowtowncoder committed Jun 26, 2021
1 parent 2ddb1e6 commit d2080f6
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 95 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ public class CBORGenerator extends GeneratorBase
* Let's ensure that we have big enough output buffer because of safety
* margins we need for UTF-8 encoding.
*/
final static int BYTE_BUFFER_FOR_OUTPUT = 16000;
protected final static int BYTE_BUFFER_FOR_OUTPUT = 16000;

/**
* The replacement character to use to fix invalid Unicode sequences
* (mismatched surrogate pair).
*
* @since 2.12
*/
final static int REPLACEMENT_CHAR = 0xfffd;
protected final static int REPLACEMENT_CHAR = 0xfffd;

/**
* Longest char chunk we will output is chosen so that it is guaranteed to
Expand Down Expand Up @@ -80,7 +80,7 @@ public enum Feature implements FormatFeature {
* an exception will be thrown to indicate invalid content.
*<p>
* Default value is {@code false} (for backwards compatibility) meaning that
* an invalide surrogate will result in exception ({@link IllegalArgumentException}
* an invalid surrogate will result in exception ({@link IllegalArgumentException}).
*
* @since 2.12
*/
Expand Down
3 changes: 3 additions & 0 deletions release-notes/VERSION-2.x
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ Modules:
(actual fix in `jackson-databind`)
#272: (cbor) Uncaught exception in CBORParser._nextChunkedByte2 (by ossfuzzer)
(reported by Fabian M)
#276: (smile) Add `SmileGenerator.Feature.LENIENT_UTF_ENCODING` for lenient handling
of broken Unicode surrogate pairs on writing
(requested by kireet@github)
- `Ion-java` dep 1.4.0 -> 1.8.0
- Minor change to Ion module registration names (fully-qualified)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,20 @@ public enum Feature
* this option is disabled by default, and should only be enabled if it is likely that
* same values repeat relatively often.
*/
CHECK_SHARED_STRING_VALUES(false)
CHECK_SHARED_STRING_VALUES(false),

/**
* Feature that determines whether an invalid surrogate encoding found in the
* incoming String is silently output as the Unicode 'REPLACEMENT CHARACTER'
* (U+FFFD) instead of failing: if enabled, the replacement character is written;
* if disabled, an exception will be thrown to indicate invalid content.
*<p>
* Default value is {@code false} (for backwards compatibility) meaning that
* an invalid surrogate will result in exception ({@link IllegalArgumentException}).
*
* @since 2.13
*/
LENIENT_UTF_ENCODING(false),
;

protected final boolean _defaultState;
Expand Down Expand Up @@ -156,6 +169,14 @@ public SharedStringNode(String value, int index, SharedStringNode next)
protected final static long MIN_INT_AS_LONG = (long) Integer.MIN_VALUE;
protected final static long MAX_INT_AS_LONG = (long) Integer.MAX_VALUE;

/**
* The replacement character to use to fix invalid Unicode sequences
* (mismatched surrogate pair).
*
* @since 2.13
*/
protected final static int REPLACEMENT_CHAR = 0xfffd;

/*
/**********************************************************
/* Configuration
Expand Down Expand Up @@ -1929,22 +1950,19 @@ private final int _shortUTF8Encode2(char[] str, int i, int end, int outputPtr)
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate pair
if (c > SURR1_LAST) { // must be from first range; second won't do
_throwIllegalSurrogate(c);
}
// ... meaning it must have a pair
if (i >= end) {
_throwIllegalSurrogate(c);
}
c = _convertSurrogate(c, str[i++]);
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
_throwIllegalSurrogate(c);
// Yup, looks like a surrogate pair... but is it?
if ((c <= SURR1_LAST) && (i < end)) { // must be from first range and have another char
final int d = str[i];
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
++i;
outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
continue;
}
outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
continue;
}
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
// Nah, something wrong
outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
}
int codedLen = outputPtr - _outputTail;
_outputTail = outputPtr;
Expand Down Expand Up @@ -1993,22 +2011,19 @@ private final int _shortUTF8Encode2(String str, int i, int end, int outputPtr)
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate pair
if (c > SURR1_LAST) { // must be from first range; second won't do
_throwIllegalSurrogate(c);
}
// ... meaning it must have a pair
if (i >= end) {
_throwIllegalSurrogate(c);
}
c = _convertSurrogate(c, str.charAt(i++));
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
_throwIllegalSurrogate(c);
// Yup, looks like a surrogate pair... but is it?
if ((c <= SURR1_LAST) && (i < end)) { // must be from first range and have another char
final int d = str.charAt(i);
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
++i;
outputPtr = _decodeAndWriteSurrogate(c, d, outBuf, outputPtr);
continue;
}
outputPtr = _invalidSurrogateEnd(c, d, outBuf, outputPtr);
continue;
}
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
// Nah, something wrong
outputPtr = _invalidSurrogateStart(c, outBuf, outputPtr);
}
int codedLen = outputPtr - _outputTail;
_outputTail = outputPtr;
Expand All @@ -2021,9 +2036,8 @@ private void _mediumUTF8Encode(char[] str, int inputPtr, int inputEnd) throws IO

output_loop:
while (inputPtr < inputEnd) {
/* First, let's ensure we can output at least 4 bytes
* (longest UTF-8 encoded codepoint):
*/
// First, let's ensure we can output at least 4 bytes
// (longest UTF-8 encoded codepoint):
if (_outputTail >= bufferEnd) {
_flushBuffer();
}
Expand Down Expand Up @@ -2064,22 +2078,19 @@ private void _mediumUTF8Encode(char[] str, int inputPtr, int inputEnd) throws IO
_outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate:
if (c > SURR1_LAST) { // must be from first range
_throwIllegalSurrogate(c);
}
// and if so, followed by another from next range
if (inputPtr >= inputEnd) {
_throwIllegalSurrogate(c);
}
c = _convertSurrogate(c, str[inputPtr++]);
if (c > 0x10FFFF) { // illegal, as per RFC 4627
_throwIllegalSurrogate(c);
// Yup, looks like a surrogate pair... but is it?
if ((c <= SURR1_LAST) && (inputPtr < inputEnd)) { // must be from first range and have another char
final int d = str[inputPtr];
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
++inputPtr;
_outputTail = _decodeAndWriteSurrogate(c, d, _outputBuffer, _outputTail);
continue;
}
_outputTail = _invalidSurrogateEnd(c, d, _outputBuffer, _outputTail);
continue;
}
_outputBuffer[_outputTail++] = (byte) (0xf0 | (c >> 18));
_outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 12) & 0x3f));
_outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 6) & 0x3f));
_outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
// Nah, something wrong
_outputTail = _invalidSurrogateStart(c, _outputBuffer, _outputTail);
}
}
}
Expand All @@ -2090,9 +2101,8 @@ private void _mediumUTF8Encode(String str, int inputPtr, int inputEnd) throws IO

output_loop:
while (inputPtr < inputEnd) {
/* First, let's ensure we can output at least 4 bytes
* (longest UTF-8 encoded codepoint):
*/
// First, let's ensure we can output at least 4 bytes
// (longest UTF-8 encoded codepoint):
if (_outputTail >= bufferEnd) {
_flushBuffer();
}
Expand Down Expand Up @@ -2133,63 +2143,88 @@ private void _mediumUTF8Encode(String str, int inputPtr, int inputEnd) throws IO
_outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
continue;
}
// Yup, a surrogate:
if (c > SURR1_LAST) { // must be from first range
_throwIllegalSurrogate(c);
}
// and if so, followed by another from next range
if (inputPtr >= inputEnd) {
_throwIllegalSurrogate(c);
}
c = _convertSurrogate(c, str.charAt(inputPtr++));
if (c > 0x10FFFF) { // illegal, as per RFC 4627
_throwIllegalSurrogate(c);
// Yup, looks like a surrogate pair... but is it?
if ((c <= SURR1_LAST) && (inputPtr < inputEnd)) { // must be from first range and have another char
final int d = str.charAt(inputPtr);
if ((d <= SURR2_LAST) && (d >= SURR2_FIRST)) {
++inputPtr;
_outputTail = _decodeAndWriteSurrogate(c, d, _outputBuffer, _outputTail);
continue;
}
_outputTail = _invalidSurrogateEnd(c, d, _outputBuffer, _outputTail);
continue;
}
_outputBuffer[_outputTail++] = (byte) (0xf0 | (c >> 18));
_outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 12) & 0x3f));
_outputBuffer[_outputTail++] = (byte) (0x80 | ((c >> 6) & 0x3f));
_outputBuffer[_outputTail++] = (byte) (0x80 | (c & 0x3f));
// Nah, something wrong
_outputTail = _invalidSurrogateStart(c, _outputBuffer, _outputTail);
}
}
}

/**
* Method called to calculate UTF codepoint, from a surrogate pair.

/*
/**********************************************************************
/* Internal methods, surrogate pair handling
/**********************************************************************
*/
private int _convertSurrogate(int firstPart, int secondPart) throws IOException

private int _invalidSurrogateStart(int code, byte[] outBuf, int outputPtr)
throws IOException
{
// Ok, then, is the second part valid?
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
String msg = String.format("Broken surrogate pair: first char 0x%04X, second 0x%04X; illegal combination",
firstPart, secondPart);
_reportError(msg);
if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
return _appendReplacementChar(outBuf, outputPtr);
}
// Will be called in two distinct cases: either first character is
// invalid (code range of second part), or first character is valid
// but there is no second part to encode
if (code <= SURR1_LAST) {
// Unmatched first part (closing without second part?)
_reportError(String.format(
"Unmatched surrogate pair, starts with valid high surrogate (0x%04X) but ends without low surrogate",
code));
}
return 0x10000 + ((firstPart - SURR1_FIRST) << 10) + (secondPart - SURR2_FIRST);
_reportError(String.format(
"Invalid surrogate pair, starts with invalid high surrogate (0x%04X), not in valid range [0xD800, 0xDBFF]",
code));
return 0; // never gets here
}

private void _throwIllegalSurrogate(int code) throws IOException
private int _invalidSurrogateEnd(int surr1, int surr2,
byte[] outBuf, int outputPtr)
throws IOException
{
if (code > 0x10FFFF) { // over max?
_reportError(String.format(
"Illegal character point (0x%X) to output; max is 0x10FFFF as per RFC 4627", code));
}
if (code >= SURR1_FIRST) {
if (code <= SURR1_LAST) { // Unmatched first part (closing without second part?)
_reportError(String.format(
"Unmatched first part of surrogate pair (0x%04X)", code));
}
_reportError(String.format(
"Unmatched second part of surrogate pair (0x%04X)", code));
if (isEnabled(Feature.LENIENT_UTF_ENCODING)) {
return _appendReplacementChar(outBuf, outputPtr);
}
// should we ever get this?
_reportError(String.format("Illegal character point (0x%X) to output", code));
_reportError(String.format(
"Invalid surrogate pair, starts with valid high surrogate (0x%04X)"
+" but ends with invalid low surrogate (0x%04X), not in valid range [0xDC00, 0xDFFF]",
surr1, surr2));
return 0; // never gets here
}

/**
 * Writes the three-byte UTF-8 form of {@code REPLACEMENT_CHAR} into
 * {@code outBuf}, beginning at offset {@code outputPtr}.
 * No capacity check is made here; NOTE(review): callers presumably
 * guarantee at least three free bytes -- confirm at call sites.
 *
 * @param outBuf Buffer to write the encoded bytes into
 * @param outputPtr Offset of the first byte to write
 *
 * @return Offset immediately following the last byte written
 */
private int _appendReplacementChar(byte[] outBuf, int outputPtr) {
    // Three-byte layout: 1110xxxx 10xxxxxx 10xxxxxx
    final int leadByte = 0xe0 | (REPLACEMENT_CHAR >> 12);
    final int middleByte = 0x80 | ((REPLACEMENT_CHAR >> 6) & 0x3f);
    final int trailByte = 0x80 | (REPLACEMENT_CHAR & 0x3f);

    outBuf[outputPtr] = (byte) leadByte;
    outBuf[outputPtr + 1] = (byte) middleByte;
    outBuf[outputPtr + 2] = (byte) trailByte;
    return outputPtr + 3;
}

/**
 * Combines a surrogate pair into a single code point and writes it
 * into {@code outBuf} as four UTF-8 bytes, beginning at {@code outputPtr}.
 * Neither surrogate is range-checked here, and no capacity check is made.
 *
 * @param surr1 First (high) surrogate of the pair
 * @param surr2 Second (low) surrogate of the pair
 * @param outBuf Buffer to write the encoded bytes into
 * @param outputPtr Offset of the first byte to write
 *
 * @return Offset immediately following the last byte written
 */
private int _decodeAndWriteSurrogate(int surr1, int surr2,
        byte[] outBuf, int outputPtr)
{
    // High surrogate's offset supplies the upper bits (shifted by 10),
    // low surrogate's offset the lower ones; result is based at 0x10000
    final int upperBits = (surr1 - SURR1_FIRST) << 10;
    final int lowerBits = surr2 - SURR2_FIRST;
    final int codePoint = 0x10000 + upperBits + lowerBits;

    // Four-byte layout: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    outBuf[outputPtr] = (byte) (0xf0 | (codePoint >> 18));
    outBuf[outputPtr + 1] = (byte) (0x80 | ((codePoint >> 12) & 0x3f));
    outBuf[outputPtr + 2] = (byte) (0x80 | ((codePoint >> 6) & 0x3f));
    outBuf[outputPtr + 3] = (byte) (0x80 | (codePoint & 0x3f));
    return outputPtr + 4;
}

/*
/**********************************************************
/* Internal methods, writing bytes
/**********************************************************
*/
*/

private final void _ensureRoomForOutput(int needed) throws IOException
{
Expand Down
Loading

0 comments on commit d2080f6

Please sign in to comment.