nodejs/lib/string_decoder.js

'use strict';

// Throws when an encoding name was supplied but Buffer.isEncoding() does not
// recognise it. A falsy `encoding` means "not specified" and is accepted.
function assertEncoding(encoding) {
  // Nothing to validate when the caller did not pass an encoding.
  if (!encoding) {
    return;
  }
  if (Buffer.isEncoding(encoding)) {
    return;
  }
  throw new Error('Unknown encoding: ' + encoding);
}

// StringDecoder provides an interface for efficiently splitting a series of
// buffers into a series of JS strings without breaking apart multi-byte
// characters. CESU-8 is handled as part of the UTF-8 encoding.
//
// `encoding` is optional and defaults to 'utf8'. The name is lower-cased and
// its first '-' or '_' is removed before dispatch, so 'UTF-8' selects the
// 'utf8' branch and 'utf-16le' the 'utf16le' branch. An encoding rejected by
// assertEncoding() throws `Error('Unknown encoding: ...')`.
//
// utf8, ucs2/utf16le and base64 decoders keep buffering state (charBuffer,
// charReceived, charLength). For every other encoding write() is replaced by
// passThroughWrite and no buffering state is created.
//
// @TODO Handling all encodings inside a single object makes it very difficult
// to reason about this code, so it should be split up in the future.
// @TODO There should be a utf8-strict encoding that rejects invalid UTF-8 code
// points as used by CESU-8.
const StringDecoder = exports.StringDecoder = function(encoding) {
  // Validate before normalizing, so that invalid input is reported by
  // assertEncoding() instead of failing inside toLowerCase() below.
  assertEncoding(encoding);
  this.encoding = (encoding || 'utf8').toLowerCase().replace(/[-_]/, '');
  switch (this.encoding) {
    case 'utf8':
      // CESU-8 represents each of Surrogate Pair by 3-bytes
      this.surrogateSize = 3;
      break;
    case 'ucs2':
    case 'utf16le':
      // UTF-16 represents each of Surrogate Pair by 2-bytes
      this.surrogateSize = 2;
      this.detectIncompleteChar = utf16DetectIncompleteChar;
      break;
    case 'base64':
      // Base-64 stores 3 bytes in 4 chars, and pads the remainder.
      this.surrogateSize = 3;
      this.detectIncompleteChar = base64DetectIncompleteChar;
      break;
    default:
      // No multi-byte handling needed: decode every chunk independently.
      this.write = passThroughWrite;
      return;
  }

  // Enough space to store all bytes of a single character. UTF-8 needs 4
  // bytes, but CESU-8 may require up to 6 (3 bytes per surrogate).
  // Buffer.alloc() is preferred where it exists because `new Buffer(size)` is
  // deprecated in newer Node.js releases; older runtimes keep the constructor.
  this.charBuffer = (typeof Buffer.alloc === 'function') ?
      Buffer.alloc(6) :
      new Buffer(6);
  // Number of bytes received for the current incomplete multi-byte character.
  this.charReceived = 0;
  // Number of bytes expected for the current incomplete multi-byte character.
  this.charLength = 0;
};


// write decodes the given buffer and returns it as JS string that is
// guaranteed to not contain any partial multi-byte characters. Any partial
// character found at the end of the buffer is buffered up, and will be
// returned when calling write again with the remaining bytes.
//
// A lead surrogate (D800-DBFF) at the end of the decoded text is treated as
// an incomplete character too: its bytes are kept in this.charBuffer and it
// is emitted together with its trail surrogate on a later call.
//
// Note: Converting a Buffer containing an orphan surrogate to a String
// currently works, but converting a String to a Buffer (via `new Buffer`, or
// Buffer#write) will replace incomplete surrogates with the unicode
// replacement character. See https://codereview.chromium.org/121173009/ .
StringDecoder.prototype.write = function(buffer) {
  let charStr = '';

  // if our last write ended with an incomplete multibyte character
  while (this.charLength) {
    // determine how many remaining bytes this buffer has to offer for this char
    const missing = this.charLength - this.charReceived;
    const available = Math.min(missing, buffer.length);

    // add the new bytes to the char buffer
    buffer.copy(this.charBuffer, this.charReceived, 0, available);
    this.charReceived += available;

    if (this.charReceived < this.charLength) {
      // still not enough chars in this buffer? wait for more ...
      return '';
    }

    // remove bytes belonging to the current character from the buffer
    buffer = buffer.slice(available, buffer.length);

    // get the character that was split
    charStr = this.charBuffer.slice(0, this.charLength).toString(this.encoding);

    // CESU-8: lead surrogate (D800-DBFF) is also the incomplete character.
    // Keep its bytes buffered and go round again to collect the trail
    // surrogate that must follow.
    const pendingCode = charStr.charCodeAt(charStr.length - 1);
    if (pendingCode >= 0xD800 && pendingCode <= 0xDBFF) {
      this.charLength += this.surrogateSize;
      charStr = '';
      continue;
    }
    this.charReceived = this.charLength = 0;

    // if there are no more bytes in this buffer, just emit our char
    if (buffer.length === 0) {
      return charStr;
    }
    break;
  }

  // determine and set charLength / charReceived
  this.detectIncompleteChar(buffer);

  // Byte offset in `buffer` where the decodable part ends.
  let byteEnd = buffer.length;
  if (this.charLength) {
    // buffer the incomplete character bytes we got
    buffer.copy(this.charBuffer, 0, buffer.length - this.charReceived, byteEnd);
    byteEnd -= this.charReceived;
  }

  charStr += buffer.toString(this.encoding, 0, byteEnd);

  const lastIndex = charStr.length - 1;
  const lastCode = charStr.charCodeAt(lastIndex);
  // CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
  if (lastCode >= 0xD800 && lastCode <= 0xDBFF) {
    const size = this.surrogateSize;
    this.charLength += size;
    this.charReceived += size;
    // Make room at the front: shift any already-buffered trailing bytes
    // behind the slot reserved for the lead surrogate.
    this.charBuffer.copy(this.charBuffer, size, 0, size);
    // The lead surrogate is the last character decoded from `buffer`, so its
    // bytes are the `size` bytes directly before byteEnd (not the first bytes
    // of the buffer).
    buffer.copy(this.charBuffer, 0, byteEnd - size, byteEnd);
    return charStr.substring(0, lastIndex);
  }

  // or just emit the charStr
  return charStr;
};

// detectIncompleteChar determines if there is an incomplete UTF-8 character at
// the end of the given buffer. If so, it sets this.charLength to the byte
// length of that character, and sets this.charReceived to the number of bytes
// that are available for this character. If not, both fields are set to 0.
StringDecoder.prototype.detectIncompleteChar = function(buffer) {
  // determine how many bytes we have to check at the end of this buffer
  let i = Math.min(buffer.length, 3);
  // Total length announced by an unfinished lead byte; 0 when none is found.
  let expected = 0;

  // Figure out if one of the last i bytes of our buffer announces an
  // incomplete char. `i` is the distance of the inspected byte from the end,
  // so a lead byte only counts if it announces more than i bytes.
  for (; i > 0; i--) {
    const c = buffer[buffer.length - i];

    // See http://en.wikipedia.org/wiki/UTF-8#Description

    // 110XXXXX
    if (i === 1 && c >> 5 === 0x06) {
      expected = 2;
      break;
    }

    // 1110XXXX
    if (i <= 2 && c >> 4 === 0x0E) {
      expected = 3;
      break;
    }

    // 11110XXX
    if (i <= 3 && c >> 3 === 0x1E) {
      expected = 4;
      break;
    }
  }
  // Always write both fields so no stale charLength survives, matching the
  // utf16 and base64 detectors.
  this.charLength = expected;
  this.charReceived = i;
};

// end flushes the decoder. An optional final `buffer` is first passed through
// write(); any bytes still buffered for an incomplete character are then
// decoded as-is and appended to the result.
//
// The buffered state is cleared afterwards, so calling end() again does not
// emit the same bytes twice and a later write() starts from a clean state.
// Returns the decoded string ('' when there is nothing to emit).
StringDecoder.prototype.end = function(buffer) {
  let res = '';
  if (buffer && buffer.length) {
    res = this.write(buffer);
  }

  // charReceived is only set on decoders that buffer; pass-through decoders
  // leave it undefined and skip this branch.
  if (this.charReceived) {
    const pending = this.charBuffer.slice(0, this.charReceived);
    res += pending.toString(this.encoding);
    // The partial character has been emitted; forget it.
    this.charReceived = this.charLength = 0;
  }

  return res;
};

// write() implementation installed by the constructor for encodings that get
// no buffering: each chunk is decoded on its own with the decoder's encoding.
function passThroughWrite(buffer) {
  const encoding = this.encoding;
  return buffer.toString(encoding);
}

// detectIncompleteChar variant for ucs2/utf16le: code units are two bytes, so
// an odd-length buffer ends with one byte of an unfinished unit. Sets
// this.charReceived to that leftover count and this.charLength to 2, or both
// to 0 when the length is even.
function utf16DetectIncompleteChar(buffer) {
  const leftover = buffer.length % 2;
  this.charReceived = leftover;
  if (leftover) {
    this.charLength = 2;
  } else {
    this.charLength = 0;
  }
}

// detectIncompleteChar variant for base64: input is consumed in groups of
// three bytes. Sets this.charReceived to the number of bytes left over after
// the last full group and this.charLength to 3, or both to 0 when the length
// is a multiple of three.
function base64DetectIncompleteChar(buffer) {
  const leftover = buffer.length % 3;
  this.charReceived = leftover;
  if (leftover) {
    this.charLength = 3;
  } else {
    this.charLength = 0;
  }
}
lib: turn on strict mode Turn on strict mode for the files in the lib/ directory. It helps catch bugs and can have a positive effect on performance. PR-URL: https://github.com/node-forward/node/pull/64 Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: Fedor Indutny <fedor@indutny.com> 2014-11-22 16:59:48 +01:00			`'use strict';`

Don't allow invalid encodings in StringDecoder class 2012-10-05 02:43:15 +02:00			`function assertEncoding(encoding) {`
			`if (encoding && !Buffer.isEncoding(encoding)) {`
			`throw new Error('Unknown encoding: ' + encoding);`
			`}`
			`}`

string_decoder: Add more comments 2014-05-13 17:42:48 +02:00			`// StringDecoder provides an interface for efficiently splitting a series of`
			`// buffers into a series of JS strings without breaking apart multi-byte`
			`// characters. CESU-8 is handled as part of the UTF-8 encoding.`
			`//`
			`// @TODO Handling all encodings inside a single object makes it very difficult`
			`// to reason about this code, so it should be split up in the future.`
			`// @TODO There should be a utf8-strict encoding that rejects invalid UTF-8 code`
			`// points as used by CESU-8.`
lib: use const to define constants This commit replaces a number of var statements throughout the lib code with const statements. PR-URL: https://github.com/iojs/io.js/pull/541 Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl> 2015-01-21 17:36:59 +01:00			`const StringDecoder = exports.StringDecoder = function(encoding) {`
lint 2010-12-02 05:59:06 +01:00			`this.encoding = (encoding \|\| 'utf8').toLowerCase().replace(/[-_]/, '');`
Don't allow invalid encodings in StringDecoder class 2012-10-05 02:43:15 +02:00			`assertEncoding(encoding);`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`switch (this.encoding) {`
			`case 'utf8':`
			`// CESU-8 represents each of Surrogate Pair by 3-bytes`
			`this.surrogateSize = 3;`
			`break;`
			`case 'ucs2':`
			`case 'utf16le':`
			`// UTF-16 represents each of Surrogate Pair by 2-bytes`
			`this.surrogateSize = 2;`
			`this.detectIncompleteChar = utf16DetectIncompleteChar;`
			`break;`
string_decoder: Add 'end' method, do base64 properly 2012-10-12 00:53:11 +02:00			`case 'base64':`
			`// Base-64 stores 3 bytes in 4 chars, and pads the remainder.`
			`this.surrogateSize = 3;`
			`this.detectIncompleteChar = base64DetectIncompleteChar;`
			`break;`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`default:`
			`this.write = passThroughWrite;`
			`return;`
Refactor: Utf8Decoder -> StringDecoder Instead of just decoding Utf8, this will proxy requests to buffer.toString() for other encodings. This makes for a simpler interface. 2010-06-16 03:19:25 +02:00			`}`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00
string_decoder: Add more comments 2014-05-13 17:42:48 +02:00			`// Enough space to store all bytes of a single character. UTF-8 needs 4`
			`// bytes, but CESU-8 may require up to 6 (3 bytes per surrogate).`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`this.charBuffer = new Buffer(6);`
string_decoder: Add more comments 2014-05-13 17:42:48 +02:00			`// Number of bytes received for the current incomplete multi-byte character.`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`this.charReceived = 0;`
string_decoder: Add more comments 2014-05-13 17:42:48 +02:00			`// Number of bytes expected for the current incomplete multi-byte character.`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`this.charLength = 0;`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00			`};`

Refactor: Utf8Decoder -> StringDecoder Instead of just decoding Utf8, this will proxy requests to buffer.toString() for other encodings. This makes for a simpler interface. 2010-06-16 03:19:25 +02:00
string_decoder: Add more comments 2014-05-13 17:42:48 +02:00			`// write decodes the given buffer and returns it as JS string that is`
			`// guaranteed to not contain any partial multi-byte characters. Any partial`
			`// character found at the end of the buffer is buffered up, and will be`
			`// returned when calling write again with the remaining bytes.`
			`//`
			`// Note: Converting a Buffer containing an orphan surrogate to a String`
			// currently works, but converting a String to a Buffer (via `new Buffer`, or
			`// Buffer#write) will replace incomplete surrogates with the unicode`
			`// replacement character. See https://codereview.chromium.org/121173009/ .`
lint 2010-12-02 05:59:06 +01:00			`StringDecoder.prototype.write = function(buffer) {`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00			`var charStr = '';`
			`// if our last write ended with an incomplete multibyte character`
string_decoder: add support for CESU-8 Fixes #3217. 2012-05-05 05:22:01 +02:00			`while (this.charLength) {`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00			`// determine how many remaining bytes this buffer has to offer for this char`
string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`var available = (buffer.length >= this.charLength - this.charReceived) ?`
lib: jslint string_decoder.js 2014-07-15 10:43:59 +02:00			`this.charLength - this.charReceived :`
			`buffer.length;`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00
			`// add the new bytes to the char buffer`
string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`buffer.copy(this.charBuffer, this.charReceived, 0, available);`
			`this.charReceived += available;`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00
			`if (this.charReceived < this.charLength) {`
			`// still not enough chars in this buffer? wait for more ...`
Refactor: Utf8Decoder -> StringDecoder Instead of just decoding Utf8, this will proxy requests to buffer.toString() for other encodings. This makes for a simpler interface. 2010-06-16 03:19:25 +02:00			`return '';`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00			`}`

string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`// remove bytes belonging to the current character from the buffer`
			`buffer = buffer.slice(available, buffer.length);`

Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00			`// get the character that was split`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`charStr = this.charBuffer.slice(0, this.charLength).toString(this.encoding);`
string_decoder: add support for CESU-8 Fixes #3217. 2012-05-05 05:22:01 +02:00
string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`// CESU-8: lead surrogate (D800-DBFF) is also the incomplete character`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`var charCode = charStr.charCodeAt(charStr.length - 1);`
			`if (charCode >= 0xD800 && charCode <= 0xDBFF) {`
			`this.charLength += this.surrogateSize;`
			`charStr = '';`
			`continue;`
string_decoder: add support for CESU-8 Fixes #3217. 2012-05-05 05:22:01 +02:00			`}`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00			`this.charReceived = this.charLength = 0;`

Refactor: Utf8Decoder -> StringDecoder Instead of just decoding Utf8, this will proxy requests to buffer.toString() for other encodings. This makes for a simpler interface. 2010-06-16 03:19:25 +02:00			`// if there are no more bytes in this buffer, just emit our char`
string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`if (buffer.length === 0) {`
			`return charStr;`
			`}`
string_decoder: add support for CESU-8 Fixes #3217. 2012-05-05 05:22:01 +02:00			`break;`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00			`}`

string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`// determine and set charLength / charReceived`
			`this.detectIncompleteChar(buffer);`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00
			`var end = buffer.length;`
			`if (this.charLength) {`
			`// buffer the incomplete character bytes we got`
string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`buffer.copy(this.charBuffer, 0, buffer.length - this.charReceived, end);`
			`end -= this.charReceived;`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`}`

			`charStr += buffer.toString(this.encoding, 0, end);`

			`var end = charStr.length - 1;`
			`var charCode = charStr.charCodeAt(end);`
string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`// CESU-8: lead surrogate (D800-DBFF) is also the incomplete character`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`if (charCode >= 0xD800 && charCode <= 0xDBFF) {`
			`var size = this.surrogateSize;`
			`this.charLength += size;`
			`this.charReceived += size;`
			`this.charBuffer.copy(this.charBuffer, size, 0, size);`
string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`buffer.copy(this.charBuffer, 0, 0, size);`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`return charStr.substring(0, end);`
			`}`

			`// or just emit the charStr`
			`return charStr;`
			`};`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00
string_decoder: Add more comments 2014-05-13 17:42:48 +02:00			`// detectIncompleteChar determines if there is an incomplete UTF-8 character at`
			`// the end of the given buffer. If so, it sets this.charLength to the byte`
			`// length that character, and sets this.charReceived to the number of bytes`
			`// that are available for this character.`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`StringDecoder.prototype.detectIncompleteChar = function(buffer) {`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00			`// determine how many bytes we have to check at the end of this buffer`
Refactor: Utf8Decoder -> StringDecoder Instead of just decoding Utf8, this will proxy requests to buffer.toString() for other encodings. This makes for a simpler interface. 2010-06-16 03:19:25 +02:00			`var i = (buffer.length >= 3) ? 3 : buffer.length;`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00
lint 2010-12-02 05:59:06 +01:00			`// Figure out if one of the last i bytes of our buffer announces an`
			`// incomplete char.`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00			`for (; i > 0; i--) {`
Fix global leaks 2010-12-05 00:20:34 +01:00			`var c = buffer[buffer.length - i];`
Implemented Utf8Decoder module Allows to safely decode a utf8 stream into strings without breaking on multibyte characters. 2010-05-18 04:25:51 +02:00
			`// See http://en.wikipedia.org/wiki/UTF-8#Description`

			`// 110XXXXX`
			`if (i == 1 && c >> 5 == 0x06) {`
			`this.charLength = 2;`
			`break;`
			`}`

			`// 1110XXXX`
			`if (i <= 2 && c >> 4 == 0x0E) {`
			`this.charLength = 3;`
			`break;`
			`}`

			`// 11110XXX`
			`if (i <= 3 && c >> 3 == 0x1E) {`
			`this.charLength = 4;`
			`break;`
			`}`
			`}`
string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`this.charReceived = i;`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`};`
string_decoder: add support for CESU-8 Fixes #3217. 2012-05-05 05:22:01 +02:00
string_decoder: Add 'end' method, do base64 properly 2012-10-12 00:53:11 +02:00			`StringDecoder.prototype.end = function(buffer) {`
			`var res = '';`
			`if (buffer && buffer.length)`
			`res = this.write(buffer);`

			`if (this.charReceived) {`
			`var cr = this.charReceived;`
			`var buf = this.charBuffer;`
			`var enc = this.encoding;`
			`res += buf.slice(0, cr).toString(enc);`
			`}`

			`return res;`
			`};`

string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`function passThroughWrite(buffer) {`
			`return buffer.toString(this.encoding);`
			`}`
Refactor: Utf8Decoder -> StringDecoder Instead of just decoding Utf8, this will proxy requests to buffer.toString() for other encodings. This makes for a simpler interface. 2010-06-16 03:19:25 +02:00
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`function utf16DetectIncompleteChar(buffer) {`
string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`this.charReceived = buffer.length % 2;`
			`this.charLength = this.charReceived ? 2 : 0;`
string_decoder: added support for UTF-16LE Fixes #3223. 2012-05-05 15:47:24 +02:00			`}`
string_decoder: Add 'end' method, do base64 properly 2012-10-12 00:53:11 +02:00
			`function base64DetectIncompleteChar(buffer) {`
string_decoder: Fix failures from new test cases This patch simplifies the implementation of StringDecoder, fixes the failures from the new test cases, and also no longer relies on v8's WriteUtf8 function to encode individual surrogates. 2014-05-13 17:36:40 +02:00			`this.charReceived = buffer.length % 3;`
			`this.charLength = this.charReceived ? 3 : 0;`
string_decoder: Add 'end' method, do base64 properly 2012-10-12 00:53:11 +02:00			`}`