From 5d99a923f2bb9352abf80f8aeb850d924a8a1e38 Mon Sep 17 00:00:00 2001 From: Fedor Nezhivoi Date: Sun, 26 Jul 2020 18:12:43 +1000 Subject: [PATCH 1/7] Convert dbcs codec and tests (#256) --- encodings/dbcs-codec.js | 1014 +++++++++++---------- {test => generation/fixtures}/gbkFile.txt | 0 generation/gen-gbk-big5-fixtures.js | 41 + test/big5-test.js | 65 +- test/fixtures/gbk-big5.json | 10 + test/gbk-test.js | 130 ++- test/shiftjis-test.js | 38 +- test/webpack/iconv-lite-tests.js | 3 + 8 files changed, 673 insertions(+), 628 deletions(-) rename {test => generation/fixtures}/gbkFile.txt (100%) create mode 100644 generation/gen-gbk-big5-fixtures.js create mode 100644 test/fixtures/gbk-big5.json diff --git a/encodings/dbcs-codec.js b/encodings/dbcs-codec.js index 3294ed90..456c8ac7 100644 --- a/encodings/dbcs-codec.js +++ b/encodings/dbcs-codec.js @@ -1,12 +1,9 @@ "use strict"; -var Buffer = require("safer-buffer").Buffer; // Multibyte codec. In this scheme, a character is represented by 1 or more bytes. // Our codec supports UTF-16 surrogates, extensions for GB18030 and unicode sequences. // To save memory and loading time, we read table files only when requested. -exports._dbcs = DBCSCodec; - const UNASSIGNED = -1, GB18030_CODE = -2, SEQ_START = -10, @@ -21,588 +18,603 @@ for (let i = 0; i < 0x100; i++) { } // Class DBCSCodec reads and initializes mapping tables. -function DBCSCodec(codecOptions, iconv) { - this.encodingName = codecOptions.encodingName; - if (!codecOptions) throw new Error("DBCS codec is called without the data."); - if (!codecOptions.table) throw new Error("Encoding '" + this.encodingName + "' has no data."); - - // Load tables. - const mappingTable = codecOptions.table(); - - // Decode tables: MBCS -> Unicode. - - // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256. - // Trie root is decodeTables[0]. - // Values: >= 0 -> unicode character code. can be > 0xFFFF - // == UNASSIGNED -> unknown/unassigned sequence. - // == GB18030_CODE -> this is the end of a GB18030 4-byte sequence. - // <= NODE_START -> index of the next node in our trie to process next byte. - // <= SEQ_START -> index of the start of a character code sequence, in decodeTableSeq. - this.decodeTables = []; - this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node. - - // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here. - this.decodeTableSeq = []; - - // Actual mapping tables consist of chunks. Use them to fill up decode tables. - for (let i = 0; i < mappingTable.length; i++) this._addDecodeChunk(mappingTable[i]); - - // Load & create GB18030 tables when needed. - if (typeof codecOptions.gb18030 === "function") { - this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges. - - // Add GB18030 common decode nodes. - const commonThirdByteNodeIdx = this.decodeTables.length; - this.decodeTables.push(UNASSIGNED_NODE.slice(0)); - - const commonFourthByteNodeIdx = this.decodeTables.length; - this.decodeTables.push(UNASSIGNED_NODE.slice(0)); - - // Fill out the tree - const firstByteNode = this.decodeTables[0]; - for (let i = 0x81; i <= 0xfe; i++) { - const secondNodeIdx = NODE_START - firstByteNode[i]; - const secondByteNode = this.decodeTables[secondNodeIdx]; - for (let j = 0x30; j <= 0x39; j++) { - if (secondByteNode[j] === UNASSIGNED) { - secondByteNode[j] = NODE_START - commonThirdByteNodeIdx; - } else if (secondByteNode[j] > NODE_START) { - throw new Error("gb18030 decode tables conflict at byte 2"); - } - - const thirdNodeIdx = NODE_START - secondByteNode[j]; - const thirdByteNode = this.decodeTables[thirdNodeIdx]; - for (let k = 0x81; k <= 0xfe; k++) { - const commonFourthNodeIdx = NODE_START - commonFourthByteNodeIdx; - if (thirdByteNode[k] === UNASSIGNED) { - thirdByteNode[k] = commonFourthNodeIdx; - } else if (thirdByteNode[k] === commonFourthNodeIdx) { - continue; - } else if (thirdByteNode[k] > NODE_START) { - throw new Error("gb18030 decode tables conflict at byte 3"); +exports._dbcs = class DBCSCodec { + constructor(codecOptions, iconv) { + this.encodingName = codecOptions.encodingName; + if (!codecOptions) throw new Error("DBCS codec is called without the data."); + if (!codecOptions.table) + throw new Error("Encoding '" + this.encodingName + "' has no data."); + + // Load tables. + const mappingTable = codecOptions.table(); + + // Decode tables: MBCS -> Unicode. + + // decodeTables is a trie, encoded as an array of arrays of integers. Internal arrays are trie nodes and all have len = 256. + // Trie root is decodeTables[0]. + // Values: >= 0 -> unicode character code. can be > 0xFFFF + // == UNASSIGNED -> unknown/unassigned sequence. + // == GB18030_CODE -> this is the end of a GB18030 4-byte sequence. + // <= NODE_START -> index of the next node in our trie to process next byte. + // <= SEQ_START -> index of the start of a character code sequence, in decodeTableSeq. + this.decodeTables = []; + this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node. + + // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here. + this.decodeTableSeq = []; + + // Actual mapping tables consist of chunks. Use them to fill up decode tables. + for (let i = 0; i < mappingTable.length; i++) this._addDecodeChunk(mappingTable[i]); + + // Load & create GB18030 tables when needed. + if (typeof codecOptions.gb18030 === "function") { + this.gb18030 = codecOptions.gb18030(); // Load GB18030 ranges. + + // Add GB18030 common decode nodes. + const commonThirdByteNodeIdx = this.decodeTables.length; + this.decodeTables.push(UNASSIGNED_NODE.slice(0)); + + const commonFourthByteNodeIdx = this.decodeTables.length; + this.decodeTables.push(UNASSIGNED_NODE.slice(0)); + + // Fill out the tree + const firstByteNode = this.decodeTables[0]; + for (let i = 0x81; i <= 0xfe; i++) { + const secondNodeIdx = NODE_START - firstByteNode[i]; + const secondByteNode = this.decodeTables[secondNodeIdx]; + for (let j = 0x30; j <= 0x39; j++) { + if (secondByteNode[j] === UNASSIGNED) { + secondByteNode[j] = NODE_START - commonThirdByteNodeIdx; + } else if (secondByteNode[j] > NODE_START) { + throw new Error("gb18030 decode tables conflict at byte 2"); } - const fourthNodeIdx = NODE_START - thirdByteNode[k]; - const fourthByteNode = this.decodeTables[fourthNodeIdx]; - for (let l = 0x30; l <= 0x39; l++) { - if (fourthByteNode[l] === UNASSIGNED) fourthByteNode[l] = GB18030_CODE; + const thirdNodeIdx = NODE_START - secondByteNode[j]; + const thirdByteNode = this.decodeTables[thirdNodeIdx]; + for (let k = 0x81; k <= 0xfe; k++) { + const commonFourthNodeIdx = NODE_START - commonFourthByteNodeIdx; + if (thirdByteNode[k] === UNASSIGNED) { + thirdByteNode[k] = commonFourthNodeIdx; + } else if (thirdByteNode[k] === commonFourthNodeIdx) { + continue; + } else if (thirdByteNode[k] > NODE_START) { + throw new Error("gb18030 decode tables conflict at byte 3"); + } + + const fourthNodeIdx = NODE_START - thirdByteNode[k]; + const fourthByteNode = this.decodeTables[fourthNodeIdx]; + for (let l = 0x30; l <= 0x39; l++) { + if (fourthByteNode[l] === UNASSIGNED) fourthByteNode[l] = GB18030_CODE; + } } } } } - } - this.defaultCharUnicode = iconv.defaultCharUnicode; - - // Encode tables: Unicode -> DBCS. - - // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance. - // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null. - // Values: >= 0 -> it is a normal char. Write the value (if <=256 then 1 byte, if <=65536 then 2 bytes, etc.). - // == UNASSIGNED -> no conversion found. Output a default char. - // <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence. - this.encodeTable = []; - - // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of - // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key - // means end of sequence (needed when one sequence is a strict subsequence of another). - // Objects are kept separately from encodeTable to increase performance. - this.encodeTableSeq = []; - - // Some chars can be decoded, but need not be encoded. - const skipEncodeChars = {}; - if (codecOptions.encodeSkipVals) - for (let i = 0; i < codecOptions.encodeSkipVals.length; i++) { - const val = codecOptions.encodeSkipVals[i]; - if (typeof val === "number") { - skipEncodeChars[val] = true; - } else { - for (let j = val.from; j <= val.to; j++) skipEncodeChars[j] = true; + this.defaultCharUnicode = iconv.defaultCharUnicode; + + // Encode tables: Unicode -> DBCS. + + // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance. + // Because it can be sparse, it is represented as array of buckets by 256 chars each. Bucket can be null. + // Values: >= 0 -> it is a normal char. Write the value (if <=256 then 1 byte, if <=65536 then 2 bytes, etc.). + // == UNASSIGNED -> no conversion found. Output a default char. + // <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence. + this.encodeTable = []; + + // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of + // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key + // means end of sequence (needed when one sequence is a strict subsequence of another). + // Objects are kept separately from encodeTable to increase performance. + this.encodeTableSeq = []; + + // Some chars can be decoded, but need not be encoded. + const skipEncodeChars = {}; + if (codecOptions.encodeSkipVals) + for (let i = 0; i < codecOptions.encodeSkipVals.length; i++) { + const val = codecOptions.encodeSkipVals[i]; + if (typeof val === "number") { + skipEncodeChars[val] = true; + } else { + for (let j = val.from; j <= val.to; j++) skipEncodeChars[j] = true; + } } - } - // Use decode trie to recursively fill out encode tables. - this._fillEncodeTable(0, 0, skipEncodeChars); + // Use decode trie to recursively fill out encode tables. + this._fillEncodeTable(0, 0, skipEncodeChars); + + // Add more encoding pairs when needed. + if (codecOptions.encodeAdd) { + for (const uChar in codecOptions.encodeAdd) { + if (hasOwnProperty.call(codecOptions.encodeAdd, uChar)) + this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]); + } + } - // Add more encoding pairs when needed. - if (codecOptions.encodeAdd) { - for (const uChar in codecOptions.encodeAdd) { - if (hasOwnProperty.call(codecOptions.encodeAdd, uChar)) - this._setEncodeChar(uChar.charCodeAt(0), codecOptions.encodeAdd[uChar]); + this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)]; + if (this.defCharSB === UNASSIGNED) { + this.defCharSB = this.encodeTable[0]["?"]; + } + if (this.defCharSB === UNASSIGNED) { + this.defCharSB = "?".charCodeAt(0); } } - this.defCharSB = this.encodeTable[0][iconv.defaultCharSingleByte.charCodeAt(0)]; - if (this.defCharSB === UNASSIGNED) { - this.defCharSB = this.encodeTable[0]["?"]; + get decoder() { + return DBCSDecoder; } - if (this.defCharSB === UNASSIGNED) { - this.defCharSB = "?".charCodeAt(0); + + get encoder() { + return DBCSEncoder; } -} -DBCSCodec.prototype.encoder = DBCSEncoder; -DBCSCodec.prototype.decoder = DBCSDecoder; - -// Decoder helpers -DBCSCodec.prototype._getDecodeTrieNode = function (addr) { - const bytes = []; - for (; addr > 0; addr >>>= 8) bytes.push(addr & 0xff); - if (bytes.length === 0) bytes.push(0); - - let node = this.decodeTables[0]; - for (let i = bytes.length - 1; i > 0; i--) { - // Traverse nodes deeper into the trie. - const val = node[bytes[i]]; - - if (val === UNASSIGNED) { - // Create new node. - node[bytes[i]] = NODE_START - this.decodeTables.length; - this.decodeTables.push((node = UNASSIGNED_NODE.slice(0))); - } else if (val <= NODE_START) { - // Existing node. - node = this.decodeTables[NODE_START - val]; - } else { - const hexAddr = addr.toString(16); - throw new Error(`Overwrite byte in ${this.encodingName}, addr: ${hexAddr}`); + _getDecodeTrieNode(addr) { + const bytes = []; + for (; addr > 0; addr >>>= 8) bytes.push(addr & 0xff); + if (bytes.length === 0) bytes.push(0); + + let node = this.decodeTables[0]; + for (let i = bytes.length - 1; i > 0; i--) { + // Traverse nodes deeper into the trie. + const val = node[bytes[i]]; + + if (val === UNASSIGNED) { + // Create new node. + node[bytes[i]] = NODE_START - this.decodeTables.length; + this.decodeTables.push((node = UNASSIGNED_NODE.slice(0))); + } else if (val <= NODE_START) { + // Existing node. + node = this.decodeTables[NODE_START - val]; + } else { + const hexAddr = addr.toString(16); + throw new Error(`Overwrite byte in ${this.encodingName}, addr: ${hexAddr}`); + } } + return node; } - return node; -}; -DBCSCodec.prototype._addDecodeChunk = function (chunk) { - // First element of chunk is the hex mbcs code where we start. - let curAddr = parseInt(chunk[0], 16); - - // Choose the decoding node where we'll write our chars. - const writeTable = this._getDecodeTrieNode(curAddr); - curAddr = curAddr & 0xff; - - // Write all other elements of the chunk to the table. - for (let k = 1; k < chunk.length; k++) { - const part = chunk[k]; - if (typeof part === "string") { - // String, write as-is. - for (let l = 0; l < part.length; ) { - const code = part.charCodeAt(l++); - if (0xd800 <= code && code < 0xdc00) { - // Decode surrogate - const codeTrail = part.charCodeAt(l++); - if (0xdc00 <= codeTrail && codeTrail < 0xe000) { - writeTable[curAddr++] = - 0x10000 + (code - 0xd800) * 0x400 + (codeTrail - 0xdc00); + _addDecodeChunk(chunk) { + // First element of chunk is the hex mbcs code where we start. + let curAddr = parseInt(chunk[0], 16); + + // Choose the decoding node where we'll write our chars. + const writeTable = this._getDecodeTrieNode(curAddr); + curAddr = curAddr & 0xff; + + // Write all other elements of the chunk to the table. + for (let k = 1; k < chunk.length; k++) { + const part = chunk[k]; + if (typeof part === "string") { + // String, write as-is. + for (let l = 0; l < part.length; ) { + const code = part.charCodeAt(l++); + if (0xd800 <= code && code < 0xdc00) { + // Decode surrogate + const codeTrail = part.charCodeAt(l++); + if (0xdc00 <= codeTrail && codeTrail < 0xe000) { + writeTable[curAddr++] = + 0x10000 + (code - 0xd800) * 0x400 + (codeTrail - 0xdc00); + } else { + throw new Error( + `Incorrect surrogate pair in ${this.encodingName} at chunk ${chunk[0]}` + ); + } + } else if (0x0ff0 < code && code <= 0x0fff) { + // Character sequence (our own encoding used) + const len = 0xfff - code + 2; + const seq = []; + for (let m = 0; m < len; m++) { + // Simple variation: don't support surrogates or subsequences in seq. + seq.push(part.charCodeAt(l++)); + } + + writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length; + this.decodeTableSeq.push(seq); } else { - throw new Error( - `Incorrect surrogate pair in ${this.encodingName} at chunk ${chunk[0]}` - ); + writeTable[curAddr++] = code; // Basic char } - } else if (0x0ff0 < code && code <= 0x0fff) { - // Character sequence (our own encoding used) - const len = 0xfff - code + 2; - const seq = []; - for (let m = 0; m < len; m++) { - // Simple variation: don't support surrogates or subsequences in seq. - seq.push(part.charCodeAt(l++)); - } - - writeTable[curAddr++] = SEQ_START - this.decodeTableSeq.length; - this.decodeTableSeq.push(seq); - } else { - writeTable[curAddr++] = code; // Basic char } - } - } else if (typeof part === "number") { - // Integer, meaning increasing sequence starting with prev character. - let charCode = writeTable[curAddr - 1] + 1; - for (let l = 0; l < part; l++) { - writeTable[curAddr++] = charCode++; - } - } else + } else if (typeof part === "number") { + // Integer, meaning increasing sequence starting with prev character. + let charCode = writeTable[curAddr - 1] + 1; + for (let l = 0; l < part; l++) { + writeTable[curAddr++] = charCode++; + } + } else + throw new Error( + `Incorrect type '${typeof part}' given in ${this.encodingName} at chunk ${ + chunk[0] + }` + ); + } + if (curAddr > 0xff) throw new Error( - `Incorrect type '${typeof part}' given in ${this.encodingName} at chunk ${chunk[0]}` + `Incorrect chunk in ${this.encodingName} at addr ${chunk[0]}: too long ${curAddr}` ); } - if (curAddr > 0xff) - throw new Error( - `Incorrect chunk in ${this.encodingName} at addr ${chunk[0]}: too long ${curAddr}` - ); -}; - -// Encoder helpers -DBCSCodec.prototype._getEncodeBucket = function (uCode) { - const high = uCode >> 8; // This could be > 0xFF because of astral characters. - if (this.encodeTable[high] === undefined) this.encodeTable[high] = UNASSIGNED_NODE.slice(0); // Create bucket on demand. - return this.encodeTable[high]; -}; -DBCSCodec.prototype._setEncodeChar = function (uCode, dbcsCode) { - const bucket = this._getEncodeBucket(uCode); - const low = uCode & 0xff; - if (bucket[low] <= SEQ_START) { - // There's already a sequence, set a single-char subsequence of it. - this.encodeTableSeq[SEQ_START - bucket[low]][DEF_CHAR] = dbcsCode; - } else if (bucket[low] === UNASSIGNED) { - bucket[low] = dbcsCode; + _getEncodeBucket(uCode) { + const high = uCode >> 8; // This could be > 0xFF because of astral characters. + if (this.encodeTable[high] === undefined) this.encodeTable[high] = UNASSIGNED_NODE.slice(0); // Create bucket on demand. + return this.encodeTable[high]; } -}; -DBCSCodec.prototype._setEncodeSequence = function (seq, dbcsCode) { - // Get the root of character tree according to first character of the sequence. - const uCode = seq[0]; - const bucket = this._getEncodeBucket(uCode); - const low = uCode & 0xff; - - let node; - if (bucket[low] <= SEQ_START) { - // There's already a sequence with - use it. - node = this.encodeTableSeq[SEQ_START - bucket[low]]; - } else { - // There was no sequence object - allocate a new one. - node = {}; - if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence. - bucket[low] = SEQ_START - this.encodeTableSeq.length; - this.encodeTableSeq.push(node); + _setEncodeChar(uCode, dbcsCode) { + const bucket = this._getEncodeBucket(uCode); + const low = uCode & 0xff; + if (bucket[low] <= SEQ_START) { + // There's already a sequence, set a single-char subsequence of it. + this.encodeTableSeq[SEQ_START - bucket[low]][DEF_CHAR] = dbcsCode; + } else if (bucket[low] === UNASSIGNED) { + bucket[low] = dbcsCode; + } } - // Traverse the character tree, allocating new nodes as needed. - for (let j = 1; j < seq.length - 1; j++) { - const oldVal = node[uCode]; - if (typeof oldVal === "object") { - node = oldVal; + _setEncodeSequence(seq, dbcsCode) { + // Get the root of character tree according to first character of the sequence. + const uCode = seq[0]; + const bucket = this._getEncodeBucket(uCode); + const low = uCode & 0xff; + + let node; + if (bucket[low] <= SEQ_START) { + // There's already a sequence with - use it. + node = this.encodeTableSeq[SEQ_START - bucket[low]]; } else { - node = node[uCode] = {}; - if (oldVal !== undefined) node[DEF_CHAR] = oldVal; + // There was no sequence object - allocate a new one. + node = {}; + if (bucket[low] !== UNASSIGNED) node[DEF_CHAR] = bucket[low]; // If a char was set before - make it a single-char subsequence. + bucket[low] = SEQ_START - this.encodeTableSeq.length; + this.encodeTableSeq.push(node); } - } - // Set the leaf to given dbcsCode. - const uCode2 = seq[seq.length - 1]; - node[uCode2] = dbcsCode; -}; + // Traverse the character tree, allocating new nodes as needed. + for (let j = 1; j < seq.length - 1; j++) { + const oldVal = node[uCode]; + if (typeof oldVal === "object") { + node = oldVal; + } else { + node = node[uCode] = {}; + if (oldVal !== undefined) node[DEF_CHAR] = oldVal; + } + } -DBCSCodec.prototype._fillEncodeTable = function (nodeIdx, prefix, skipEncodeChars) { - const node = this.decodeTables[nodeIdx]; - let hasValues = false; - const subNodeEmpty = {}; - for (let i = 0; i < 0x100; i++) { - const uCode = node[i]; - const mbCode = prefix + i; - if (skipEncodeChars[mbCode]) continue; - - if (uCode >= 0) { - this._setEncodeChar(uCode, mbCode); - hasValues = true; - } else if (uCode <= NODE_START) { - const subNodeIdx = NODE_START - uCode; - if (!subNodeEmpty[subNodeIdx]) { - // Skip empty subtrees (they are too large in gb18030). - var newPrefix = (mbCode << 8) >>> 0; // NOTE: '>>> 0' keeps 32-bit num positive. - if (this._fillEncodeTable(subNodeIdx, newPrefix, skipEncodeChars)) { - hasValues = true; - } else { - subNodeEmpty[subNodeIdx] = true; + // Set the leaf to given dbcsCode. + const uCode2 = seq[seq.length - 1]; + node[uCode2] = dbcsCode; + } + + _fillEncodeTable(nodeIdx, prefix, skipEncodeChars) { + const node = this.decodeTables[nodeIdx]; + let hasValues = false; + const subNodeEmpty = {}; + for (let i = 0; i < 0x100; i++) { + const uCode = node[i]; + const mbCode = prefix + i; + if (skipEncodeChars[mbCode]) continue; + + if (uCode >= 0) { + this._setEncodeChar(uCode, mbCode); + hasValues = true; + } else if (uCode <= NODE_START) { + const subNodeIdx = NODE_START - uCode; + if (!subNodeEmpty[subNodeIdx]) { + // Skip empty subtrees (they are too large in gb18030). + var newPrefix = (mbCode << 8) >>> 0; // NOTE: '>>> 0' keeps 32-bit num positive. + if (this._fillEncodeTable(subNodeIdx, newPrefix, skipEncodeChars)) { + hasValues = true; + } else { + subNodeEmpty[subNodeIdx] = true; + } } + } else if (uCode <= SEQ_START) { + this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode); + hasValues = true; } - } else if (uCode <= SEQ_START) { - this._setEncodeSequence(this.decodeTableSeq[SEQ_START - uCode], mbCode); - hasValues = true; } + return hasValues; } - return hasValues; }; // == Encoder ================================================================== -function DBCSEncoder(options, codec) { - // Encoder state - this.leadSurrogate = -1; - this.seqObj = undefined; - - // Static data - this.encodeTable = codec.encodeTable; - this.encodeTableSeq = codec.encodeTableSeq; - this.defaultCharSingleByte = codec.defCharSB; - this.gb18030 = codec.gb18030; -} +class DBCSEncoder { + constructor(options, codec, backend) { + this.backend = backend; + // Encoder state + this.leadSurrogate = -1; + this.seqObj = undefined; -DBCSEncoder.prototype.write = function (str) { - const newBuf = Buffer.alloc(str.length * (this.gb18030 ? 4 : 3)); - let leadSurrogate = this.leadSurrogate, - seqObj = this.seqObj, - nextChar = -1, - i = 0, - j = 0; - - for (;;) { - // 0. Get next character. - let uCode; - if (nextChar === -1) { - if (i === str.length) break; - uCode = str.charCodeAt(i++); - } else { - uCode = nextChar; - nextChar = -1; - } + // Static data + this.encodeTable = codec.encodeTable; + this.encodeTableSeq = codec.encodeTableSeq; + this.defaultCharSingleByte = codec.defCharSB; + this.gb18030 = codec.gb18030; + } - // 1. Handle surrogates. - if (0xd800 <= uCode && uCode < 0xe000) { - // Char is one of surrogates. - if (uCode < 0xdc00) { - // We've got a lead surrogate. - if (leadSurrogate === -1) { - leadSurrogate = uCode; - continue; - } else { - leadSurrogate = uCode; - // Double lead surrogate found. - uCode = UNASSIGNED; - } + write(str) { + const bytes = this.backend.allocBytes(str.length * (this.gb18030 ? 4 : 3)); + let leadSurrogate = this.leadSurrogate, + seqObj = this.seqObj, + nextChar = -1, + i = 0, + bytePos = 0; + + for (;;) { + // 0. Get next character. + let uCode; + if (nextChar === -1) { + if (i === str.length) break; + uCode = str.charCodeAt(i++); } else { - // We've got trail surrogate. - if (leadSurrogate !== -1) { - uCode = 0x10000 + (leadSurrogate - 0xd800) * 0x400 + (uCode - 0xdc00); - leadSurrogate = -1; - } else { - // Incomplete surrogate pair - only trail surrogate found. - uCode = UNASSIGNED; - } + uCode = nextChar; + nextChar = -1; } - } else if (leadSurrogate !== -1) { - // Incomplete surrogate pair - only lead surrogate found. - nextChar = uCode; - uCode = UNASSIGNED; // Write an error, then current char. - leadSurrogate = -1; - } - // 2. Convert uCode character. - let dbcsCode = UNASSIGNED; - if (seqObj !== undefined && uCode !== UNASSIGNED) { - // We are in the middle of the sequence - let resCode = seqObj[uCode]; - if (typeof resCode === "object") { - // Sequence continues. - seqObj = resCode; - continue; - } else if (typeof resCode == "number") { - // Sequence finished. Write it. - dbcsCode = resCode; - } else if (resCode === undefined) { - // Current character is not part of the sequence. - - // Try default character for this sequence - resCode = seqObj[DEF_CHAR]; - if (resCode !== undefined) { - dbcsCode = resCode; // Found. Write it. - nextChar = uCode; // Current character will be written too in the next iteration. + // 1. Handle surrogates. + if (0xd800 <= uCode && uCode < 0xe000) { + // Char is one of surrogates. + if (uCode < 0xdc00) { + // We've got a lead surrogate. + if (leadSurrogate === -1) { + leadSurrogate = uCode; + continue; + } else { + leadSurrogate = uCode; + // Double lead surrogate found. + uCode = UNASSIGNED; + } } else { - // TODO: What if we have no default? (resCode == undefined) - // Then, we should write first char of the sequence as-is and try the rest recursively. - // Didn't do it for now because no encoding has this situation yet. - // Currently, just skip the sequence and write current char. + // We've got trail surrogate. + if (leadSurrogate !== -1) { + uCode = 0x10000 + (leadSurrogate - 0xd800) * 0x400 + (uCode - 0xdc00); + leadSurrogate = -1; + } else { + // Incomplete surrogate pair - only trail surrogate found. + uCode = UNASSIGNED; + } } - } - seqObj = undefined; - } else if (uCode >= 0) { - // Regular character - const subtable = this.encodeTable[uCode >> 8]; - if (subtable !== undefined) dbcsCode = subtable[uCode & 0xff]; - - if (dbcsCode <= SEQ_START) { - // Sequence start - seqObj = this.encodeTableSeq[SEQ_START - dbcsCode]; - continue; + } else if (leadSurrogate !== -1) { + // Incomplete surrogate pair - only lead surrogate found. + nextChar = uCode; + uCode = UNASSIGNED; // Write an error, then current char. + leadSurrogate = -1; } - if (dbcsCode === UNASSIGNED && this.gb18030) { - // Use GB18030 algorithm to find character(s) to write. - const idx = findIdx(this.gb18030.uChars, uCode); - if (idx !== -1) { - dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]); - newBuf[j++] = 0x81 + Math.floor(dbcsCode / 12600); - dbcsCode = dbcsCode % 12600; - newBuf[j++] = 0x30 + Math.floor(dbcsCode / 1260); - dbcsCode = dbcsCode % 1260; - newBuf[j++] = 0x81 + Math.floor(dbcsCode / 10); - dbcsCode = dbcsCode % 10; - newBuf[j++] = 0x30 + dbcsCode; + // 2. Convert uCode character. + let dbcsCode = UNASSIGNED; + if (seqObj !== undefined && uCode !== UNASSIGNED) { + // We are in the middle of the sequence + let resCode = seqObj[uCode]; + if (typeof resCode === "object") { + // Sequence continues. + seqObj = resCode; continue; + } else if (typeof resCode == "number") { + // Sequence finished. Write it. + dbcsCode = resCode; + } else if (resCode === undefined) { + // Current character is not part of the sequence. + + // Try default character for this sequence + resCode = seqObj[DEF_CHAR]; + if (resCode !== undefined) { + dbcsCode = resCode; // Found. Write it. + nextChar = uCode; // Current character will be written too in the next iteration. + } else { + // TODO: What if we have no default? (resCode == undefined) + // Then, we should write first char of the sequence as-is and try the rest recursively. + // Didn't do it for now because no encoding has this situation yet. + // Currently, just skip the sequence and write current char. + } + } + seqObj = undefined; + } else if (uCode >= 0) { + // Regular character + const subtable = this.encodeTable[uCode >> 8]; + if (subtable !== undefined) dbcsCode = subtable[uCode & 0xff]; + + if (dbcsCode <= SEQ_START) { + // Sequence start + seqObj = this.encodeTableSeq[SEQ_START - dbcsCode]; + continue; + } + + if (dbcsCode === UNASSIGNED && this.gb18030) { + // Use GB18030 algorithm to find character(s) to write. + const idx = findIdx(this.gb18030.uChars, uCode); + if (idx !== -1) { + dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]); + bytes[bytePos++] = 0x81 + Math.floor(dbcsCode / 12600); + dbcsCode = dbcsCode % 12600; + bytes[bytePos++] = 0x30 + Math.floor(dbcsCode / 1260); + dbcsCode = dbcsCode % 1260; + bytes[bytePos++] = 0x81 + Math.floor(dbcsCode / 10); + dbcsCode = dbcsCode % 10; + bytes[bytePos++] = 0x30 + dbcsCode; + continue; + } } } - } - // 3. Write dbcsCode character. - if (dbcsCode === UNASSIGNED) { - dbcsCode = this.defaultCharSingleByte; - } + // 3. Write dbcsCode character. + if (dbcsCode === UNASSIGNED) { + dbcsCode = this.defaultCharSingleByte; + } - if (dbcsCode < 0x100) { - newBuf[j++] = dbcsCode; - } else if (dbcsCode < 0x10000) { - newBuf[j++] = dbcsCode >> 8; // high byte - newBuf[j++] = dbcsCode & 0xff; // low byte - } else if (dbcsCode < 0x1000000) { - newBuf[j++] = dbcsCode >> 16; - newBuf[j++] = (dbcsCode >> 8) & 0xff; - newBuf[j++] = dbcsCode & 0xff; - } else { - newBuf[j++] = dbcsCode >>> 24; - newBuf[j++] = (dbcsCode >>> 16) & 0xff; - newBuf[j++] = (dbcsCode >>> 8) & 0xff; - newBuf[j++] = dbcsCode & 0xff; + if (dbcsCode < 0x100) { + bytes[bytePos++] = dbcsCode; + } else if (dbcsCode < 0x10000) { + bytes[bytePos++] = dbcsCode >> 8; // high byte + bytes[bytePos++] = dbcsCode & 0xff; // low byte + } else if (dbcsCode < 0x1000000) { + bytes[bytePos++] = dbcsCode >> 16; + bytes[bytePos++] = (dbcsCode >> 8) & 0xff; + bytes[bytePos++] = dbcsCode & 0xff; + } else { + bytes[bytePos++] = dbcsCode >>> 24; + bytes[bytePos++] = (dbcsCode >>> 16) & 0xff; + bytes[bytePos++] = (dbcsCode >>> 8) & 0xff; + bytes[bytePos++] = dbcsCode & 0xff; + } } - } - this.seqObj = seqObj; - this.leadSurrogate = leadSurrogate; - return newBuf.slice(0, j); -}; - -DBCSEncoder.prototype.end = function () { - if (this.leadSurrogate === -1 && this.seqObj === undefined) { - return undefined; // All clean. Most often case. + this.seqObj = seqObj; + this.leadSurrogate = leadSurrogate; + return this.backend.bytesToResult(bytes, bytePos); } - const newBuf = Buffer.alloc(10); - let j = 0; + end() { + if (this.leadSurrogate === -1 && this.seqObj === undefined) { + return undefined; // All clean. Most often case. + } - if (this.seqObj) { - // We're in the sequence. - const dbcsCode = this.seqObj[DEF_CHAR]; - if (dbcsCode !== undefined) { - // Write beginning of the sequence. - if (dbcsCode < 0x100) { - newBuf[j++] = dbcsCode; + const bytes = this.backend.allocBytes(10); + let bytePos = 0; + + if (this.seqObj) { + // We're in the sequence. + const dbcsCode = this.seqObj[DEF_CHAR]; + if (dbcsCode !== undefined) { + // Write beginning of the sequence. + if (dbcsCode < 0x100) { + bytes[bytePos++] = dbcsCode; + } else { + bytes[bytePos++] = dbcsCode >> 8; // high byte + bytes[bytePos++] = dbcsCode & 0xff; // low byte + } } else { - newBuf[j++] = dbcsCode >> 8; // high byte - newBuf[j++] = dbcsCode & 0xff; // low byte + // See todo above. } - } else { - // See todo above. + this.seqObj = undefined; } - this.seqObj = undefined; - } - if (this.leadSurrogate !== -1) { - // Incomplete surrogate pair - only lead surrogate found. - newBuf[j++] = this.defaultCharSingleByte; - this.leadSurrogate = -1; - } + if (this.leadSurrogate !== -1) { + // Incomplete surrogate pair - only lead surrogate found. + bytes[bytePos++] = this.defaultCharSingleByte; + this.leadSurrogate = -1; + } - return newBuf.slice(0, j); -}; + return this.backend.bytesToResult(bytes, bytePos); + } -// Export for testing -DBCSEncoder.prototype.findIdx = findIdx; + // Export for testing + findIdx(table, val) { + return findIdx(table, val); + } +} // == Decoder ================================================================== -function DBCSDecoder(options, codec) { - // Decoder state - this.nodeIdx = 0; - this.prevBytes = []; +class DBCSDecoder { + constructor(options, codec, backend) { + this.backend = backend; - // Static data - this.decodeTables = codec.decodeTables; - this.decodeTableSeq = codec.decodeTableSeq; - this.defaultCharUnicode = codec.defaultCharUnicode; - this.gb18030 = codec.gb18030; -} + // Decoder state + this.nodeIdx = 0; + this.prevBytes = []; -DBCSDecoder.prototype.write = function (buf) { - const newBuf = Buffer.alloc(buf.length * 2), - prevBytes = this.prevBytes, - prevOffset = this.prevBytes.length; - - let nodeIdx = this.nodeIdx, - seqStart = -this.prevBytes.length, // idx of the start of current parsed sequence. - j = 0; - - for (let i = 0; i < buf.length; i++) { - const curByte = i >= 0 ? buf[i] : prevBytes[i + prevOffset]; - - // TODO: Check curByte is number 0 <= < 256 - - // Lookup in current trie node. - let uCode = this.decodeTables[nodeIdx][curByte]; - - if (uCode >= 0) { - // Normal character, just use it. - } else if (uCode === UNASSIGNED) { - // Unknown char. - // TODO: Callback with seq. - uCode = this.defaultCharUnicode.charCodeAt(0); - i = seqStart; // Skip one byte ('i' will be incremented by the for loop) and try to parse again. - } else if (uCode === GB18030_CODE) { - const b1 = i >= 3 ? buf[i - 3] : prevBytes[i - 3 + prevOffset]; - const b2 = i >= 2 ? buf[i - 2] : prevBytes[i - 2 + prevOffset]; - const b3 = i >= 1 ? buf[i - 1] : prevBytes[i - 1 + prevOffset]; - const ptr = - (b1 - 0x81) * 12600 + (b2 - 0x30) * 1260 + (b3 - 0x81) * 10 + (curByte - 0x30); - const idx = findIdx(this.gb18030.gbChars, ptr); - uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx]; - } else if (uCode <= NODE_START) { - // Go to next trie node. - nodeIdx = NODE_START - uCode; - continue; - } else if (uCode <= SEQ_START) { - // Output a sequence of chars. - const seq = this.decodeTableSeq[SEQ_START - uCode]; - for (let k = 0; k < seq.length - 1; k++) { - uCode = seq[k]; - newBuf[j++] = uCode & 0xff; - newBuf[j++] = uCode >> 8; - } - uCode = seq[seq.length - 1]; - } else - throw new Error( - `iconv-lite internal error: invalid decoding table value ${uCode} at ${nodeIdx}/${curByte}` - ); + // Static data + this.decodeTables = codec.decodeTables; + this.decodeTableSeq = codec.decodeTableSeq; + this.defaultCharUnicode = codec.defaultCharUnicode; + this.gb18030 = codec.gb18030; + } - // Write the character to buffer, handling higher planes using surrogate pair. - if (uCode >= 0x10000) { - uCode -= 0x10000; - const uCodeLead = 0xd800 | (uCode >> 10); - newBuf[j++] = uCodeLead & 0xff; - newBuf[j++] = uCodeLead >> 8; + write(buf) { + const chars = this.backend.allocRawChars(buf.length), + prevBytes = this.prevBytes, + prevOffset = this.prevBytes.length; + + let nodeIdx = this.nodeIdx, + seqStart = -this.prevBytes.length, // idx of the start of current parsed sequence. + charPos = 0; + + for (let i = 0; i < buf.length; i++) { + const curByte = i >= 0 ? buf[i] : prevBytes[i + prevOffset]; + + // TODO: Check curByte is number 0 <= < 256 + + // Lookup in current trie node. + let uCode = this.decodeTables[nodeIdx][curByte]; + + if (uCode >= 0) { + // Normal character, just use it. + } else if (uCode === UNASSIGNED) { + // Unknown char. + // TODO: Callback with seq. + uCode = this.defaultCharUnicode.charCodeAt(0); + i = seqStart; // Skip one byte ('i' will be incremented by the for loop) and try to parse again. + } else if (uCode === GB18030_CODE) { + const b1 = i >= 3 ? buf[i - 3] : prevBytes[i - 3 + prevOffset]; + const b2 = i >= 2 ? buf[i - 2] : prevBytes[i - 2 + prevOffset]; + const b3 = i >= 1 ? buf[i - 1] : prevBytes[i - 1 + prevOffset]; + const ptr = + (b1 - 0x81) * 12600 + (b2 - 0x30) * 1260 + (b3 - 0x81) * 10 + (curByte - 0x30); + const idx = findIdx(this.gb18030.gbChars, ptr); + uCode = this.gb18030.uChars[idx] + ptr - this.gb18030.gbChars[idx]; + } else if (uCode <= NODE_START) { + // Go to next trie node. + nodeIdx = NODE_START - uCode; + continue; + } else if (uCode <= SEQ_START) { + // Output a sequence of chars. + const seq = this.decodeTableSeq[SEQ_START - uCode]; + for (let k = 0; k < seq.length - 1; k++) { + uCode = seq[k]; + chars[charPos++] = uCode; + } + uCode = seq[seq.length - 1]; + } else + throw new Error( + `iconv-lite internal error: invalid decoding table value ${uCode} at ${nodeIdx}/${curByte}` + ); + + // Write the character to buffer, handling higher planes using surrogate pair. + if (uCode >= 0x10000) { + uCode -= 0x10000; + const uCodeLead = 0xd800 | (uCode >> 10); + chars[charPos++] = uCodeLead; + + uCode = 0xdc00 | (uCode & 0x3ff); + } + chars[charPos++] = uCode; - uCode = 0xdc00 | (uCode & 0x3ff); + // Reset trie node. + nodeIdx = 0; + seqStart = i + 1; } - newBuf[j++] = uCode & 0xff; - newBuf[j++] = uCode >> 8; - // Reset trie node. - nodeIdx = 0; - seqStart = i + 1; - } + this.nodeIdx = nodeIdx; + this.prevBytes = + seqStart >= 0 + ? Array.prototype.slice.call(buf, seqStart) + : prevBytes.slice(seqStart + prevOffset).concat(Array.prototype.slice.call(buf)); - this.nodeIdx = nodeIdx; - this.prevBytes = - seqStart >= 0 - ? Array.prototype.slice.call(buf, seqStart) - : prevBytes.slice(seqStart + prevOffset).concat(Array.prototype.slice.call(buf)); + return this.backend.rawCharsToResult(chars, charPos); + } - return newBuf.slice(0, j).toString("ucs2"); -}; + end() { + let ret = ""; -DBCSDecoder.prototype.end = function () { - let ret = ""; + // Try to parse all remaining chars. + while (this.prevBytes.length > 0) { + // Skip 1 character in the buffer. + ret += this.defaultCharUnicode; + const bytesArr = this.prevBytes.slice(1); - // Try to parse all remaining chars. - while (this.prevBytes.length > 0) { - // Skip 1 character in the buffer. - ret += this.defaultCharUnicode; - const bytesArr = this.prevBytes.slice(1); + // Parse remaining as usual. + this.prevBytes = []; + this.nodeIdx = 0; + if (bytesArr.length > 0) ret += this.write(bytesArr); + } - // Parse remaining as usual. this.prevBytes = []; this.nodeIdx = 0; - if (bytesArr.length > 0) ret += this.write(bytesArr); - } - this.prevBytes = []; - this.nodeIdx = 0; - return ret; -}; + return ret; + } +} // Binary search for GB18030. Returns largest i such that table[i] <= val. function findIdx(table, val) { diff --git a/test/gbkFile.txt b/generation/fixtures/gbkFile.txt similarity index 100% rename from test/gbkFile.txt rename to generation/fixtures/gbkFile.txt diff --git a/generation/gen-gbk-big5-fixtures.js b/generation/gen-gbk-big5-fixtures.js new file mode 100644 index 00000000..595c2627 --- /dev/null +++ b/generation/gen-gbk-big5-fixtures.js @@ -0,0 +1,41 @@ +"use strict"; + +const Iconv = require("iconv").Iconv, + fs = require("fs"), + path = require("path"), + utils = require("../test/utils"); + +const fixtures = { + big5: big5(), + gbk: gbk(), +}; +const outputFile = path.resolve(__dirname, "..", "test", "fixtures", "gbk-big5.json"); +fs.writeFileSync(outputFile, JSON.stringify(fixtures)); + +function gbk() { + const inputFile = path.resolve(__dirname, "fixtures", "gbkFile.txt"); + const contentBuffer = fs.readFileSync(inputFile); + + const codec = Iconv("GBK", "utf8"); + const str = codec.convert(contentBuffer).toString(); + + return { + bytes: utils.hex(contentBuffer, true), + string: str, + }; +} + +function big5() { + const contentBuffer = Buffer.from( + "PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+", + "base64" + ); + + const codec = Iconv("big5", "utf8"); + const str = codec.convert(contentBuffer).toString(); + + return { + bytes: utils.hex(contentBuffer, true), + string: str, + }; +} diff --git a/test/big5-test.js b/test/big5-test.js index c7a7a380..4c492f37 100644 --- a/test/big5-test.js +++ b/test/big5-test.js @@ -1,71 +1,68 @@ "use strict"; -var assert = require("assert"), - Buffer = require("safer-buffer").Buffer, - iconv = require("../"); +const assert = require("assert"), + utils = require("./utils"), + fixtures = require("./fixtures/gbk-big5.json"), + iconv = utils.requireIconv(); -var testString = "中文abc", //unicode contains Big5-code and ascii - testStringBig5Buffer = Buffer.from([0xa4, 0xa4, 0xa4, 0xe5, 0x61, 0x62, 0x63]), +const testString = "中文abc", //unicode contains Big5-code and ascii + testStringBig5Buffer = utils.bytes("a4 a4 a4 e5 61 62 63"), testString2 = "測試", - testStringBig5Buffer2 = Buffer.from([0xb4, 0xfa, 0xb8, 0xd5]); + testStringBig5Buffer2 = utils.bytes("b4 fa b8 d5"); -describe("Big5 tests", function () { +describe("Big5 tests #node-web", function () { it("Big5 correctly encoded/decoded", function () { assert.strictEqual( - iconv.encode(testString, "big5").toString("hex"), - testStringBig5Buffer.toString("hex") + utils.hex(iconv.encode(testString, "big5")), + utils.hex(testStringBig5Buffer) ); assert.strictEqual(iconv.decode(testStringBig5Buffer, "big5"), testString); assert.strictEqual( - iconv.encode(testString2, "big5").toString("hex"), - testStringBig5Buffer2.toString("hex") + utils.hex(iconv.encode(testString2, "big5")), + utils.hex(testStringBig5Buffer2) ); assert.strictEqual(iconv.decode(testStringBig5Buffer2, "big5"), testString2); }); it("cp950 correctly encoded/decoded", function () { assert.strictEqual( - iconv.encode(testString, "cp950").toString("hex"), - testStringBig5Buffer.toString("hex") + utils.hex(iconv.encode(testString, "cp950")), + utils.hex(testStringBig5Buffer) ); assert.strictEqual(iconv.decode(testStringBig5Buffer, "cp950"), testString); }); it("Big5 file read decoded,compare with iconv result", function () { - var contentBuffer = Buffer.from( - "PEhUTUw+DQo8SEVBRD4gICAgDQoJPFRJVExFPiBtZXRhILzQxdKquqjPpc6hR6SkpOW69K22IDwvVElUTEU+DQoJPG1ldGEgSFRUUC1FUVVJVj0iQ29udGVudC1UeXBlIiBDT05URU5UPSJ0ZXh0L2h0bWw7IGNoYXJzZXQ9YmlnNSI+DQo8L0hFQUQ+DQo8Qk9EWT4NCg0Ks2+sT6RArdPBY8XppKSk5br0rbahSTxicj4NCihUaGlzIHBhZ2UgdXNlcyBiaWc1IGNoYXJhY3RlciBzZXQuKTxicj4NCmNoYXJzZXQ9YmlnNQ0KDQo8L0JPRFk+DQo8L0hUTUw+", - "base64" - ); - var str = iconv.decode(contentBuffer, "big5"); - var iconvc = new (require("iconv").Iconv)("big5", "utf8"); - assert.strictEqual(iconvc.convert(contentBuffer).toString(), str); + const contentBuffer = utils.bytes(fixtures.big5.bytes); + const str = iconv.decode(contentBuffer, "big5"); + assert.strictEqual(fixtures.big5.string, str); }); it("Big5 correctly decodes and encodes characters · and ×", function () { // https://github.com/ashtuchkin/iconv-lite/issues/13 // Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT - var chars = "·×"; - var big5Chars = Buffer.from([0xa1, 0x50, 0xa1, 0xd1]); - assert.strictEqual(iconv.encode(chars, "big5").toString("hex"), big5Chars.toString("hex")); + const chars = "·×"; + const big5Chars = utils.bytes("a1 50 a1 d1"); + assert.strictEqual(utils.hex(iconv.encode(chars, "big5")), utils.hex(big5Chars)); assert.strictEqual(iconv.decode(big5Chars, "big5"), chars); }); it("Big5 correctly encodes & decodes sequences", function () { - assert.strictEqual(iconv.encode("\u00CA\u0304", "big5").toString("hex"), "8862"); - assert.strictEqual(iconv.encode("\u00EA\u030C", "big5").toString("hex"), "88a5"); - assert.strictEqual(iconv.encode("\u00CA", "big5").toString("hex"), "8866"); - assert.strictEqual(iconv.encode("\u00CA\u00CA", "big5").toString("hex"), "88668866"); + assert.strictEqual(utils.hex(iconv.encode("\u00CA\u0304", "big5")), "88 62"); + assert.strictEqual(utils.hex(iconv.encode("\u00EA\u030C", "big5")), "88 a5"); + assert.strictEqual(utils.hex(iconv.encode("\u00CA", "big5")), "88 66"); + assert.strictEqual(utils.hex(iconv.encode("\u00CA\u00CA", "big5")), "88 66 88 66"); - assert.strictEqual(iconv.encode("\u00CA\uD800", "big5").toString("hex"), "88663f"); // Unfinished surrogate. - assert.strictEqual(iconv.encode("\u00CA\uD841\uDD47", "big5").toString("hex"), "8866fa40"); // Finished surrogate ('𠕇'). - assert.strictEqual(iconv.encode("\u00CA𠕇", "big5").toString("hex"), "8866fa40"); // Finished surrogate ('𠕇'). + assert.strictEqual(utils.hex(iconv.encode("\u00CA\uD800", "big5")), "88 66 3f"); // Unfinished surrogate. + assert.strictEqual(utils.hex(iconv.encode("\u00CA\uD841\uDD47", "big5")), "88 66 fa 40"); // Finished surrogate ('𠕇'). + assert.strictEqual(utils.hex(iconv.encode("\u00CA𠕇", "big5")), "88 66 fa 40"); // Finished surrogate ('𠕇'). - assert.strictEqual(iconv.decode(Buffer.from("8862", "hex"), "big5"), "\u00CA\u0304"); - assert.strictEqual(iconv.decode(Buffer.from("8866", "hex"), "big5"), "\u00CA"); - assert.strictEqual(iconv.decode(Buffer.from("8866fa40", "hex"), "big5"), "\u00CA𠕇"); + assert.strictEqual(iconv.decode(utils.bytes("88 62"), "big5"), "\u00CA\u0304"); + assert.strictEqual(iconv.decode(utils.bytes("88 66"), "big5"), "\u00CA"); + assert.strictEqual(iconv.decode(utils.bytes("88 66 fa 40"), "big5"), "\u00CA𠕇"); }); it("Big5 correctly encodes 十", function () { - assert.strictEqual(iconv.encode("十", "big5").toString("hex"), "a451"); + assert.strictEqual(utils.hex(iconv.encode("十", "big5")), "a4 51"); }); }); diff --git a/test/fixtures/gbk-big5.json b/test/fixtures/gbk-big5.json new file mode 100644 index 00000000..2a006705 --- /dev/null +++ b/test/fixtures/gbk-big5.json @@ -0,0 +1,10 @@ +{ + "big5": { + "bytes": "3c 48 54 4d 4c 3e 0d 0a 3c 48 45 41 44 3e 20 20 20 20 0d 0a 09 3c 54 49 54 4c 45 3e 20 6d 65 74 61 20 bc d0 c5 d2 aa ba a8 cf a5 ce a1 47 a4 a4 a4 e5 ba f4 ad b6 20 3c 2f 54 49 54 4c 45 3e 0d 0a 09 3c 6d 65 74 61 20 48 54 54 50 2d 45 51 55 49 56 3d 22 43 6f 6e 74 65 6e 74 2d 54 79 70 65 22 20 43 4f 4e 54 45 4e 54 3d 22 74 65 78 74 2f 68 74 6d 6c 3b 20 63 68 61 72 73 65 74 3d 62 69 67 35 22 3e 0d 0a 3c 2f 48 45 41 44 3e 0d 0a 3c 42 4f 44 59 3e 0d 0a 0d 0a b3 6f ac 4f a4 40 ad d3 c1 63 c5 e9 a4 a4 a4 e5 ba f4 ad b6 a1 49 3c 62 72 3e 0d 0a 28 54 68 69 73 20 70 61 67 65 20 75 73 65 73 20 62 69 67 35 20 63 68 61 72 61 63 74 65 72 20 73 65 74 2e 29 3c 62 72 3e 0d 0a 63 68 61 72 73 65 74 3d 62 69 67 35 0d 0a 0d 0a 3c 2f 42 4f 44 59 3e 0d 0a 3c 2f 48 54 4d 4c 3e", + "string": "\r\n \r\n\t meta 標籤的使用:中文網頁 \r\n\t\r\n\r\n\r\n\r\n這是一個繁體中文網頁!
\r\n(This page uses big5 character set.)
\r\ncharset=big5\r\n\r\n\r\n" + }, + "gbk": { + "bytes": "3c 21 64 6f 63 74 79 70 65 20 68 74 6d 6c 3e 3c 68 74 6d 6c 3e 3c 68 65 61 64 3e 3c 6d 65 74 61 20 68 74 74 70 2d 65 71 75 69 76 3d 22 43 6f 6e 74 65 6e 74 2d 54 79 70 65 22 20 63 6f 6e 74 65 6e 74 3d 22 74 65 78 74 2f 68 74 6d 6c 3b 63 68 61 72 73 65 74 3d 67 62 32 33 31 32 22 3e 3c 74 69 74 6c 65 3e b0 d9 b6 c8 d2 bb cf c2 a3 ac c4 e3 be cd d6 aa b5 c0 20 20 20 20 20 20 3c 2f 74 69 74 6c 65 3e 3c 73 74 79 6c 65 3e 68 74 6d 6c 7b 6f 76 65 72 66 6c 6f 77 2d 79 3a 61 75 74 6f 7d 62 6f 64 79 7b 66 6f 6e 74 3a 31 32 70 78 20 61 72 69 61 6c 3b 74 65 78 74 2d 61 6c 69 67 6e 3a 63 65 6e 74 65 72 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 23 66 66 66 7d 62 6f 64 79 2c 70 2c 66 6f 72 6d 2c 75 6c 2c 6c 69 7b 6d 61 72 67 69 6e 3a 30 3b 70 61 64 64 69 6e 67 3a 30 3b 6c 69 73 74 2d 73 74 79 6c 65 3a 6e 6f 6e 65 7d 62 6f 64 79 2c 66 6f 72 6d 2c 23 66 6d 7b 70 6f 73 69 74 69 6f 6e 3a 72 65 6c 61 74 69 76 65 7d 74 64 7b 74 65 78 74 2d 61 6c 69 67 6e 3a 6c 65 66 74 7d 69 6d 67 7b 62 6f 72 64 65 72 3a 30 7d 61 7b 63 6f 6c 6f 72 3a 23 30 30 63 7d 61 3a 61 63 74 69 76 65 7b 63 6f 6c 6f 72 3a 23 66 36 30 7d 23 75 7b 70 61 64 64 69 6e 67 3a 37 70 78 20 31 30 70 78 20 33 70 78 20 30 3b 74 65 78 74 2d 61 6c 69 67 6e 3a 72 69 67 68 74 7d 23 6d 7b 77 69 64 74 68 3a 36 38 30 70 78 3b 6d 61 72 67 69 6e 3a 30 20 61 75 74 6f 7d 23 6e 76 7b 66 6f 6e 74 2d 73 69 7a 65 3a 31 36 70 78 3b 6d 61 72 67 69 6e 3a 30 20 30 20 34 70 78 3b 74 65 78 74 2d 61 6c 69 67 6e 3a 6c 65 66 74 3b 74 65 78 74 2d 69 6e 64 65 6e 74 3a 31 31 37 70 78 7d 23 6e 76 20 61 2c 23 6e 76 20 62 2c 2e 62 74 6e 2c 23 6c 6b 7b 66 6f 6e 74 2d 73 69 7a 65 3a 31 34 70 78 7d 23 66 6d 7b 70 61 64 64 69 6e 67 2d 6c 65 66 74 3a 39 30 70 78 3b 74 65 78 74 2d 61 6c 69 67 6e 3a 6c 65 66 74 7d 23 6b 77 7b 77 69 64 74 68 3a 34 30 34 70 78 3b 68 65 69 67 68 74 3a 32 32 70 78 3b 70 61 64 64 69 6e 67 3a 34 70 78 20 37 70 78 3b 70 61 64 64 69 6e 67 3a 36 70 78 20 37 70 78 20 32 70 78 5c 39 3b 66 6f 6e 74 3a 31 36 70 78 20 61 72 69 61 6c 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 75 72 6c 28 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 69 2d 31 2e 30 2e 30 2e 70 6e 67 29 20 6e 6f 2d 72 65 70 65 61 74 20 2d 33 30 34 70 78 20 30 3b 5f 62 61 63 6b 67 72 6f 75 6e 64 2d 61 74 74 61 63 68 6d 65 6e 74 3a 66 69 78 65 64 3b 62 6f 72 64 65 72 3a 31 70 78 20 73 6f 6c 69 64 20 23 63 64 63 64 63 64 3b 62 6f 72 64 65 72 2d 63 6f 6c 6f 72 3a 23 39 61 39 61 39 61 20 23 63 64 63 64 63 64 20 23 63 64 63 64 63 64 20 23 39 61 39 61 39 61 3b 76 65 72 74 69 63 61 6c 2d 61 6c 69 67 6e 3a 74 6f 70 7d 2e 62 74 6e 7b 77 69 64 74 68 3a 39 35 70 78 3b 68 65 69 67 68 74 3a 33 32 70 78 3b 70 61 64 64 69 6e 67 3a 30 3b 70 61 64 64 69 6e 67 2d 74 6f 70 3a 32 70 78 5c 39 3b 62 6f 72 64 65 72 3a 30 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 23 64 64 64 20 75 72 6c 28 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 69 2d 31 2e 30 2e 30 2e 70 6e 67 29 20 6e 6f 2d 72 65 70 65 61 74 3b 63 75 72 73 6f 72 3a 70 6f 69 6e 74 65 72 7d 2e 62 74 6e 5f 68 7b 62 61 63 6b 67 72 6f 75 6e 64 2d 70 6f 73 69 74 69 6f 6e 3a 2d 31 30 30 70 78 20 30 7d 23 6b 77 2c 2e 62 74 6e 5f 77 72 7b 6d 61 72 67 69 6e 3a 30 20 35 70 78 20 30 20 30 7d 2e 62 74 6e 5f 77 72 7b 77 69 64 74 68 3a 39 37 70 78 3b 68 65 69 67 68 74 3a 33 34 70 78 3b 64 69 73 70 6c 61 79 3a 69 6e 6c 69 6e 65 2d 62 6c 6f 63 6b 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 75 72 6c 28 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 69 2d 31 2e 30 2e 30 2e 70 6e 67 29 20 6e 6f 2d 72 65 70 65 61 74 20 2d 32 30 32 70 78 20 30 3b 5f 74 6f 70 3a 31 70 78 3b 2a 70 6f 73 69 74 69 6f 6e 3a 72 65 6c 61 74 69 76 65 7d 23 6c 6b 7b 6d 61 72 67 69 6e 3a 33 33 70 78 20 30 7d 23 6c 6b 20 73 70 61 6e 7b 66 6f 6e 74 3a 31 34 70 78 20 22 cb ce cc e5 22 7d 23 6c 6d 7b 68 65 69 67 68 74 3a 36 30 70 78 7d 23 6c 68 7b 6d 61 72 67 69 6e 3a 31 36 70 78 20 30 20 35 70 78 3b 77 6f 72 64 2d 73 70 61 63 69 6e 67 3a 33 70 78 7d 2e 74 6f 6f 6c 73 7b 70 6f 73 69 74 69 6f 6e 3a 61 62 73 6f 6c 75 74 65 3b 74 6f 70 3a 2d 34 70 78 3b 2a 74 6f 70 3a 31 30 70 78 3b 72 69 67 68 74 3a 2d 31 33 70 78 3b 7d 23 6d 48 6f 6c 64 65 72 7b 77 69 64 74 68 3a 36 32 70 78 3b 70 6f 73 69 74 69 6f 6e 3a 72 65 6c 61 74 69 76 65 3b 7a 2d 69 6e 64 65 78 3a 32 39 36 3b 64 69 73 70 6c 61 79 3a 6e 6f 6e 65 7d 23 6d 43 6f 6e 7b 68 65 69 67 68 74 3a 31 38 70 78 3b 6c 69 6e 65 2d 68 65 69 67 68 74 3a 31 38 70 78 3b 70 6f 73 69 74 69 6f 6e 3a 61 62 73 6f 6c 75 74 65 3b 63 75 72 73 6f 72 3a 70 6f 69 6e 74 65 72 3b 70 61 64 64 69 6e 67 3a 30 20 31 38 70 78 20 30 20 30 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 75 72 6c 28 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 62 67 2d 31 2e 30 2e 30 2e 67 69 66 29 20 6e 6f 2d 72 65 70 65 61 74 20 72 69 67 68 74 20 2d 31 33 34 70 78 3b 62 61 63 6b 67 72 6f 75 6e 64 2d 70 6f 73 69 74 69 6f 6e 3a 72 69 67 68 74 20 2d 31 33 36 70 78 5c 39 7d 23 6d 43 6f 6e 20 73 70 61 6e 7b 63 6f 6c 6f 72 3a 23 30 30 63 3b 63 75 72 73 6f 72 3a 64 65 66 61 75 6c 74 3b 64 69 73 70 6c 61 79 3a 62 6c 6f 63 6b 7d 23 6d 43 6f 6e 20 2e 68 77 7b 74 65 78 74 2d 64 65 63 6f 72 61 74 69 6f 6e 3a 75 6e 64 65 72 6c 69 6e 65 3b 63 75 72 73 6f 72 3a 70 6f 69 6e 74 65 72 7d 23 6d 4d 65 6e 75 7b 77 69 64 74 68 3a 35 36 70 78 3b 62 6f 72 64 65 72 3a 31 70 78 20 73 6f 6c 69 64 20 23 39 61 39 39 66 66 3b 6c 69 73 74 2d 73 74 79 6c 65 3a 6e 6f 6e 65 3b 70 6f 73 69 74 69 6f 6e 3a 61 62 73 6f 6c 75 74 65 3b 72 69 67 68 74 3a 37 70 78 3b 74 6f 70 3a 32 38 70 78 3b 64 69 73 70 6c 61 79 3a 6e 6f 6e 65 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 23 66 66 66 7d 23 6d 4d 65 6e 75 20 61 7b 77 69 64 74 68 3a 31 30 30 25 3b 68 65 69 67 68 74 3a 31 30 30 25 3b 64 69 73 70 6c 61 79 3a 62 6c 6f 63 6b 3b 6c 69 6e 65 2d 68 65 69 67 68 74 3a 32 32 70 78 3b 74 65 78 74 2d 69 6e 64 65 6e 74 3a 36 70 78 3b 74 65 78 74 2d 64 65 63 6f 72 61 74 69 6f 6e 3a 6e 6f 6e 65 7d 23 6d 4d 65 6e 75 20 61 3a 68 6f 76 65 72 7b 62 61 63 6b 67 72 6f 75 6e 64 3a 23 64 39 65 31 66 36 7d 23 6d 4d 65 6e 75 20 2e 6c 6e 7b 68 65 69 67 68 74 3a 31 70 78 3b 62 61 63 6b 67 72 6f 75 6e 64 3a 23 63 63 66 3b 6f 76 65 72 66 6c 6f 77 3a 68 69 64 64 65 6e 3b 6d 61 72 67 69 6e 3a 32 70 78 3b 66 6f 6e 74 2d 73 69 7a 65 3a 31 70 78 3b 6c 69 6e 65 2d 68 65 69 67 68 74 3a 31 70 78 7d 23 63 70 2c 23 63 70 20 61 7b 63 6f 6c 6f 72 3a 23 37 37 63 7d 23 73 65 74 68 7b 64 69 73 70 6c 61 79 3a 6e 6f 6e 65 3b 62 65 68 61 76 69 6f 72 3a 75 72 6c 28 23 64 65 66 61 75 6c 74 23 68 6f 6d 65 70 61 67 65 29 7d 23 73 65 74 66 7b 64 69 73 70 6c 61 79 3a 6e 6f 6e 65 7d 3c 2f 73 74 79 6c 65 3e 0d 0a 3c 2f 68 65 61 64 3e 0d 0a 0d 0a 3c 62 6f 64 79 3e 3c 64 69 76 20 69 64 3d 22 75 22 3e 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 67 61 6f 6a 69 2f 70 72 65 66 65 72 65 6e 63 65 73 2e 68 74 6d 6c 22 20 6e 61 6d 65 3d 22 74 6a 5f 73 65 74 74 69 6e 67 22 3e cb d1 cb f7 c9 e8 d6 c3 3c 2f 61 3e 26 6e 62 73 70 3b 7c 26 6e 62 73 70 3b 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 70 61 73 73 70 6f 72 74 2e 62 61 69 64 75 2e 63 6f 6d 2f 3f 6c 6f 67 69 6e 26 74 70 6c 3d 6d 6e 22 20 6e 61 6d 65 3d 22 74 6a 5f 6c 6f 67 69 6e 22 3e b5 c7 c2 bc 3c 2f 61 3e 3c 2f 64 69 76 3e 0d 0a 3c 64 69 76 20 69 64 3d 22 6d 22 3e 3c 70 20 69 64 3d 22 6c 67 22 3e 3c 69 6d 67 20 73 72 63 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 62 61 69 64 75 5f 73 79 6c 6f 67 6f 31 2e 67 69 66 22 20 77 69 64 74 68 3d 22 32 37 30 22 20 68 65 69 67 68 74 3d 22 31 32 39 22 20 75 73 65 6d 61 70 3d 22 23 6d 70 22 3e 3c 6d 61 70 20 6e 61 6d 65 3d 22 6d 70 22 3e 3c 61 72 65 61 20 73 68 61 70 65 3d 22 72 65 63 74 22 20 63 6f 6f 72 64 73 3d 22 34 30 2c 32 35 2c 32 33 30 2c 39 35 22 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 68 69 2e 62 61 69 64 75 2e 63 6f 6d 2f 62 61 69 64 75 2f 22 20 74 61 72 67 65 74 3d 22 5f 62 6c 61 6e 6b 22 20 74 69 74 6c 65 3d 22 b5 e3 b4 cb bd f8 c8 eb 20 b0 d9 b6 c8 b5 c4 bf d5 bc e4 22 20 3e 3c 2f 6d 61 70 3e 3c 2f 70 3e 3c 70 20 69 64 3d 22 6e 76 22 3e 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 6e 65 77 73 2e 62 61 69 64 75 2e 63 6f 6d 22 3e d0 c2 26 6e 62 73 70 3b ce c5 3c 2f 61 3e a1 a1 3c 62 3e cd f8 26 6e 62 73 70 3b d2 b3 3c 2f 62 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 74 69 65 62 61 2e 62 61 69 64 75 2e 63 6f 6d 22 3e cc f9 26 6e 62 73 70 3b b0 c9 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 7a 68 69 64 61 6f 2e 62 61 69 64 75 2e 63 6f 6d 22 3e d6 aa 26 6e 62 73 70 3b b5 c0 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 6d 70 33 2e 62 61 69 64 75 2e 63 6f 6d 22 3e 4d 50 33 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 69 6d 61 67 65 2e 62 61 69 64 75 2e 63 6f 6d 22 3e cd bc 26 6e 62 73 70 3b c6 ac 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 76 69 64 65 6f 2e 62 61 69 64 75 2e 63 6f 6d 22 3e ca d3 26 6e 62 73 70 3b c6 b5 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 6d 61 70 2e 62 61 69 64 75 2e 63 6f 6d 22 3e b5 d8 26 6e 62 73 70 3b cd bc 3c 2f 61 3e 3c 2f 70 3e 3c 64 69 76 20 69 64 3d 22 66 6d 22 3e 3c 66 6f 72 6d 20 6e 61 6d 65 3d 22 66 22 20 61 63 74 69 6f 6e 3d 22 2f 73 22 3e 3c 69 6e 70 75 74 20 74 79 70 65 3d 22 74 65 78 74 22 20 6e 61 6d 65 3d 22 77 64 22 20 69 64 3d 22 6b 77 22 20 6d 61 78 6c 65 6e 67 74 68 3d 22 31 30 30 22 3e 3c 69 6e 70 75 74 20 74 79 70 65 3d 22 68 69 64 64 65 6e 22 20 6e 61 6d 65 3d 22 72 73 76 5f 62 70 22 20 76 61 6c 75 65 3d 22 30 22 3e 3c 69 6e 70 75 74 20 74 79 70 65 3d 22 68 69 64 64 65 6e 22 20 6e 61 6d 65 3d 22 72 73 76 5f 73 70 74 22 20 76 61 6c 75 65 3d 22 33 22 3e 3c 73 70 61 6e 20 63 6c 61 73 73 3d 22 62 74 6e 5f 77 72 22 3e 3c 69 6e 70 75 74 20 74 79 70 65 3d 22 73 75 62 6d 69 74 22 20 76 61 6c 75 65 3d 22 b0 d9 b6 c8 d2 bb cf c2 22 20 69 64 3d 22 73 75 22 20 63 6c 61 73 73 3d 22 62 74 6e 22 20 6f 6e 6d 6f 75 73 65 64 6f 77 6e 3d 22 74 68 69 73 2e 63 6c 61 73 73 4e 61 6d 65 3d 27 62 74 6e 20 62 74 6e 5f 68 27 22 20 6f 6e 6d 6f 75 73 65 6f 75 74 3d 22 74 68 69 73 2e 63 6c 61 73 73 4e 61 6d 65 3d 27 62 74 6e 27 22 3e 3c 2f 73 70 61 6e 3e 3c 2f 66 6f 72 6d 3e 3c 73 70 61 6e 20 63 6c 61 73 73 3d 22 74 6f 6f 6c 73 22 3e 3c 73 70 61 6e 20 69 64 3d 22 6d 48 6f 6c 64 65 72 22 3e 3c 64 69 76 20 69 64 3d 22 6d 43 6f 6e 22 3e 3c 73 70 61 6e 3e ca e4 c8 eb b7 a8 3c 2f 73 70 61 6e 3e 3c 2f 64 69 76 3e 3c 2f 73 70 61 6e 3e 3c 2f 73 70 61 6e 3e 3c 75 6c 20 69 64 3d 22 6d 4d 65 6e 75 22 3e 3c 6c 69 3e 3c 61 20 68 72 65 66 3d 22 23 22 20 6e 61 6d 65 3d 22 69 6d 65 5f 68 77 22 3e ca d6 d0 b4 3c 2f 61 3e 3c 2f 6c 69 3e 3c 6c 69 3e 3c 61 20 68 72 65 66 3d 22 23 22 20 6e 61 6d 65 3d 22 69 6d 65 5f 70 79 22 3e c6 b4 d2 f4 3c 2f 61 3e 3c 2f 6c 69 3e 3c 6c 69 20 63 6c 61 73 73 3d 22 6c 6e 22 3e 3c 2f 6c 69 3e 3c 6c 69 3e 3c 61 20 68 72 65 66 3d 22 23 22 20 6e 61 6d 65 3d 22 69 6d 65 5f 63 6c 22 3e b9 d8 b1 d5 3c 2f 61 3e 3c 2f 6c 69 3e 3c 2f 75 6c 3e 3c 2f 64 69 76 3e 0d 0a 3c 70 20 69 64 3d 22 6c 6b 22 3e 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 68 69 2e 62 61 69 64 75 2e 63 6f 6d 22 3e bf d5 bc e4 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 62 61 69 6b 65 2e 62 61 69 64 75 2e 63 6f 6d 22 3e b0 d9 bf c6 3c 2f 61 3e a1 a1 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 68 61 6f 31 32 33 2e 63 6f 6d 22 3e 68 61 6f 31 32 33 3c 2f 61 3e 3c 73 70 61 6e 3e 20 7c 20 3c 61 20 68 72 65 66 3d 22 2f 6d 6f 72 65 2f 22 3e b8 fc b6 e0 26 67 74 3b 26 67 74 3b 3c 2f 61 3e 3c 2f 73 70 61 6e 3e 3c 2f 70 3e 3c 70 20 69 64 3d 22 6c 6d 22 3e 3c 2f 70 3e 3c 70 3e 3c 61 20 69 64 3d 22 73 65 74 68 22 20 6f 6e 43 6c 69 63 6b 3d 22 74 68 69 73 2e 73 65 74 48 6f 6d 65 50 61 67 65 28 27 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 27 29 22 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 75 74 69 6c 69 74 79 2e 62 61 69 64 75 2e 63 6f 6d 2f 74 72 61 66 2f 63 6c 69 63 6b 2e 70 68 70 3f 69 64 3d 32 31 35 26 75 72 6c 3d 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 22 20 6f 6e 6d 6f 75 73 65 64 6f 77 6e 3d 22 72 65 74 75 72 6e 20 6e 73 5f 63 28 7b 27 66 6d 27 3a 27 62 65 68 73 27 2c 27 74 61 62 27 3a 27 68 6f 6d 65 70 61 67 65 27 2c 27 70 6f 73 27 3a 30 7d 29 22 3e b0 d1 b0 d9 b6 c8 c9 e8 ce aa d6 f7 d2 b3 3c 2f 61 3e 3c 61 20 69 64 3d 22 73 65 74 66 22 20 6f 6e 43 6c 69 63 6b 3d 22 66 61 28 74 68 69 73 29 22 20 68 72 65 66 3d 22 6a 61 76 61 73 63 72 69 70 74 3a 76 6f 69 64 28 30 29 22 20 6f 6e 6d 6f 75 73 65 64 6f 77 6e 3d 22 72 65 74 75 72 6e 20 6e 73 5f 63 28 7b 27 66 6d 27 3a 27 62 65 68 73 27 2c 27 74 61 62 27 3a 27 66 61 76 6f 72 69 74 65 73 27 2c 27 70 6f 73 27 3a 30 7d 29 22 3e b0 d1 b0 d9 b6 c8 bc d3 c8 eb ca d5 b2 d8 bc d0 3c 2f 61 3e 3c 2f 70 3e 0d 0a 3c 70 20 69 64 3d 22 6c 68 22 3e 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 65 2e 62 61 69 64 75 2e 63 6f 6d 2f 3f 72 65 66 65 72 3d 38 38 38 22 3e bc d3 c8 eb b0 d9 b6 c8 cd c6 b9 e3 3c 2f 61 3e 20 7c 20 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 74 6f 70 2e 62 61 69 64 75 2e 63 6f 6d 22 3e cb d1 cb f7 b7 e7 d4 c6 b0 f1 3c 2f 61 3e 20 7c 20 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 68 6f 6d 65 2e 62 61 69 64 75 2e 63 6f 6d 22 3e b9 d8 d3 da b0 d9 b6 c8 3c 2f 61 3e 20 7c 20 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 69 72 2e 62 61 69 64 75 2e 63 6f 6d 22 3e 41 62 6f 75 74 20 42 61 69 64 75 3c 2f 61 3e 3c 2f 70 3e 3c 70 20 69 64 3d 22 63 70 22 3e 26 63 6f 70 79 3b 32 30 31 31 20 42 61 69 64 75 20 3c 61 20 68 72 65 66 3d 22 2f 64 75 74 79 2f 22 3e ca b9 d3 c3 b0 d9 b6 c8 c7 b0 b1 d8 b6 c1 3c 2f 61 3e 20 3c 61 20 68 72 65 66 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 6d 69 69 62 65 69 61 6e 2e 67 6f 76 2e 63 6e 22 20 74 61 72 67 65 74 3d 22 5f 62 6c 61 6e 6b 22 3e be a9 49 43 50 d6 a4 30 33 30 31 37 33 ba c5 3c 2f 61 3e 20 3c 69 6d 67 20 73 72 63 3d 22 68 74 74 70 3a 2f 2f 67 69 6d 67 2e 62 61 69 64 75 2e 63 6f 6d 2f 69 6d 67 2f 67 73 2e 67 69 66 22 3e 3c 2f 70 3e 3c 2f 64 69 76 3e 3c 2f 62 6f 64 79 3e 0d 0a 0d 0a 3c 73 63 72 69 70 74 3e 76 61 72 20 77 3d 77 69 6e 64 6f 77 2c 64 3d 64 6f 63 75 6d 65 6e 74 2c 6e 3d 6e 61 76 69 67 61 74 6f 72 2c 6b 3d 64 2e 66 2e 77 64 2c 61 3d 64 2e 67 65 74 45 6c 65 6d 65 6e 74 42 79 49 64 28 22 6e 76 22 29 2e 67 65 74 45 6c 65 6d 65 6e 74 73 42 79 54 61 67 4e 61 6d 65 28 22 61 22 29 2c 69 73 49 45 3d 6e 2e 75 73 65 72 41 67 65 6e 74 2e 69 6e 64 65 78 4f 66 28 22 4d 53 49 45 22 29 21 3d 2d 31 26 26 21 77 69 6e 64 6f 77 2e 6f 70 65 72 61 3b 66 6f 72 28 76 61 72 20 69 3d 30 3b 69 3c 61 2e 6c 65 6e 67 74 68 3b 69 2b 2b 29 7b 61 5b 69 5d 2e 6f 6e 63 6c 69 63 6b 3d 66 75 6e 63 74 69 6f 6e 28 29 7b 69 66 28 6b 2e 76 61 6c 75 65 2e 6c 65 6e 67 74 68 3e 30 29 7b 76 61 72 20 43 3d 74 68 69 73 2c 41 3d 43 2e 68 72 65 66 2c 42 3d 65 6e 63 6f 64 65 55 52 49 43 6f 6d 70 6f 6e 65 6e 74 28 6b 2e 76 61 6c 75 65 29 3b 69 66 28 41 2e 69 6e 64 65 78 4f 66 28 22 71 3d 22 29 21 3d 2d 31 29 7b 43 2e 68 72 65 66 3d 41 2e 72 65 70 6c 61 63 65 28 2f 71 3d 5b 5e 26 5c 78 32 34 5d 2a 2f 2c 22 71 3d 22 2b 42 29 7d 65 6c 73 65 7b 74 68 69 73 2e 68 72 65 66 2b 3d 22 3f 71 3d 22 2b 42 7d 7d 7d 7d 28 66 75 6e 63 74 69 6f 6e 28 29 7b 69 66 28 2f 71 3d 28 5b 5e 26 5d 2b 29 2f 2e 74 65 73 74 28 6c 6f 63 61 74 69 6f 6e 2e 73 65 61 72 63 68 29 29 7b 6b 2e 76 61 6c 75 65 3d 64 65 63 6f 64 65 55 52 49 43 6f 6d 70 6f 6e 65 6e 74 28 52 65 67 45 78 70 5b 22 5c 78 32 34 31 22 5d 29 7d 7d 29 28 29 3b 69 66 28 6e 2e 63 6f 6f 6b 69 65 45 6e 61 62 6c 65 64 26 26 21 2f 73 75 67 3f 3d 30 2f 2e 74 65 73 74 28 64 2e 63 6f 6f 6b 69 65 29 29 7b 64 2e 77 72 69 74 65 28 22 3c 73 63 72 69 70 74 20 73 72 63 3d 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 6a 73 2f 62 64 73 75 67 2e 6a 73 3f 76 3d 31 2e 30 2e 33 2e 30 3e 3c 5c 2f 73 63 72 69 70 74 3e 22 29 7d 66 75 6e 63 74 69 6f 6e 20 61 64 64 45 56 28 43 2c 42 2c 41 29 7b 69 66 28 77 2e 61 74 74 61 63 68 45 76 65 6e 74 29 7b 43 2e 61 74 74 61 63 68 45 76 65 6e 74 28 22 6f 6e 22 2b 42 2c 41 29 7d 65 6c 73 65 7b 69 66 28 77 2e 61 64 64 45 76 65 6e 74 4c 69 73 74 65 6e 65 72 29 7b 43 2e 61 64 64 45 76 65 6e 74 4c 69 73 74 65 6e 65 72 28 42 2c 41 2c 66 61 6c 73 65 29 7d 7d 7d 66 75 6e 63 74 69 6f 6e 20 47 28 41 29 7b 72 65 74 75 72 6e 20 64 2e 67 65 74 45 6c 65 6d 65 6e 74 42 79 49 64 28 41 29 7d 66 75 6e 63 74 69 6f 6e 20 6e 73 5f 63 28 45 29 7b 76 61 72 20 46 3d 65 6e 63 6f 64 65 55 52 49 43 6f 6d 70 6f 6e 65 6e 74 28 77 69 6e 64 6f 77 2e 64 6f 63 75 6d 65 6e 74 2e 6c 6f 63 61 74 69 6f 6e 2e 68 72 65 66 29 2c 44 3d 22 22 2c 41 3d 22 22 2c 42 3d 22 22 2c 43 3d 77 69 6e 64 6f 77 5b 22 42 44 5f 50 53 5f 43 22 2b 28 6e 65 77 20 44 61 74 65 28 29 29 2e 67 65 74 54 69 6d 65 28 29 5d 3d 6e 65 77 20 49 6d 61 67 65 28 29 3b 66 6f 72 28 76 20 69 6e 20 45 29 7b 41 3d 45 5b 76 5d 3b 44 2b 3d 76 2b 22 3d 22 2b 41 2b 22 26 22 7d 42 3d 22 26 6d 75 3d 22 2b 46 3b 43 2e 73 72 63 3d 22 68 74 74 70 3a 2f 2f 6e 73 63 6c 69 63 6b 2e 62 61 69 64 75 2e 63 6f 6d 2f 76 2e 67 69 66 3f 70 69 64 3d 32 30 31 26 70 6a 3d 77 77 77 26 22 2b 44 2b 22 70 61 74 68 3d 22 2b 46 2b 22 26 74 3d 22 2b 6e 65 77 20 44 61 74 65 28 29 2e 67 65 74 54 69 6d 65 28 29 3b 72 65 74 75 72 6e 20 74 72 75 65 7d 69 66 28 2f 5c 62 62 64 69 6d 65 3d 5b 31 32 5d 2f 2e 74 65 73 74 28 64 2e 63 6f 6f 6b 69 65 29 29 7b 64 6f 63 75 6d 65 6e 74 2e 77 72 69 74 65 28 22 3c 73 63 72 69 70 74 20 73 72 63 3d 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 63 61 63 68 65 2f 69 6d 65 2f 6a 73 2f 6f 70 65 6e 69 6d 65 2d 31 2e 30 2e 30 2e 6a 73 3e 3c 5c 2f 73 63 72 69 70 74 3e 22 29 7d 28 66 75 6e 63 74 69 6f 6e 28 29 7b 76 61 72 20 42 3d 47 28 22 75 73 65 72 22 29 2c 41 3d 47 28 22 75 73 65 72 4d 65 6e 75 22 29 3b 69 66 28 42 26 26 41 29 7b 61 64 64 45 56 28 42 2c 22 63 6c 69 63 6b 22 2c 66 75 6e 63 74 69 6f 6e 28 43 29 7b 41 2e 73 74 79 6c 65 2e 64 69 73 70 6c 61 79 3d 41 2e 73 74 79 6c 65 2e 64 69 73 70 6c 61 79 3d 3d 22 62 6c 6f 63 6b 22 3f 22 6e 6f 6e 65 22 3a 22 62 6c 6f 63 6b 22 3b 77 69 6e 64 6f 77 2e 65 76 65 6e 74 3f 43 2e 63 61 6e 63 65 6c 42 75 62 62 6c 65 3d 74 72 75 65 3a 43 2e 73 74 6f 70 50 72 6f 70 61 67 61 74 69 6f 6e 28 29 7d 29 3b 61 64 64 45 56 28 64 6f 63 75 6d 65 6e 74 2c 22 63 6c 69 63 6b 22 2c 66 75 6e 63 74 69 6f 6e 28 29 7b 41 2e 73 74 79 6c 65 2e 64 69 73 70 6c 61 79 3d 22 6e 6f 6e 65 22 7d 29 7d 7d 29 28 29 3b 28 66 75 6e 63 74 69 6f 6e 28 29 7b 76 61 72 20 45 3d 47 28 22 75 22 29 2e 67 65 74 45 6c 65 6d 65 6e 74 73 42 79 54 61 67 4e 61 6d 65 28 22 61 22 29 2c 43 3d 47 28 22 6e 76 22 29 2e 67 65 74 45 6c 65 6d 65 6e 74 73 42 79 54 61 67 4e 61 6d 65 28 22 61 22 29 2c 49 3d 47 28 22 6c 6b 22 29 2e 67 65 74 45 6c 65 6d 65 6e 74 73 42 79 54 61 67 4e 61 6d 65 28 22 61 22 29 2c 42 3d 22 22 3b 76 61 72 20 41 3d 5b 22 6e 65 77 73 22 2c 22 74 69 65 62 61 22 2c 22 7a 68 69 64 61 6f 22 2c 22 6d 70 33 22 2c 22 69 6d 67 22 2c 22 76 69 64 65 6f 22 2c 22 6d 61 70 22 5d 3b 76 61 72 20 48 3d 5b 22 68 69 22 2c 22 62 61 69 6b 65 22 2c 22 68 61 6f 31 32 33 22 2c 22 6d 6f 72 65 22 5d 3b 69 66 28 47 28 22 75 6e 22 29 26 26 47 28 22 75 6e 22 29 2e 69 6e 6e 65 72 48 54 4d 4c 21 3d 22 22 29 7b 42 3d 47 28 22 75 6e 22 29 2e 69 6e 6e 65 72 48 54 4d 4c 7d 66 75 6e 63 74 69 6f 6e 20 44 28 4a 29 7b 61 64 64 45 56 28 4a 2c 22 6d 6f 75 73 65 64 6f 77 6e 22 2c 66 75 6e 63 74 69 6f 6e 28 4c 29 7b 76 61 72 20 4c 3d 4c 7c 7c 77 69 6e 64 6f 77 2e 65 76 65 6e 74 3b 76 61 72 20 4b 3d 4c 2e 74 61 72 67 65 74 7c 7c 4c 2e 73 72 63 45 6c 65 6d 65 6e 74 3b 6e 73 5f 63 28 7b 66 6d 3a 22 62 65 68 73 22 2c 74 61 62 3a 4b 2e 6e 61 6d 65 7c 7c 22 74 6a 5f 75 73 65 72 22 2c 75 6e 3a 65 6e 63 6f 64 65 55 52 49 43 6f 6d 70 6f 6e 65 6e 74 28 42 29 7d 29 7d 29 7d 66 6f 72 28 76 61 72 20 46 3d 30 3b 46 3c 45 2e 6c 65 6e 67 74 68 3b 46 2b 2b 29 7b 44 28 45 5b 46 5d 29 7d 66 6f 72 28 76 61 72 20 46 3d 30 3b 46 3c 43 2e 6c 65 6e 67 74 68 3b 46 2b 2b 29 7b 43 5b 46 5d 2e 6e 61 6d 65 3d 22 74 6a 5f 22 2b 41 5b 46 5d 3b 44 28 43 5b 46 5d 29 7d 66 6f 72 28 76 61 72 20 46 3d 30 3b 46 3c 49 2e 6c 65 6e 67 74 68 3b 46 2b 2b 29 7b 49 5b 46 5d 2e 6e 61 6d 65 3d 22 74 6a 5f 22 2b 48 5b 46 5d 3b 44 28 49 5b 46 5d 29 7d 7d 29 28 29 3b 61 64 64 45 56 28 77 2c 22 6c 6f 61 64 22 2c 66 75 6e 63 74 69 6f 6e 28 29 7b 6b 2e 66 6f 63 75 73 28 29 7d 29 3b 77 2e 6f 6e 75 6e 6c 6f 61 64 3d 66 75 6e 63 74 69 6f 6e 28 29 7b 7d 3b 3c 2f 73 63 72 69 70 74 3e 0d 0a 0d 0a 0d 0a 3c 73 63 72 69 70 74 20 74 79 70 65 3d 22 74 65 78 74 2f 6a 61 76 61 73 63 72 69 70 74 22 20 73 72 63 3d 22 68 74 74 70 3a 2f 2f 77 77 77 2e 62 61 69 64 75 2e 63 6f 6d 2f 63 61 63 68 65 2f 68 70 73 2f 6a 73 2f 68 70 73 2d 31 2e 32 2e 6a 73 22 3e 3c 2f 73 63 72 69 70 74 3e 0d 0a 0d 0a 3c 2f 68 74 6d 6c 3e 3c 21 2d 2d 62 37 36 32 33 34 35 64 39 37 39 35 36 32 65 38 2d 2d 3e", + "string": "百度一下,你就知道 \r\n\r\n\r\n
搜索设置 | 登录
\r\n

新 闻 网 页 贴 吧 知 道 MP3 图 片 视 频 地 图

\r\n

空间 百科 hao123 | 更多>>

把百度设为主页把百度加入收藏夹

\r\n

加入百度推广 | 搜索风云榜 | 关于百度 | About Baidu

©2011 Baidu 使用百度前必读 京ICP证030173号

\r\n\r\n\r\n\r\n\r\n\r\n\r\n" + } +} diff --git a/test/gbk-test.js b/test/gbk-test.js index 75c3a901..b74033d0 100644 --- a/test/gbk-test.js +++ b/test/gbk-test.js @@ -1,46 +1,42 @@ "use strict"; -var fs = require("fs"), - assert = require("assert"), - Buffer = require("safer-buffer").Buffer, - iconv = require("../"); +const assert = require("assert"), + utils = require("./utils"), + fixtures = require("./fixtures/gbk-big5.json"), + iconv = utils.requireIconv(); -var testString = "中国abc", //unicode contains GBK-code and ascii - testStringGBKBuffer = Buffer.from([0xd6, 0xd0, 0xb9, 0xfa, 0x61, 0x62, 0x63]); +const testString = "中国abc", //unicode contains GBK-code and ascii + testStringGBKBuffer = utils.bytes("d6 d0 b9 fa 61 62 63"); -describe("GBK tests", function () { +describe("GBK tests #node-web", function () { it("GBK correctly encoded/decoded", function () { assert.strictEqual( - iconv.encode(testString, "GBK").toString("binary"), - testStringGBKBuffer.toString("binary") + utils.hex(iconv.encode(testString, "GBK")), + utils.hex(testStringGBKBuffer) ); assert.strictEqual(iconv.decode(testStringGBKBuffer, "GBK"), testString); }); it("GB2312 correctly encoded/decoded", function () { assert.strictEqual( - iconv.encode(testString, "GB2312").toString("binary"), - testStringGBKBuffer.toString("binary") + utils.hex(iconv.encode(testString, "GB2312")), + utils.hex(testStringGBKBuffer) ); assert.strictEqual(iconv.decode(testStringGBKBuffer, "GB2312"), testString); }); it("GBK file read decoded,compare with iconv result", function () { - var contentBuffer = fs.readFileSync(__dirname + "/gbkFile.txt"); - var str = iconv.decode(contentBuffer, "GBK"); - var iconvc = new (require("iconv").Iconv)("GBK", "utf8"); - assert.strictEqual(iconvc.convert(contentBuffer).toString(), str); + const contentBuffer = utils.bytes(fixtures.gbk.bytes); + const str = iconv.decode(contentBuffer, "GBK"); + assert.strictEqual(fixtures.gbk.string, str); }); it("GBK correctly decodes and encodes characters · and ×", function () { // https://github.com/ashtuchkin/iconv-lite/issues/13 // Reference: http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT - var chars = "·×"; - var gbkChars = Buffer.from([0xa1, 0xa4, 0xa1, 0xc1]); - assert.strictEqual( - iconv.encode(chars, "GBK").toString("binary"), - gbkChars.toString("binary") - ); + const chars = "·×"; + const gbkChars = utils.bytes("a1 a4 a1 c1"); + assert.strictEqual(utils.hex(iconv.encode(chars, "GBK")), utils.hex(gbkChars)); assert.strictEqual(iconv.decode(gbkChars, "GBK"), chars); }); @@ -48,8 +44,8 @@ describe("GBK tests", function () { // Euro character (U+20AC) has two encodings in GBK family: 0x80 and 0xA2 0xE3 // According to W3C's technical recommendation (https://www.w3.org/TR/encoding/#gbk-encoder), // Both GBK and GB18030 decoders should accept both encodings. - var gbkEuroEncoding1 = Buffer.from([0x80]), - gbkEuroEncoding2 = Buffer.from([0xa2, 0xe3]), + const gbkEuroEncoding1 = utils.bytes("80"), + gbkEuroEncoding2 = utils.bytes("a2 e3"), strEuro = "€"; assert.strictEqual(iconv.decode(gbkEuroEncoding1, "GBK"), strEuro); @@ -58,13 +54,10 @@ describe("GBK tests", function () { assert.strictEqual(iconv.decode(gbkEuroEncoding2, "GB18030"), strEuro); // But when decoding, GBK should produce 0x80, but GB18030 - 0xA2 0xE3. + assert.strictEqual(utils.hex(iconv.encode(strEuro, "GBK")), utils.hex(gbkEuroEncoding1)); assert.strictEqual( - iconv.encode(strEuro, "GBK").toString("hex"), - gbkEuroEncoding1.toString("hex") - ); - assert.strictEqual( - iconv.encode(strEuro, "GB18030").toString("hex"), - gbkEuroEncoding2.toString("hex") + utils.hex(iconv.encode(strEuro, "GB18030")), + utils.hex(gbkEuroEncoding2) ); }); @@ -92,65 +85,54 @@ describe("GBK tests", function () { ); }); - function swapBytes(buf) { - for (var i = 0; i < buf.length; i += 2) buf.writeUInt16LE(buf.readUInt16BE(i), i); - return buf; - } - function spacify4(str) { - return str.replace(/(....)/g, "$1 ").trim(); - } - function strToHex(str) { - return spacify4(swapBytes(Buffer.from(str, "ucs2")).toString("hex")); - } - it("GB18030 encodes/decodes 4 byte sequences", function () { - var chars = { - "\u0080": Buffer.from([0x81, 0x30, 0x81, 0x30]), - "\u0081": Buffer.from([0x81, 0x30, 0x81, 0x31]), - "\u008b": Buffer.from([0x81, 0x30, 0x82, 0x31]), - "\u0615": Buffer.from([0x81, 0x31, 0x82, 0x31]), - 㦟: Buffer.from([0x82, 0x31, 0x82, 0x31]), - "\udbd9\ude77": Buffer.from([0xe0, 0x31, 0x82, 0x31]), + const chars = { + "\u0080": utils.bytes("81 30 81 30"), + "\u0081": utils.bytes("81 30 81 31"), + "\u008b": utils.bytes("81 30 82 31"), + "\u0615": utils.bytes("81 31 82 31"), + 㦟: utils.bytes("82 31 82 31"), + "\udbd9\ude77": utils.bytes("e0 31 82 31"), }; - for (var uChar in chars) { - var gbkBuf = chars[uChar]; + for (const uChar in chars) { + const gbkBuf = chars[uChar]; + assert.strictEqual(utils.hex(iconv.encode(uChar, "GB18030")), utils.hex(gbkBuf)); assert.strictEqual( - iconv.encode(uChar, "GB18030").toString("hex"), - gbkBuf.toString("hex") + utils.strToHex(iconv.decode(gbkBuf, "GB18030")), + utils.strToHex(uChar) ); - assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar)); } }); it("GB18030 correctly decodes incomplete 4 byte sequences", function () { - var chars = { - "�": Buffer.from([0x82]), - "�1": Buffer.from([0x82, 0x31]), - "�1�": Buffer.from([0x82, 0x31, 0x82]), - 㦟: Buffer.from([0x82, 0x31, 0x82, 0x31]), - "� ": Buffer.from([0x82, 0x20]), - "�1 ": Buffer.from([0x82, 0x31, 0x20]), - "�1� ": Buffer.from([0x82, 0x31, 0x82, 0x20]), - "\u399f ": Buffer.from([0x82, 0x31, 0x82, 0x31, 0x20]), - "�1\u4fdb": Buffer.from([0x82, 0x31, 0x82, 0x61]), - "�1\u5010\u0061": Buffer.from([0x82, 0x31, 0x82, 0x82, 0x61]), - 㦟俛: Buffer.from([0x82, 0x31, 0x82, 0x31, 0x82, 0x61]), - "�1\u50101�1": Buffer.from([0x82, 0x31, 0x82, 0x82, 0x31, 0x82, 0x31]), + const chars = { + "�": utils.bytes("82"), + "�1": utils.bytes("82 31"), + "�1�": utils.bytes("82 31 82"), + 㦟: utils.bytes("82 31 82 31"), + "� ": utils.bytes("82 20"), + "�1 ": utils.bytes("82 31 20"), + "�1� ": utils.bytes("82 31 82 20"), + "\u399f ": utils.bytes("82 31 82 31 20"), + "�1\u4fdb": utils.bytes("82 31 82 61"), + "�1\u5010\u0061": utils.bytes("82 31 82 82 61"), + 㦟俛: utils.bytes("82 31 82 31 82 61"), + "�1\u50101�1": utils.bytes("82 31 82 82 31 82 31"), }; - for (var uChar in chars) { - var gbkBuf = chars[uChar]; - assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar)); + for (const uChar in chars) { + const gbkBuf = chars[uChar]; + assert.strictEqual( + utils.strToHex(iconv.decode(gbkBuf, "GB18030")), + utils.strToHex(uChar) + ); } }); it("GB18030:2005 changes are applied", function () { // See https://github.com/whatwg/encoding/issues/22 - var chars = "\u1E3F\u0000\uE7C7"; // Use \u0000 as separator - var gbkChars = Buffer.from([0xa8, 0xbc, 0x00, 0x81, 0x35, 0xf4, 0x37]); + const chars = "\u1E3F\u0000\uE7C7"; // Use \u0000 as separator + const gbkChars = utils.bytes("a8 bc 00 81 35 f4 37"); assert.strictEqual(iconv.decode(gbkChars, "GB18030"), chars); - assert.strictEqual( - iconv.encode(chars, "GB18030").toString("hex"), - gbkChars.toString("hex") - ); + assert.strictEqual(utils.hex(iconv.encode(chars, "GB18030")), utils.hex(gbkChars)); }); }); diff --git a/test/shiftjis-test.js b/test/shiftjis-test.js index e7a7fbb3..52d9ca48 100644 --- a/test/shiftjis-test.js +++ b/test/shiftjis-test.js @@ -1,45 +1,45 @@ "use strict"; -var assert = require("assert"), - Buffer = require("safer-buffer").Buffer, - iconv = require("../"); +const assert = require("assert"), + utils = require("./utils"), + iconv = utils.requireIconv(); -describe("ShiftJIS tests", function () { +describe("ShiftJIS tests #node-web", function () { it("ShiftJIS correctly encoded/decoded", function () { - var testString = "中文abc", //unicode contains ShiftJIS-code and ascii - testStringBig5Buffer = Buffer.from([0x92, 0x86, 0x95, 0xb6, 0x61, 0x62, 0x63]), + const testString = "中文abc", //unicode contains ShiftJIS-code and ascii + testStringBig5Buffer = utils.bytes("92 86 95 b6 61 62 63"), testString2 = "測試", - testStringBig5Buffer2 = Buffer.from([0x91, 0xaa, 0x8e, 0x8e]); + testStringBig5Buffer2 = utils.bytes("91 aa 8e 8e"); assert.strictEqual( - iconv.encode(testString, "shiftjis").toString("hex"), - testStringBig5Buffer.toString("hex") + utils.hex(iconv.encode(testString, "shiftjis")), + utils.hex(testStringBig5Buffer) ); assert.strictEqual(iconv.decode(testStringBig5Buffer, "shiftjis"), testString); assert.strictEqual( - iconv.encode(testString2, "shiftjis").toString("hex"), - testStringBig5Buffer2.toString("hex") + utils.hex(iconv.encode(testString2, "shiftjis")), + utils.hex(testStringBig5Buffer2) ); assert.strictEqual(iconv.decode(testStringBig5Buffer2, "shiftjis"), testString2); }); it("ShiftJIS extended chars are decoded, but not encoded", function () { - var buf = Buffer.from("ed40eefceeef", "hex"), + const buf = utils.bytes("ed 40 ee fc ee ef"), str = "纊"ⅰ", - res = "fa5cfa57fa40", // repeated block (these same chars are repeated in the different place) - buf2 = Buffer.from("f040f2fcf940", "hex"), + res = "fa 5c fa 57 fa 40", // repeated block (these same chars are repeated in the different place) + buf2 = utils.bytes("f0 40 f2 fc f9 40"), str2 = "", - res2 = "3f3f3f"; // non-repeated, UA block. + res2 = "3f 3f 3f"; // non-repeated, UA block. assert.strictEqual(iconv.decode(buf, "shiftjis"), str); assert.strictEqual(iconv.decode(buf2, "shiftjis"), str2); - assert.strictEqual(iconv.encode(str, "shiftjis").toString("hex"), res); - assert.strictEqual(iconv.encode(str2, "shiftjis").toString("hex"), res2); + assert.strictEqual(utils.hex(iconv.encode(str, "shiftjis")), res); + assert.strictEqual(utils.hex(iconv.encode(str2, "shiftjis")), res2); }); it("ShiftJIS includes extensions", function () { - assert.strictEqual(iconv.decode(Buffer.from("8740", "hex"), "shiftjis"), "①"); - assert.strictEqual(iconv.encode("①", "shiftjis").toString("hex"), "8740"); + assert.strictEqual(iconv.decode(utils.bytes("87 40"), "shiftjis"), "①"); + assert.strictEqual(utils.hex(iconv.encode("①", "shiftjis")), "87 40"); }); }); diff --git a/test/webpack/iconv-lite-tests.js b/test/webpack/iconv-lite-tests.js index c2690bee..9b1ad0fb 100644 --- a/test/webpack/iconv-lite-tests.js +++ b/test/webpack/iconv-lite-tests.js @@ -11,3 +11,6 @@ require("../sbcs-test"); require("../turkish-test"); require("../utf16-test"); require("../utils-test"); +require("../shiftjis-test"); +require("../gbk-test"); +require("../big5-test"); From 1f5c89e676c8de4f3d0d82896109868dacae9138 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ng=E1=BB=8Dc=20Tr=E1=BA=A7n?= Date: Wed, 29 Jul 2020 15:19:30 +0700 Subject: [PATCH 2/7] add byteLength method and hasState property --- encodings/dbcs-codec.js | 122 ++++++++++++++++++++++++++++++++++++++++ encodings/internal.js | 57 +++++++++++++++++++ encodings/sbcs-codec.js | 12 +++- encodings/utf16.js | 28 ++++++++- encodings/utf32.js | 58 +++++++++++++++++++ encodings/utf7.js | 96 ++++++++++++++++++++++++++++--- lib/index.d.ts | 5 ++ lib/index.js | 4 ++ 8 files changed, 371 insertions(+), 11 deletions(-) diff --git a/encodings/dbcs-codec.js b/encodings/dbcs-codec.js index 456c8ac7..594f29ec 100644 --- a/encodings/dbcs-codec.js +++ b/encodings/dbcs-codec.js @@ -332,6 +332,124 @@ class DBCSEncoder { this.gb18030 = codec.gb18030; } + byteLength(str) { + let byteLength = 0; + let leadSurrogate = -1, + seqObj = undefined, + nextChar = -1, + i = 0, + + for (; ;) { + // 0. Get next character. + let uCode; + if (nextChar === -1) { + if (i === str.length) break; + uCode = str.charCodeAt(i++); + } else { + uCode = nextChar; + nextChar = -1; + } + + // 1. Handle surrogates. + if (0xd800 <= uCode && uCode < 0xe000) { + // Char is one of surrogates. + if (uCode < 0xdc00) { + // We've got a lead surrogate. + if (leadSurrogate === -1) { + leadSurrogate = uCode; + continue; + } else { + leadSurrogate = uCode; + // Double lead surrogate found. + uCode = UNASSIGNED; + } + } else { + // We've got trail surrogate. + if (leadSurrogate !== -1) { + uCode = 0x10000 + (leadSurrogate - 0xd800) * 0x400 + (uCode - 0xdc00); + leadSurrogate = -1; + } else { + // Incomplete surrogate pair - only trail surrogate found. + uCode = UNASSIGNED; + } + } + } else if (leadSurrogate !== -1) { + // Incomplete surrogate pair - only lead surrogate found. + nextChar = uCode; + uCode = UNASSIGNED; + leadSurrogate = -1; + } + + // 2. Convert uCode character. + let dbcsCode = UNASSIGNED; + if (seqObj !== undefined && uCode !== UNASSIGNED) { + // We are in the middle of the sequence + let resCode = seqObj[uCode]; + if (typeof resCode === "object") { + // Sequence continues. + seqObj = resCode; + continue; + } else if (typeof resCode == "number") { + // Sequence finished. + dbcsCode = resCode; + } else if (resCode === undefined) { + // Current character is not part of the sequence. + + // Try default character for this sequence + resCode = seqObj[DEF_CHAR]; + if (resCode !== undefined) { + dbcsCode = resCode; // Found. + nextChar = uCode; // Current character will be written too in the next iteration. + } else { + // Skip + } + } + seqObj = undefined; + } else if (uCode >= 0) { + // Regular character + const subtable = this.encodeTable[uCode >> 8]; + if (subtable !== undefined) dbcsCode = subtable[uCode & 0xff]; + + if (dbcsCode <= SEQ_START) { + // Sequence start + seqObj = this.encodeTableSeq[SEQ_START - dbcsCode]; + continue; + } + + if (dbcsCode === UNASSIGNED && this.gb18030) { + // Use GB18030 algorithm to find character(s) to count. + const idx = findIdx(this.gb18030.uChars, uCode); + if (idx !== -1) { + dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]); + dbcsCode = dbcsCode % 12600 % 1260 % 10; + byteLength += 4; + continue; + } + } + } + + // 3. Count dbcsCode character. + if (dbcsCode === UNASSIGNED) { + dbcsCode = this.defaultCharSingleByte; + } + + if (dbcsCode < 0x100) + byteLength += 1; + else if (dbcsCode < 0x10000) + byteLength += 2; + else if (dbcsCode < 0x1000000) + byteLength += 3; + else + byteLength += 4; + } + + return byteLength; + } + + get hasState() { + return this.leadSurrogate !== -1 || this.seqObj !== undefined; + } + write(str) { const bytes = this.backend.allocBytes(str.length * (this.gb18030 ? 4 : 3)); let leadSurrogate = this.leadSurrogate, @@ -521,6 +639,10 @@ class DBCSDecoder { this.gb18030 = codec.gb18030; } + get hasState() { + return this.prevBytes.length > 0; + } + write(buf) { const chars = this.backend.allocRawChars(buf.length), prevBytes = this.prevBytes, diff --git a/encodings/internal.js b/encodings/internal.js index f83c1469..67db325d 100644 --- a/encodings/internal.js +++ b/encodings/internal.js @@ -56,6 +56,13 @@ function InternalDecoder(options, codec) { this.decoder = new StringDecoder(codec.enc); } +Object.defineProperty(InternalDecoder.prototype, 'hasState', { + get: function () { + // TODO: hopefully this will not be changed in newer version of NodeJS + return this.decoder['lastNeed'] !== 0; + } +}); + InternalDecoder.prototype.write = function (buf) { if (!Buffer.isBuffer(buf)) { buf = Buffer.from(buf); @@ -75,6 +82,16 @@ function InternalEncoder(options, codec) { this.enc = codec.enc; } +Object.defineProperty(InternalEncoder.prototype, 'hasState', { + get: function () { + return false; + } +}); + +InternalEncoder.prototype.byteLength = function (str) { + return Buffer.byteLength(str, this.enc); +} + InternalEncoder.prototype.write = function (str) { return Buffer.from(str, this.enc); }; @@ -88,6 +105,20 @@ function InternalEncoderBase64() { this.prevStr = ""; } +Object.defineProperty(InternalEncoderBase64.prototype, 'hasState', { + get: function () { + return this.prevStr.length > 0; + } +}); + +InternalEncoderBase64.prototype.byteLength = function (str) { + var completeQuads = str.length - (str.length % 4); + str = str.slice(0, completeQuads); + var nonPaddedLength = str.search(/=*$/); + if (nonPaddedLength === -1) nonPaddedLength = str.length; + return Math.floor(nonPaddedLength * 3 / 4); +} + InternalEncoderBase64.prototype.write = function (str) { str = this.prevStr + str; var completeQuads = str.length - (str.length % 4); @@ -106,6 +137,26 @@ InternalEncoderBase64.prototype.end = function () { function InternalEncoderCesu8() {} +Object.defineProperty(InternalEncoderBase64.prototype, 'hasState', { + get: function () { + return false; + } +}); + +InternalEncoderCesu8.prototype.byteLength = function (str) { + let byteLength = 0; + for (let i = 0; i < str.length; i++) { + const charCode = str.charCodeAt(i); + if (charCode < 0x80) + byteLength += 1; + else if (charCode < 0x800) + byteLength += 2; + else + byteLength += 3; + } + return byteLength; +} + InternalEncoderCesu8.prototype.write = function (str) { const buf = Buffer.alloc(str.length * 3); let bufIdx = 0; @@ -140,6 +191,12 @@ function InternalDecoderCesu8(options, codec) { this.defaultCharUnicode = codec.defaultCharUnicode; } +Object.defineProperty(InternalDecoderCesu8.prototype, 'hasState', { + get: function () { + return this.contBytes > 0; + } +}); + InternalDecoderCesu8.prototype.write = function (buf) { let acc = this.acc, contBytes = this.contBytes, diff --git a/encodings/sbcs-codec.js b/encodings/sbcs-codec.js index 82014df5..1d1ea6e4 100644 --- a/encodings/sbcs-codec.js +++ b/encodings/sbcs-codec.js @@ -58,6 +58,12 @@ class SBCSEncoder { this.encodeBuf = codec.encodeBuf; } + byteLength(str) { + return str.length; + } + + get hasState() { return false; } + write(str) { const bytes = this.backend.allocBytes(str.length); @@ -68,7 +74,7 @@ class SBCSEncoder { return this.backend.bytesToResult(bytes, bytes.length); } - end() {} + end() { } } class SBCSDecoder { @@ -77,6 +83,8 @@ class SBCSDecoder { this.backend = backend; } + get hasState() { return false; } + write(buf) { // Strings are immutable in JS -> we use ucs2 buffer to speed up computations. const decodeBuf = this.decodeBuf; @@ -88,5 +96,5 @@ class SBCSDecoder { return this.backend.rawCharsToResult(chars, chars.length); } - end() {} + end() { } } diff --git a/encodings/utf16.js b/encodings/utf16.js index 14cd3c1e..ddfb4726 100644 --- a/encodings/utf16.js +++ b/encodings/utf16.js @@ -22,6 +22,12 @@ class Utf16LEEncoder { this.backend = backend; } + byteLength(str) { + return str.length * 2; + } + + get hasState() { return false; } + write(str) { const bytes = this.backend.allocBytes(str.length * 2); const chars = new Uint16Array(bytes.buffer, bytes.byteOffset, str.length); @@ -31,7 +37,7 @@ class Utf16LEEncoder { return this.backend.bytesToResult(bytes, bytes.length); } - end() {} + end() { } } class Utf16LEDecoder { @@ -42,6 +48,10 @@ class Utf16LEDecoder { this.leadSurrogate = undefined; } + get hasState() { + return this.leadSurrogate || this.leadByte !== -1; + } + write(buf) { // NOTE: This function is mostly the same as Utf16BEDecoder.write() with bytes swapped. // Please keep them in sync. @@ -158,6 +168,12 @@ class Utf16BEEncoder { this.backend = backend; } + byteLength(str) { + return str.length * 2; + } + + get hasState() { return false; } + write(str) { const bytes = this.backend.allocBytes(str.length * 2); let bytesPos = 0; @@ -169,7 +185,7 @@ class Utf16BEEncoder { return this.backend.bytesToResult(bytes, bytesPos); } - end() {} + end() { } } class Utf16BEDecoder { @@ -180,6 +196,10 @@ class Utf16BEDecoder { this.leadSurrogate = undefined; } + get hasState() { + return this.leadSurrogate || this.leadByte !== -1; + } + write(buf) { // NOTE: This function is mostly copy/paste from Utf16LEDecoder.write() with bytes swapped. // Please keep them in sync. Comments in that function apply here too. @@ -292,6 +312,10 @@ class Utf16Decoder { this.iconv = iconv; } + get hasState() { + return this.initialBufsLen !== 0 || (this.decoder != null && this.decoder.hasState); + } + write(buf) { if (!this.decoder) { // Codec is not chosen yet. Accumulate initial bytes. diff --git a/encodings/utf32.js b/encodings/utf32.js index be92fe27..543d37c2 100644 --- a/encodings/utf32.js +++ b/encodings/utf32.js @@ -29,6 +29,12 @@ function Utf32Encoder(options, codec) { this.highSurrogate = 0; } +Object.defineProperty(Utf32Encoder.prototype, 'hasState', { + get: function () { + return !!this.highSurrogate; + } +}); + Utf32Encoder.prototype.write = function (str) { var src = Buffer.from(str, "ucs2"); var dst = Buffer.alloc(src.length * 2); @@ -76,6 +82,36 @@ Utf32Encoder.prototype.write = function (str) { return dst; }; +Utf32Encoder.prototype.byteLength = function (str) { + var byteLength = 0; + var currentHighSurrogate = 0; + + for (var i = 0; i < str.length; i++) { + var code = str.charCodeAt(i); + var isHighSurrogate = (0xd800 <= code && code < 0xdc00); // prettier-ignore + var isLowSurrogate = (0xdc00 <= code && code < 0xe000); // prettier-ignore + + if (currentHighSurrogate) { + if (isHighSurrogate || !isLowSurrogate) { + byteLength += 4; + } else { + byteLength += 4; + currentHighSurrogate = 0; + continue; + } + } + + if (isHighSurrogate) { + currentHighSurrogate = code; + } else { + byteLength += 4; + currentHighSurrogate = 0; + } + } + + return byteLength; +} + Utf32Encoder.prototype.end = function () { // Treat any leftover high surrogate as a semi-valid independent character. if (!this.highSurrogate) { @@ -100,6 +136,12 @@ function Utf32Decoder(options, codec) { this.overflow = []; } +Object.defineProperty(Utf32Encoder.prototype, 'hasState', { + get: function () { + return this.overflow.length > 0; + } +}); + Utf32Decoder.prototype.write = function (src) { if (src.length === 0) return ""; @@ -212,6 +254,16 @@ function Utf32AutoEncoder(options, codec) { this.encoder = codec.iconv.getEncoder(options.defaultEncoding || "utf-32le", options); } +Object.defineProperty(Utf32Encoder.prototype, 'hasState', { + get: function () { + return this.encoder.hasState; + } +}); + +Utf32AutoEncoder.prototype.byteLength = function (str) { + return this.encoder.byteLength(str); +} + Utf32AutoEncoder.prototype.write = function (str) { return this.encoder.write(str); }; @@ -230,6 +282,12 @@ function Utf32AutoDecoder(options, codec) { this.iconv = codec.iconv; } +Object.defineProperty(Utf32Encoder.prototype, 'hasState', { + get: function () { + return this.initialBufsLen !== 0 || (this.decoder != null && this.decoder.hasState); + } +}); + Utf32AutoDecoder.prototype.write = function (buf) { if (!this.decoder) { // Codec is not chosen yet. Accumulate initial bytes. diff --git a/encodings/utf7.js b/encodings/utf7.js index 739d7acc..d921ac2c 100644 --- a/encodings/utf7.js +++ b/encodings/utf7.js @@ -17,25 +17,50 @@ Utf7Codec.prototype.bomAware = true; // -- Encoding const nonDirectChars = /[^A-Za-z0-9'(),-./:? \n\r\t]+/g; +const segmentPattern = /([^A-Za-z0-9'(),-./:? \n\r\t]+)|([A-Za-z0-9'(),-./:? \n\r\t]+)/g; function Utf7Encoder(options, codec) { this.iconv = codec.iconv; } +Utf7Encoder.prototype.byteLength = function (str) { + var byteLength = 0; + + const segments = str.matchAll(segmentPattern); + for (const segment of segments) { + if (segment[2] != null) // match group 2: direct chars + byteLength += segment[2].length; + else { // match group 1: non direct chars + if (segment[1] !== "+") + byteLength += Math.ceil((segment[1].length * 2) * 4 / 3); // without padding + byteLength += 2; // + and - + } + } + + return byteLength; +} + +Object.defineProperty(Utf7Encoder.prototype, 'hasState', { + get: function () { + return false; + } +}); + Utf7Encoder.prototype.write = function (str) { // Naive implementation. // Non-direct chars are encoded as "+-"; single "+" char is encoded as "+-". - const replaceFn = (chunk) => - "+" + - (chunk === "+" - ? "" - : this.iconv.encode(chunk, "utf16-be").toString("base64").replace(/=+$/, "")) + - "-"; + function replaceFn(chunk) { + if (chunk === "+") + return "+-"; + var base64Str = this.iconv.encode(chunk, "utf16-be").toString("base64").replace(/=+$/, ""); + return "+" + base64Str + "-"; + }; return Buffer.from(str.replace(nonDirectChars, replaceFn)); }; -Utf7Encoder.prototype.end = function () {}; + +Utf7Encoder.prototype.end = function () { }; // -- Decoding @@ -53,6 +78,12 @@ var plusChar = "+".charCodeAt(0), minusChar = "-".charCodeAt(0), andChar = "&".charCodeAt(0); +Object.defineProperty(Utf7Decoder.prototype, 'hasState', { + get: function () { + return this.inBase64 && this.base64Accum.length > 0; + } +}); + Utf7Decoder.prototype.write = function (buf) { let res = "", lastI = 0, @@ -150,6 +181,51 @@ function Utf7IMAPEncoder(options, codec) { this.base64AccumIdx = 0; } +Utf7Encoder.prototype.byteLength = function (str) { + var byteLength = 0; + var inBase64 = false, + base64AccumLength = 0, + + for (var i = 0; i < str.length; i++) { + var uChar = str.charCodeAt(i); + if (0x20 <= uChar && uChar <= 0x7e) { // Direct character or '&'. + if (inBase64) { + if (base64AccumLength > 0) { + byteLength += Math.ceil(base64AccumLength * 4 / 3); // without padding + base64AccumLength = 0; + } + byteLength++; // Count '-', then go to direct mode. + inBase64 = false; + } + if (!inBase64) { + byteLength++; // Count direct character + if (uChar === andChar) // Ampersand -> '&-' + byteLength++; + } + } else { // Non-direct character + if (!inBase64) { + byteLength++; // Count '&', then go to base64 mode. + inBase64 = true; + } + if (inBase64) { + base64AccumLength += 2; + if (base64AccumLength === 6) { + byteLength += base64AccumLength * 4 / 3; + base64AccumLength = 0; + } + } + } + } + + return byteLength; +} + +Object.defineProperty(Utf7IMAPEncoder.prototype, 'hasState', { + get: function () { + return this.inBase64; + } +}); + Utf7IMAPEncoder.prototype.write = function (str) { var inBase64 = this.inBase64, base64Accum = this.base64Accum, @@ -243,6 +319,12 @@ function Utf7IMAPDecoder(options, codec) { var base64IMAPChars = base64Chars.slice(); base64IMAPChars[",".charCodeAt(0)] = true; +Object.defineProperty(Utf7IMAPDecoder.prototype, 'hasState', { + get: function () { + return this.inBase64 && this.base64Accum.length > 0; + } +}); + Utf7IMAPDecoder.prototype.write = function (buf) { var res = "", lastI = 0, diff --git a/lib/index.d.ts b/lib/index.d.ts index 0cab85f4..9a79057a 100644 --- a/lib/index.d.ts +++ b/lib/index.d.ts @@ -13,6 +13,8 @@ declare module "iconv-lite" { export function encodingExists(encoding: string): boolean; + export function byteLength(content: string, encoding: string): number; + // Stream API export function decodeStream(encoding: string, options?: Options): NodeJS.ReadWriteStream; @@ -31,11 +33,14 @@ export interface Options { } export interface EncoderStream { + byteLength(str: string): number; write(str: string): Buffer; end(): Buffer | undefined; + hasState: boolean; } export interface DecoderStream { write(buf: Buffer): string; end(): string | undefined; + hasState: boolean; } diff --git a/lib/index.js b/lib/index.js index ea1f6146..237f9ad9 100644 --- a/lib/index.js +++ b/lib/index.js @@ -132,6 +132,10 @@ iconv.getDecoder = function getDecoder(encoding, options) { return decoder; }; +iconv.byteLength = function byteLength(str, encoding) { + return iconv.getEncoder(encoding).byteLength(str) +} + // Streaming API // NOTE: Streaming API naturally depends on 'stream' module from Node.js. Unfortunately in browser environments this module can add // up to 100Kb to the output bundle. To avoid unnecessary code bloat, we don't enable Streaming API in browser by default. From 72517be5620a82346db73b4a6b8cee5506c9caea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ng=E1=BB=8Dc=20Tr=E1=BA=A7n?= Date: Wed, 29 Jul 2020 23:45:47 +0700 Subject: [PATCH 3/7] update readme and fix typo, wrong variable and function name --- README.md | 30 ++++++++++++++++++++--- encodings/dbcs-codec.js | 18 ++++++-------- encodings/internal.js | 39 ++++++++++++++--------------- encodings/utf32.js | 20 +++++++-------- encodings/utf7.js | 54 +++++++++++++++++++++-------------------- 5 files changed, 90 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 41017936..f07f9222 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,30 @@ buf = iconv.encode("Sample input string", "win1251"); // Check if encoding is supported iconv.encodingExists("us-ascii"); + +// Calculate the actual length in bytes. +len = iconv.byteLength("Hello, world! 😀", "utf16be"); + +// Get a decoder and decode two different buffers into a single string, the decoder keeps state between buffers +var utf8Decoder = iconv.getDecoder("utf8"); +var bytes1 = Buffer.from([0x20, 0x23, 0xe2]); // space, # and part of ☣ +var bytes2 = Buffer.from([0x98, 0xa3]); // the rest of ☣ +var str = utf8Decoder.write(bytes1); +// You can check if the decoder has state currently +var hasState = utf8Decoder.hasState; // true; +str += utf8Decoder.write(bytes2); +var hasState = utf8Decoder.hasState; // false; + +// The same for encoder, you rarely need to care about the encoder's state, except for some special encoders and surrogate pair +var utf8Encoder = iconv.getEncoder("utf8"); +var bytes = utf8Encoder.write("Hi \uD83D"); +var hasState = utf8Encoder.hasState; // true +bytes = bytes.concat([utf8Encoder.write("\uDE00")]); +hasState = utf8Encoder.hasState; // false + +// Use the "end" method to get the remaining data in encoder/decoder's state and clear the state +var bytes = encoder.end(); +var str = decoder.end(); ``` ### Streaming API @@ -112,9 +136,9 @@ This library supports UTF-32LE, UTF-32BE and UTF-32 encodings. Like the UTF-16 e ## Other notes -When decoding, be sure to supply a Buffer to decode() method, otherwise [bad things usually happen](https://github.com/ashtuchkin/iconv-lite/wiki/Use-Buffers-when-decoding). -Untranslatable characters are set to � or ?. No transliteration is currently supported. -Node versions 0.10.31 and 0.11.13 are buggy, don't use them (see #65, #77). +- When decoding, be sure to supply a Buffer to decode() method, otherwise [bad things usually happen](https://github.com/ashtuchkin/iconv-lite/wiki/Use-Buffers-when-decoding). +- Untranslatable characters are set to � or ?. No transliteration is currently supported. +- Node versions 0.10.31 and 0.11.13 are buggy, don't use them (see #65, #77). ## Testing diff --git a/encodings/dbcs-codec.js b/encodings/dbcs-codec.js index 594f29ec..5e96cb3d 100644 --- a/encodings/dbcs-codec.js +++ b/encodings/dbcs-codec.js @@ -337,9 +337,9 @@ class DBCSEncoder { let leadSurrogate = -1, seqObj = undefined, nextChar = -1, - i = 0, + i = 0; - for (; ;) { + for (;;) { // 0. Get next character. let uCode; if (nextChar === -1) { @@ -421,7 +421,7 @@ class DBCSEncoder { const idx = findIdx(this.gb18030.uChars, uCode); if (idx !== -1) { dbcsCode = this.gb18030.gbChars[idx] + (uCode - this.gb18030.uChars[idx]); - dbcsCode = dbcsCode % 12600 % 1260 % 10; + dbcsCode = ((dbcsCode % 12600) % 1260) % 10; byteLength += 4; continue; } @@ -433,14 +433,10 @@ class DBCSEncoder { dbcsCode = this.defaultCharSingleByte; } - if (dbcsCode < 0x100) - byteLength += 1; - else if (dbcsCode < 0x10000) - byteLength += 2; - else if (dbcsCode < 0x1000000) - byteLength += 3; - else - byteLength += 4; + if (dbcsCode < 0x100) byteLength += 1; + else if (dbcsCode < 0x10000) byteLength += 2; + else if (dbcsCode < 0x1000000) byteLength += 3; + else byteLength += 4; } return byteLength; diff --git a/encodings/internal.js b/encodings/internal.js index 67db325d..552bbefd 100644 --- a/encodings/internal.js +++ b/encodings/internal.js @@ -56,11 +56,11 @@ function InternalDecoder(options, codec) { this.decoder = new StringDecoder(codec.enc); } -Object.defineProperty(InternalDecoder.prototype, 'hasState', { +Object.defineProperty(InternalDecoder.prototype, "hasState", { get: function () { // TODO: hopefully this will not be changed in newer version of NodeJS - return this.decoder['lastNeed'] !== 0; - } + return this.decoder["lastNeed"] !== 0; + }, }); InternalDecoder.prototype.write = function (buf) { @@ -82,15 +82,15 @@ function InternalEncoder(options, codec) { this.enc = codec.enc; } -Object.defineProperty(InternalEncoder.prototype, 'hasState', { +Object.defineProperty(InternalEncoder.prototype, "hasState", { get: function () { return false; - } + }, }); InternalEncoder.prototype.byteLength = function (str) { return Buffer.byteLength(str, this.enc); -} +}; InternalEncoder.prototype.write = function (str) { return Buffer.from(str, this.enc); @@ -105,10 +105,10 @@ function InternalEncoderBase64() { this.prevStr = ""; } -Object.defineProperty(InternalEncoderBase64.prototype, 'hasState', { +Object.defineProperty(InternalEncoderBase64.prototype, "hasState", { get: function () { return this.prevStr.length > 0; - } + }, }); InternalEncoderBase64.prototype.byteLength = function (str) { @@ -116,8 +116,8 @@ InternalEncoderBase64.prototype.byteLength = function (str) { str = str.slice(0, completeQuads); var nonPaddedLength = str.search(/=*$/); if (nonPaddedLength === -1) nonPaddedLength = str.length; - return Math.floor(nonPaddedLength * 3 / 4); -} + return Math.floor((nonPaddedLength * 3) / 4); +}; InternalEncoderBase64.prototype.write = function (str) { str = this.prevStr + str; @@ -137,25 +137,22 @@ InternalEncoderBase64.prototype.end = function () { function InternalEncoderCesu8() {} -Object.defineProperty(InternalEncoderBase64.prototype, 'hasState', { +Object.defineProperty(InternalEncoderCesu8.prototype, "hasState", { get: function () { return false; - } + }, }); InternalEncoderCesu8.prototype.byteLength = function (str) { let byteLength = 0; for (let i = 0; i < str.length; i++) { const charCode = str.charCodeAt(i); - if (charCode < 0x80) - byteLength += 1; - else if (charCode < 0x800) - byteLength += 2; - else - byteLength += 3; + if (charCode < 0x80) byteLength += 1; + else if (charCode < 0x800) byteLength += 2; + else byteLength += 3; } return byteLength; -} +}; InternalEncoderCesu8.prototype.write = function (str) { const buf = Buffer.alloc(str.length * 3); @@ -191,10 +188,10 @@ function InternalDecoderCesu8(options, codec) { this.defaultCharUnicode = codec.defaultCharUnicode; } -Object.defineProperty(InternalDecoderCesu8.prototype, 'hasState', { +Object.defineProperty(InternalDecoderCesu8.prototype, "hasState", { get: function () { return this.contBytes > 0; - } + }, }); InternalDecoderCesu8.prototype.write = function (buf) { diff --git a/encodings/utf32.js b/encodings/utf32.js index 543d37c2..037d6645 100644 --- a/encodings/utf32.js +++ b/encodings/utf32.js @@ -29,10 +29,10 @@ function Utf32Encoder(options, codec) { this.highSurrogate = 0; } -Object.defineProperty(Utf32Encoder.prototype, 'hasState', { +Object.defineProperty(Utf32Encoder.prototype, "hasState", { get: function () { return !!this.highSurrogate; - } + }, }); Utf32Encoder.prototype.write = function (str) { @@ -110,7 +110,7 @@ Utf32Encoder.prototype.byteLength = function (str) { } return byteLength; -} +}; Utf32Encoder.prototype.end = function () { // Treat any leftover high surrogate as a semi-valid independent character. @@ -136,10 +136,10 @@ function Utf32Decoder(options, codec) { this.overflow = []; } -Object.defineProperty(Utf32Encoder.prototype, 'hasState', { +Object.defineProperty(Utf32Decoder.prototype, "hasState", { get: function () { return this.overflow.length > 0; - } + }, }); Utf32Decoder.prototype.write = function (src) { @@ -254,15 +254,15 @@ function Utf32AutoEncoder(options, codec) { this.encoder = codec.iconv.getEncoder(options.defaultEncoding || "utf-32le", options); } -Object.defineProperty(Utf32Encoder.prototype, 'hasState', { +Object.defineProperty(Utf32AutoEncoder.prototype, "hasState", { get: function () { return this.encoder.hasState; - } + }, }); Utf32AutoEncoder.prototype.byteLength = function (str) { return this.encoder.byteLength(str); -} +}; Utf32AutoEncoder.prototype.write = function (str) { return this.encoder.write(str); @@ -282,10 +282,10 @@ function Utf32AutoDecoder(options, codec) { this.iconv = codec.iconv; } -Object.defineProperty(Utf32Encoder.prototype, 'hasState', { +Object.defineProperty(Utf32AutoDecoder.prototype, "hasState", { get: function () { return this.initialBufsLen !== 0 || (this.decoder != null && this.decoder.hasState); - } + }, }); Utf32AutoDecoder.prototype.write = function (buf) { diff --git a/encodings/utf7.js b/encodings/utf7.js index d921ac2c..afe39f01 100644 --- a/encodings/utf7.js +++ b/encodings/utf7.js @@ -28,30 +28,30 @@ Utf7Encoder.prototype.byteLength = function (str) { const segments = str.matchAll(segmentPattern); for (const segment of segments) { - if (segment[2] != null) // match group 2: direct chars + if (segment[2] != null) + // match group 2: direct chars byteLength += segment[2].length; - else { // match group 1: non direct chars - if (segment[1] !== "+") - byteLength += Math.ceil((segment[1].length * 2) * 4 / 3); // without padding - byteLength += 2; // + and - + else { + // match group 1: non direct chars + if (segment[1] !== "+") byteLength += Math.ceil((segment[1].length * 2 * 4) / 3); // without padding + byteLength += 2; // + and - } } return byteLength; -} +}; -Object.defineProperty(Utf7Encoder.prototype, 'hasState', { +Object.defineProperty(Utf7Encoder.prototype, "hasState", { get: function () { return false; - } + }, }); Utf7Encoder.prototype.write = function (str) { // Naive implementation. // Non-direct chars are encoded as "+-"; single "+" char is encoded as "+-". - function replaceFn(chunk) { - if (chunk === "+") - return "+-"; + var replaceFn = (chunk) => { + if (chunk === "+") return "+-"; var base64Str = this.iconv.encode(chunk, "utf16-be").toString("base64").replace(/=+$/, ""); return "+" + base64Str + "-"; }; @@ -59,8 +59,7 @@ Utf7Encoder.prototype.write = function (str) { return Buffer.from(str.replace(nonDirectChars, replaceFn)); }; - -Utf7Encoder.prototype.end = function () { }; +Utf7Encoder.prototype.end = function () {}; // -- Decoding @@ -78,10 +77,10 @@ var plusChar = "+".charCodeAt(0), minusChar = "-".charCodeAt(0), andChar = "&".charCodeAt(0); -Object.defineProperty(Utf7Decoder.prototype, 'hasState', { +Object.defineProperty(Utf7Decoder.prototype, "hasState", { get: function () { return this.inBase64 && this.base64Accum.length > 0; - } + }, }); Utf7Decoder.prototype.write = function (buf) { @@ -184,14 +183,15 @@ function Utf7IMAPEncoder(options, codec) { Utf7Encoder.prototype.byteLength = function (str) { var byteLength = 0; var inBase64 = false, - base64AccumLength = 0, + base64AccumLength = 0; for (var i = 0; i < str.length; i++) { var uChar = str.charCodeAt(i); - if (0x20 <= uChar && uChar <= 0x7e) { // Direct character or '&'. + if (0x20 <= uChar && uChar <= 0x7e) { + // Direct character or '&'. if (inBase64) { if (base64AccumLength > 0) { - byteLength += Math.ceil(base64AccumLength * 4 / 3); // without padding + byteLength += Math.ceil((base64AccumLength * 4) / 3); // without padding base64AccumLength = 0; } byteLength++; // Count '-', then go to direct mode. @@ -199,10 +199,12 @@ Utf7Encoder.prototype.byteLength = function (str) { } if (!inBase64) { byteLength++; // Count direct character - if (uChar === andChar) // Ampersand -> '&-' + if (uChar === andChar) + // Ampersand -> '&-' byteLength++; } - } else { // Non-direct character + } else { + // Non-direct character if (!inBase64) { byteLength++; // Count '&', then go to base64 mode. inBase64 = true; @@ -210,7 +212,7 @@ Utf7Encoder.prototype.byteLength = function (str) { if (inBase64) { base64AccumLength += 2; if (base64AccumLength === 6) { - byteLength += base64AccumLength * 4 / 3; + byteLength += (base64AccumLength * 4) / 3; base64AccumLength = 0; } } @@ -218,12 +220,12 @@ Utf7Encoder.prototype.byteLength = function (str) { } return byteLength; -} +}; -Object.defineProperty(Utf7IMAPEncoder.prototype, 'hasState', { +Object.defineProperty(Utf7IMAPEncoder.prototype, "hasState", { get: function () { return this.inBase64; - } + }, }); Utf7IMAPEncoder.prototype.write = function (str) { @@ -319,10 +321,10 @@ function Utf7IMAPDecoder(options, codec) { var base64IMAPChars = base64Chars.slice(); base64IMAPChars[",".charCodeAt(0)] = true; -Object.defineProperty(Utf7IMAPDecoder.prototype, 'hasState', { +Object.defineProperty(Utf7IMAPDecoder.prototype, "hasState", { get: function () { return this.inBase64 && this.base64Accum.length > 0; - } + }, }); Utf7IMAPDecoder.prototype.write = function (buf) { From ab95d0a5f8c18e6e2e4df4f1d944ee2e60690dfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ng=E1=BB=8Dc=20Tr=E1=BA=A7n?= Date: Sun, 2 Aug 2020 01:41:39 +0700 Subject: [PATCH 4/7] add byteLength test for basic encodings --- encodings/dbcs-codec.js | 19 +++++++++++++++++ encodings/internal.js | 8 ++++++- encodings/utf32.js | 4 ++++ encodings/utf7.js | 8 ++++++- lib/bom-handling.js | 15 ++++++++++++++ lib/index.d.ts | 2 +- lib/index.js | 6 +++--- test/big5-test.js | 4 ++++ test/bom-test.js | 22 ++++++++++++++++++++ test/cesu8-test.js | 2 ++ test/cyrillic-test.js | 2 ++ test/dbcs-test.js | 46 ++++++++++++++++++++++------------------- test/gbk-test.js | 6 ++++++ test/greek-test.js | 2 ++ test/sbcs-test.js | 2 ++ test/shiftjis-test.js | 2 ++ test/turkish-test.js | 2 ++ test/utf16-test.js | 6 ++++++ test/utf32-test.js | 13 +++++++++--- test/utf7-test.js | 5 +++++ test/utils.js | 19 +++++++++++++++++ 21 files changed, 165 insertions(+), 30 deletions(-) diff --git a/encodings/dbcs-codec.js b/encodings/dbcs-codec.js index 5e96cb3d..e5665952 100644 --- a/encodings/dbcs-codec.js +++ b/encodings/dbcs-codec.js @@ -439,6 +439,25 @@ class DBCSEncoder { else byteLength += 4; } + if (leadSurrogate !== -1 || seqObj !== undefined) { + if (seqObj) { + // We're in the sequence. + const dbcsCode = seqObj[DEF_CHAR]; + if (dbcsCode !== undefined) { + // Write beginning of the sequence. + if (dbcsCode < 0x100) byteLength++; + else byteLength += 2; + } else { + // See todo above. + } + } + + if (leadSurrogate !== -1) { + // Incomplete surrogate pair - only lead surrogate found. + byteLength++; + } + } + return byteLength; } diff --git a/encodings/internal.js b/encodings/internal.js index 552bbefd..1f91a358 100644 --- a/encodings/internal.js +++ b/encodings/internal.js @@ -112,11 +112,17 @@ Object.defineProperty(InternalEncoderBase64.prototype, "hasState", { }); InternalEncoderBase64.prototype.byteLength = function (str) { + var byteLength = 0; var completeQuads = str.length - (str.length % 4); + var prevStr = str.slice(completeQuads); str = str.slice(0, completeQuads); var nonPaddedLength = str.search(/=*$/); if (nonPaddedLength === -1) nonPaddedLength = str.length; - return Math.floor((nonPaddedLength * 3) / 4); + byteLength += Math.floor((nonPaddedLength * 3) / 4); + nonPaddedLength = prevStr.search(/=*$/); + if (nonPaddedLength === -1) nonPaddedLength = str.length; + byteLength += Math.floor((nonPaddedLength * 3) / 4); + return byteLength; }; InternalEncoderBase64.prototype.write = function (str) { diff --git a/encodings/utf32.js b/encodings/utf32.js index 037d6645..a97925cb 100644 --- a/encodings/utf32.js +++ b/encodings/utf32.js @@ -109,6 +109,10 @@ Utf32Encoder.prototype.byteLength = function (str) { } } + if (currentHighSurrogate) { + byteLength += 4; + } + return byteLength; }; diff --git a/encodings/utf7.js b/encodings/utf7.js index afe39f01..3a28f519 100644 --- a/encodings/utf7.js +++ b/encodings/utf7.js @@ -180,7 +180,7 @@ function Utf7IMAPEncoder(options, codec) { this.base64AccumIdx = 0; } -Utf7Encoder.prototype.byteLength = function (str) { +Utf7IMAPEncoder.prototype.byteLength = function (str) { var byteLength = 0; var inBase64 = false, base64AccumLength = 0; @@ -218,6 +218,12 @@ Utf7Encoder.prototype.byteLength = function (str) { } } } + if (inBase64) { + if (base64AccumLength > 0) { + byteLength += Math.ceil((base64AccumLength * 4) / 3); // without padding + } + byteLength++; // Count '-', then go to direct mode. + } return byteLength; }; diff --git a/lib/bom-handling.js b/lib/bom-handling.js index e9ea1395..a66563d5 100644 --- a/lib/bom-handling.js +++ b/lib/bom-handling.js @@ -8,6 +8,17 @@ exports.PrependBOM = class PrependBOMWrapper { this.addBOM = true; } + get hasState() { + return this.encoder.hasState; + } + + byteLength(str) { + var byteLength = 0; + if (this.addBOM) str = BOMChar + str; + byteLength += this.encoder.byteLength(str); + return byteLength; + } + write(str) { if (this.addBOM) { str = BOMChar + str; @@ -28,6 +39,10 @@ exports.StripBOM = class StripBOMWrapper { this.options = options || {}; } + get hasState() { + return this.decoder.hasState; + } + write(buf) { var res = this.decoder.write(buf); if (this.pass || !res) return res; diff --git a/lib/index.d.ts b/lib/index.d.ts index 9a79057a..a607e348 100644 --- a/lib/index.d.ts +++ b/lib/index.d.ts @@ -13,7 +13,7 @@ declare module "iconv-lite" { export function encodingExists(encoding: string): boolean; - export function byteLength(content: string, encoding: string): number; + export function byteLength(content: string, encoding: string, options?: Options): number; // Stream API export function decodeStream(encoding: string, options?: Options): NodeJS.ReadWriteStream; diff --git a/lib/index.js b/lib/index.js index 237f9ad9..33239759 100644 --- a/lib/index.js +++ b/lib/index.js @@ -132,9 +132,9 @@ iconv.getDecoder = function getDecoder(encoding, options) { return decoder; }; -iconv.byteLength = function byteLength(str, encoding) { - return iconv.getEncoder(encoding).byteLength(str) -} +iconv.byteLength = function byteLength(str, encoding, options) { + return iconv.getEncoder(encoding, options).byteLength(str); +}; // Streaming API // NOTE: Streaming API naturally depends on 'stream' module from Node.js. Unfortunately in browser environments this module can add diff --git a/test/big5-test.js b/test/big5-test.js index 4c492f37..bc1e0b94 100644 --- a/test/big5-test.js +++ b/test/big5-test.js @@ -65,4 +65,8 @@ describe("Big5 tests #node-web", function () { it("Big5 correctly encodes 十", function () { assert.strictEqual(utils.hex(iconv.encode("十", "big5")), "a4 51"); }); + + it("Big5 byteLength works correctly", utils.checkByteLength("Big5")); + + it("cp950 byteLength works correctly", utils.checkByteLength("cp950")); }); diff --git a/test/bom-test.js b/test/bom-test.js index efb2b047..7601b4cc 100644 --- a/test/bom-test.js +++ b/test/bom-test.js @@ -1,6 +1,7 @@ "use strict"; const assert = require("assert"), + utils = require("./utils"), Buffer = require("safer-buffer").Buffer, iconv = require("../"); @@ -89,4 +90,25 @@ describe("BOM Handling", function () { assert.equal(iconv.decode(body2, "utf8", { stripBOM: stripBOM }), sampleStr); assert(!bomStripped); }); + + it("UTF-7 BOM byteLength works correctly", utils.checkByteLength("utf7", { addBOM: true })); + + it("UTF-8 BOM byteLength works correctly", utils.checkByteLength("utf8", { addBOM: true })); + + it( + "utf16le BOM byteLength works correctly", + utils.checkByteLength("utf16le", { addBOM: true }) + ); + + it( + "utf16be BOM byteLength works correctly", + utils.checkByteLength("utf16be", { addBOM: true }) + ); + + it("utf16 BOM byteLength works correctly", utils.checkByteLength("utf16", { addBOM: true })); + + it( + "utf16 NO BOM byteLength works correctly", + utils.checkByteLength("utf16", { addBOM: false }) + ); }); diff --git a/test/cesu8-test.js b/test/cesu8-test.js index 0f200d09..4a1550bf 100644 --- a/test/cesu8-test.js +++ b/test/cesu8-test.js @@ -1,6 +1,7 @@ "use strict"; var assert = require("assert"), + utils = require("./utils"), Buffer = require("safer-buffer").Buffer, iconv = require("../"); @@ -23,4 +24,5 @@ describe("CESU-8 codec", function () { assert.equal(iconv.decode(Buffer.from("eda081edb080", "hex"), "cesu8"), "𐐀"); assert.equal(iconv.decode(Buffer.from("eda0bdedb8b1", "hex"), "cesu8"), "😱"); }); + it("CESU-8 byteLength works correctly", utils.checkByteLength("cesu8")); }); diff --git a/test/cyrillic-test.js b/test/cyrillic-test.js index cdf61853..9993514b 100644 --- a/test/cyrillic-test.js +++ b/test/cyrillic-test.js @@ -110,6 +110,8 @@ describe("Test Cyrillic encodings #node-web", function () { utils.hex(untranslatableBytes) ); // Only '?' characters. }); + + it(enc + " byteLength works correctly", utils.checkByteLength(enc)); }); }); }); diff --git a/test/dbcs-test.js b/test/dbcs-test.js index 9fe5a6a9..003d27a9 100644 --- a/test/dbcs-test.js +++ b/test/dbcs-test.js @@ -1,6 +1,7 @@ "use strict"; var assert = require("assert"), + utils = require("./utils"), Buffer = require("safer-buffer").Buffer, iconv = require("../"), Iconv = require("iconv").Iconv; @@ -53,17 +54,18 @@ var aliases = { // prettier-ignore var iconvChanges = { // Characters that iconv changing (iconv char -> our char) // shiftjis/cp932 is changed in iconv (see comments in cp932.h) - shiftjis: {"〜":"~","‖":"∥","−":"-","¢":"¢","£":"£","¬":"¬"}, - eucjp: {"〜":"~","‖":"∥","−":"-","¢":"¢","£":"£","¬":"¬"}, - cp950: {"¥":"¥"}, + shiftjis: { "〜": "~", "‖": "∥", "−": "-", "¢": "¢", "£": "£", "¬": "¬" }, + eucjp: { "〜": "~", "‖": "∥", "−": "-", "¢": "¢", "£": "£", "¬": "¬" }, + cp950: { "¥": "¥" }, // Big5 is known for lots of different variations. We use Encoding Standard. - big5hkscs: {"•": "‧", "、": "﹑", "‾": "¯", "∼": "~", "♁": "⊕", "☉": "⊙", "/": "∕", "\": "﹨", "¥": "¥", "¢": "¢", "£": "£"}, + big5hkscs: { "•": "‧", "、": "﹑", "‾": "¯", "∼": "~", "♁": "⊕", "☉": "⊙", "/": "∕", "\": "﹨", "¥": "¥", "¢": "¢", "£": "£" }, // Iconv encodes some chars to the PUA area. In ICU there's no such mapping. - gb18030: { "ḿ": "", "龴": "", "龵": "", "龶": "", "龷": "", "龸": "", "龹": "", - "龺": "", "龻": "", "︐": "", "︑": "", "︒": "", "︓": "", "︔": "", "︕": "", - "︖": "", "︗": "", "︘": "", "︙": "", + gb18030: { + "ḿ": "", "龴": "", "龵": "", "龶": "", "龷": "", "龸": "", "龹": "", + "龺": "", "龻": "", "︐": "", "︑": "", "︒": "", "︓": "", "︔": "", "︕": "", + "︖": "", "︗": "", "︘": "", "︙": "", } } @@ -72,8 +74,8 @@ var iconvCannotDecode = { // Characters that we can decode, but iconv cannot. En shiftjis: { "80": "\x80", "5c": "¥", "7e": "‾", "81ca": "¬" }, eucjp: { "adf0": "≒", "adf1": "≡", "adf2": "∫", "adf3": "∮", "adf4": "∑", "adf5": "√", "adf6": "⊥", "adf7": "∠", "adf8": "∟", "adf9": "⊿", - "adfa": "∵", "adfb": "∩", "adfc": "∪", "a1c2": "∥", "ade2": "№", "ade4": "℡", - + "adfa": "∵", "adfb": "∩", "adfc": "∪", "a1c2": "∥", "ade2": "№", "ade4": "℡", + "adb5": "Ⅰ", "adb6": "Ⅱ", "adb7": "Ⅲ", "adb8": "Ⅳ", "adb9": "Ⅴ", "adba": "Ⅵ", "adbb": "Ⅶ", "adbc": "Ⅷ", "adbd": "Ⅸ", "adbe": "Ⅹ", "fcf1": "ⅰ", "fcf2": "ⅱ", "fcf3": "ⅲ", "fcf4": "ⅳ", "fcf5": "ⅴ", "fcf6": "ⅵ", "fcf7": "ⅶ", "fcf8": "ⅷ", "fcf9": "ⅸ", "fcfa": "ⅹ", "ada1": "①", "ada2": "②", "ada3": "③", "ada4": "④", "ada5": "⑤", "ada6": "⑥", "ada7": "⑦", "ada8": "⑧", "ada9": "⑨", "adaa": "⑩", @@ -109,27 +111,27 @@ var iconvCannotDecode = { // Characters that we can decode, but iconv cannot. En "a3c8": "␈", "a3c9": "␉", "a3ca": "␊", "a3cb": "␋", "a3cc": "␌", "a3cd": "␍", "a3ce": "␎", "a3cf": "␏", "a3d0": "␐", "a3d1": "␑", "a3d2": "␒", "a3d3": "␓", "a3d4": "␔", "a3d5": "␕", "a3d6": "␖", "a3d7": "␗", "a3d8": "␘", "a3d9": "␙", "a3da": "␚", "a3db": "␛", "a3dc": "␜", "a3dd": "␝", "a3de": "␞", "a3df": "␟", - "a3e0": "␡", "a3e1": "€", + "a3e0": "␡", "a3e1": "€", - "c6cf": "廴", "c6d3": "无", "c6d5": "癶", "c6d7": "隶", "c6de": "〃", "c6df": "仝", - "fa5f": "倩", "fa66": "偽", - "fabd": "包", "fac5": "卄", "fad5": "卿", "fb48": "嘅", "fbb8": "婷", "fbf3": "幵", "fbf9": "廐", "fc4f": "彘", - "fc6c": "悤", "fcb9": "撐", "fce2": "晴", "fcf1": "杞", "fdb7": "沜", "fdb8": "渝", "fdbb": "港", "fdf1": "煮", + "c6cf": "廴", "c6d3": "无", "c6d5": "癶", "c6d7": "隶", "c6de": "〃", "c6df": "仝", + "fa5f": "倩", "fa66": "偽", + "fabd": "包", "fac5": "卄", "fad5": "卿", "fb48": "嘅", "fbb8": "婷", "fbf3": "幵", "fbf9": "廐", "fc4f": "彘", + "fc6c": "悤", "fcb9": "撐", "fce2": "晴", "fcf1": "杞", "fdb7": "沜", "fdb8": "渝", "fdbb": "港", "fdf1": "煮", "fe52": "猪", "fe6f": "瑜", "feaa": "瓩", "fedd": "砉", }, gbk: { // All these will appear in GB18030, + U+0080 = € is compatibility with Windows. - "80": "€", "a2e3": "€", "a8bf": "ǹ", - "a98a": "⿰", "a98b": "⿱", "a98c": "⿲", "a98d": "⿳", "a98e": "⿴", "a98f": "⿵", "a990": "⿶", + "80": "€", "a2e3": "€", "a8bf": "ǹ", + "a98a": "⿰", "a98b": "⿱", "a98c": "⿲", "a98d": "⿳", "a98e": "⿴", "a98f": "⿵", "a990": "⿶", "a991": "⿷", "a992": "⿸", "a993": "⿹", "a994": "⿺", "a995": "⿻", "a989": "〾", - "fe50": "⺁", "fe54": "⺄", "fe55": "㑳", "fe56": "㑇", "fe57": "⺈", "fe58": "⺋", "fe5a": "㖞", - "fe5b": "㘚", "fe5c": "㘎", "fe5d": "⺌", "fe5e": "⺗", "fe5f": "㥮", + "fe50": "⺁", "fe54": "⺄", "fe55": "㑳", "fe56": "㑇", "fe57": "⺈", "fe58": "⺋", "fe5a": "㖞", + "fe5b": "㘚", "fe5c": "㘎", "fe5d": "⺌", "fe5e": "⺗", "fe5f": "㥮", "fe60": "㤘", "fe62": "㧏", "fe63": "㧟", "fe64": "㩳", "fe65": "㧐", "fe68": "㭎", "fe69": "㱮", - "fe6a": "㳠", "fe6b": "⺧", "fe6e": "⺪", "fe6f": "䁖", + "fe6a": "㳠", "fe6b": "⺧", "fe6e": "⺪", "fe6f": "䁖", "fe70": "䅟", "fe71": "⺮", "fe72": "䌷", "fe73": "⺳", "fe74": "⺶", "fe75": "⺷", "fe77": "䎱", "fe78": "䎬", "fe79": "⺻", "fe7a": "䏝", "fe7b": "䓖", "fe7c": "䙡", "fe7d": "䙌", - "fe80": "䜣", "fe81": "䜩", "fe82": "䝼", "fe83": "䞍", "fe84": "⻊", "fe85": "䥇", "fe86": "䥺", "fe87": "䥽", + "fe80": "䜣", "fe81": "䜩", "fe82": "䝼", "fe83": "䞍", "fe84": "⻊", "fe85": "䥇", "fe86": "䥺", "fe87": "䥽", "fe88": "䦂", "fe89": "䦃", "fe8a": "䦅", "fe8b": "䦆", "fe8c": "䦟", "fe8d": "䦛", "fe8e": "䦷", "fe8f": "䦶", - "fe92": "䲣", "fe93": "䲟", "fe94": "䲠", "fe95": "䲡", "fe96": "䱷", "fe97": "䲢", + "fe92": "䲣", "fe93": "䲟", "fe94": "䲠", "fe95": "䲡", "fe96": "䱷", "fe97": "䲢", "fe98": "䴓", "fe99": "䴔", "fe9a": "䴕", "fe9b": "䴖", "fe9c": "䴗", "fe9d": "䴘", "fe9e": "䴙", "fe9f": "䶮", // iconv and ICU are mapping "a3 a0" -> U+E5E5. However, WebKit/Chrome maps it to U+3000 noting compatibility with older websites. @@ -297,6 +299,8 @@ describe("Full DBCS encoding tests", function () { ); } }); + + it(enc + " byteLength works correctly", utils.checkByteLength(enc)); })(enc); } }); diff --git a/test/gbk-test.js b/test/gbk-test.js index b74033d0..21c3079a 100644 --- a/test/gbk-test.js +++ b/test/gbk-test.js @@ -135,4 +135,10 @@ describe("GBK tests #node-web", function () { assert.strictEqual(iconv.decode(gbkChars, "GB18030"), chars); assert.strictEqual(utils.hex(iconv.encode(chars, "GB18030")), utils.hex(gbkChars)); }); + + it("GBK byteLength works correctly", utils.checkByteLength("GBK")); + + it("GB2312 byteLength works correctly", utils.checkByteLength("GB2312")); + + it("GB18030 byteLength works correctly", utils.checkByteLength("GB18030")); }); diff --git a/test/greek-test.js b/test/greek-test.js index a9d7017f..96793b52 100644 --- a/test/greek-test.js +++ b/test/greek-test.js @@ -98,6 +98,8 @@ describe("Test Greek encodings #node-web", function () { utils.hex(untranslatableBytes) ); // Only '?' characters. }); + + it(enc + " byteLength works correctly", utils.checkByteLength(enc)); }); }); }); diff --git a/test/sbcs-test.js b/test/sbcs-test.js index 27825937..f3c9f4b9 100644 --- a/test/sbcs-test.js +++ b/test/sbcs-test.js @@ -179,6 +179,8 @@ describe("Full SBCS encoding tests #node-web", function () { } }); + it(enc + " byteLength works correctly", utils.checkByteLength(enc)); + /* // TODO: Implement unicode composition. After that, this test will be meaningful. diff --git a/test/shiftjis-test.js b/test/shiftjis-test.js index 52d9ca48..512fbc1e 100644 --- a/test/shiftjis-test.js +++ b/test/shiftjis-test.js @@ -42,4 +42,6 @@ describe("ShiftJIS tests #node-web", function () { assert.strictEqual(iconv.decode(utils.bytes("87 40"), "shiftjis"), "①"); assert.strictEqual(utils.hex(iconv.encode("①", "shiftjis")), "87 40"); }); + + it("shiftjis byteLength works correctly", utils.checkByteLength("shiftjis")); }); diff --git a/test/turkish-test.js b/test/turkish-test.js index 4aeabf99..671398fe 100644 --- a/test/turkish-test.js +++ b/test/turkish-test.js @@ -97,6 +97,8 @@ describe("Test Turkish encodings #node-web", function () { utils.hex(untranslatableBytes) ); // Only '?' characters. }); + + it(enc + " byteLength works correctly", utils.checkByteLength(enc)); }); }); }); diff --git a/test/utf16-test.js b/test/utf16-test.js index 2c73287d..1ae10d0e 100644 --- a/test/utf16-test.js +++ b/test/utf16-test.js @@ -54,6 +54,8 @@ describe("UTF-16LE encoder #node-web", function () { assert.equal(hex(encoder.write("\uDCA9")), "a9 dc"); assert.strictEqual(encoder.end(), undefined); }); + + it("byteLength works correctly", utils.checkByteLength(enc)); }); describe("UTF-16LE decoder #node-web", function () { @@ -178,6 +180,8 @@ describe("UTF-16BE encoder #node-web", function () { assert.equal(hex(encoder.write("\uDCA9")), "dc a9"); assert.strictEqual(encoder.end(), undefined); }); + + it("byteLength works correctly", utils.checkByteLength(enc)); }); describe("UTF-16BE decoder #node-web", function () { @@ -261,6 +265,8 @@ describe("UTF-16 encoder #node-web", function () { it("can skip BOM", function () { assert.equal(hex(iconv.encode(testStr, enc, { addBOM: false })), hex(utf16leBuf)); }); + + it("byteLength works correctly", utils.checkByteLength(enc)); }); describe("UTF-16 decoder #node-web", function () { diff --git a/test/utf32-test.js b/test/utf32-test.js index fe5d52d8..c9dc3dda 100644 --- a/test/utf32-test.js +++ b/test/utf32-test.js @@ -1,6 +1,7 @@ "use strict"; var assert = require("assert"), + utils = require("./utils"), Buffer = require("safer-buffer").Buffer, iconv = require("../"), Iconv = require("iconv").Iconv; @@ -8,8 +9,8 @@ var assert = require("assert"), // prettier-ignore var testStr = "1aя中文☃💩", testStr2 = "❝Stray high \uD977😱 and low\uDDDD☔ surrogate values.❞", - utf32leBuf = Buffer.from([ 0x31, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x4f, 0x04, 0x00, 0x00, 0x2d, 0x4e, 0x00, 0x00, 0x87, 0x65, 0x00, 0x00, 0x03, 0x26, 0x00, 0x00, 0xa9, 0xf4, 0x01, 0x00 ]), - utf32beBuf = Buffer.from([ 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x04, 0x4f, 0x00, 0x00, 0x4e, 0x2d, 0x00, 0x00, 0x65, 0x87, 0x00, 0x00, 0x26, 0x03, 0x00, 0x01, 0xf4, 0xa9 ]), + utf32leBuf = Buffer.from([0x31, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x4f, 0x04, 0x00, 0x00, 0x2d, 0x4e, 0x00, 0x00, 0x87, 0x65, 0x00, 0x00, 0x03, 0x26, 0x00, 0x00, 0xa9, 0xf4, 0x01, 0x00]), + utf32beBuf = Buffer.from([0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x04, 0x4f, 0x00, 0x00, 0x4e, 0x2d, 0x00, 0x00, 0x65, 0x87, 0x00, 0x00, 0x26, 0x03, 0x00, 0x01, 0xf4, 0xa9]), utf32leBOM = Buffer.from([0xff, 0xfe, 0x00, 0x00]), utf32beBOM = Buffer.from([0x00, 0x00, 0xfe, 0xff]), utf32leBufWithBOM = Buffer.concat([utf32leBOM, utf32leBuf]), @@ -81,6 +82,8 @@ describe("UTF-32LE codec", function () { var nodeStr = nodeIconv.convert(allCharsLEBuf).toString("utf8"); assert.equal(nodeStr, allCharsStr); }); + + it("byteLength works correctly", utils.checkByteLength("UTF-32LE")); }); describe("UTF-32BE codec", function () { @@ -118,6 +121,8 @@ describe("UTF-32BE codec", function () { var nodeStr = nodeIconv.convert(allCharsBEBuf).toString("utf8"); assert.equal(nodeStr, allCharsStr); }); + + it("byteLength works correctly", utils.checkByteLength("UTF-32BE")); }); describe("UTF-32 general codec", function () { @@ -155,6 +160,8 @@ describe("UTF-32 general codec", function () { it("correctly decodes UTF-32BE without BOM", function () { assert.equal(iconv.decode(iconv.encode(sampleStr, "utf-32-be"), "utf-32"), sampleStr); }); + + it("byteLength works correctly", utils.checkByteLength("UTF-32")); }); // Utility function to make bad matches easier to visualize. @@ -165,7 +172,7 @@ function escape(s) { var cc = s.charCodeAt(i); // prettier-ignore - if ((32 <= cc && cc < 127) && cc !== 0x5c) { + if ((32 <= cc && cc < 127) && cc !== 0x5c) { sb.push(s.charAt(i)); } else { var h = s.charCodeAt(i).toString(16).toUpperCase(); diff --git a/test/utf7-test.js b/test/utf7-test.js index e06c6ab1..ab0d9418 100644 --- a/test/utf7-test.js +++ b/test/utf7-test.js @@ -1,6 +1,7 @@ "use strict"; var assert = require("assert"), + utils = require("./utils"), Buffer = require("safer-buffer").Buffer, iconv = require("../"); @@ -137,6 +138,8 @@ describe("UTF-7 codec", function () { assert.equal(iconv.decode(Buffer.from("+AMAA4A-Next"), "utf-7"), "\u00c0\u00e0Next"); assert.equal(iconv.decode(Buffer.from("+AMAA4A!Next"), "utf-7"), "\u00c0\u00e0!Next"); }); + + it("byteLength works correctly", utils.checkByteLength("utf-7")); }); describe("UTF-7-IMAP codec", function () { @@ -220,4 +223,6 @@ describe("UTF-7-IMAP codec", function () { "\u00E4&\u00E4&\u00E4" ); }); + + it("byteLength works correctly", utils.checkByteLength("utf-7-imap")); }); diff --git a/test/utils.js b/test/utils.js index 4dc85842..1c3bd24b 100644 --- a/test/utils.js +++ b/test/utils.js @@ -253,4 +253,23 @@ const utils = (module.exports = { } assert.equal(i, str.length); }, + + checkByteLength(encoding, options) { + return () => { + utils.requireIconv(); + [ + "Hello😀world!", + "😊 Good bye 😊", + "Missing surrogate character \uD83D", + "中文abc", + "iconv-liteへようこそ", + "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", + "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίόύώΆΈΉΊΌΎΏϊϋΪΫ", + ].forEach((content) => { + const actual = utils.iconv.byteLength(content, encoding, options); + const expect = utils.iconv.encode(content, encoding, options).length; + assert.equal(actual, expect); + }); + }; + }, }); From 191aa2af53a45ec64651e3b5cf1f82c55befcc93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ng=E1=BB=8Dc=20Tr=E1=BA=A7n?= Date: Sun, 2 Aug 2020 02:13:44 +0700 Subject: [PATCH 5/7] add byteLength test for main-test --- test/main-test.js | 20 ++++++++++++++++++++ test/utils.js | 23 +++++++++++++---------- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/test/main-test.js b/test/main-test.js index 00e50d95..847b57d1 100644 --- a/test/main-test.js +++ b/test/main-test.js @@ -1,6 +1,7 @@ "use strict"; var assert = require("assert"), + utils = require("./utils"), Buffer = require("safer-buffer").Buffer, iconv = require("../"); @@ -27,6 +28,13 @@ describe("Generic UTF8-UCS2 tests", function () { }); }); + it("UCS2 byteLength works correctly", utils.checkByteLength("UCS2", null, testStringLatin1)); + + it( + "binary byteLength works correctly", + utils.checkByteLength("binary", null, testStringLatin1) + ); + it("Base64 correctly encoded/decoded", function () { assert.strictEqual(iconv.encode(testStringBase64, "base64").toString("binary"), testString); assert.strictEqual( @@ -35,11 +43,18 @@ describe("Generic UTF8-UCS2 tests", function () { ); }); + it( + "Base64 byteLength works correctly", + utils.checkByteLength("base64", null, testStringBase64) + ); + it("Hex correctly encoded/decoded", function () { assert.strictEqual(iconv.encode(testStringHex, "hex").toString("binary"), testString); assert.strictEqual(iconv.decode(Buffer.from(testString, "binary"), "hex"), testStringHex); }); + it("Hex byteLength works correctly", utils.checkByteLength("hex", null, testStringHex)); + it("Latin1 correctly encoded/decoded", function () { assert.strictEqual( iconv.encode(testStringLatin1, "latin1").toString("binary"), @@ -51,6 +66,11 @@ describe("Generic UTF8-UCS2 tests", function () { ); }); + it( + "Latin1 byteLength works correctly", + utils.checkByteLength("Latin1", null, testStringLatin1) + ); + it("Convert to string, not buffer (utf8 used)", function () { assert.throws(function () { iconv.encode(Buffer.from(testStringLatin1, "utf8"), "utf8"); diff --git a/test/utils.js b/test/utils.js index 1c3bd24b..5bd6e555 100644 --- a/test/utils.js +++ b/test/utils.js @@ -254,18 +254,21 @@ const utils = (module.exports = { assert.equal(i, str.length); }, - checkByteLength(encoding, options) { + checkByteLength(encoding, options, _content) { return () => { utils.requireIconv(); - [ - "Hello😀world!", - "😊 Good bye 😊", - "Missing surrogate character \uD83D", - "中文abc", - "iconv-liteへようこそ", - "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", - "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίόύώΆΈΉΊΌΎΏϊϋΪΫ", - ].forEach((content) => { + (_content + ? [_content] + : [ + "Hello😀world!", + "😊 Good bye 😊", + "Missing surrogate character \uD83D", + "中文abc", + "iconv-liteへようこそ", + "АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", + "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩάέήίόύώΆΈΉΊΌΎΏϊϋΪΫ", + ] + ).forEach((content) => { const actual = utils.iconv.byteLength(content, encoding, options); const expect = utils.iconv.encode(content, encoding, options).length; assert.equal(actual, expect); From 965b5fbfc2abd9a34a9a4a1b061ebd5db68e3b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ng=E1=BB=8Dc=20Tr=E1=BA=A7n?= Date: Wed, 16 Sep 2020 15:28:20 +0700 Subject: [PATCH 6/7] format --- encodings/sbcs-codec.js | 12 ++++++++---- encodings/utf16.js | 12 ++++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/encodings/sbcs-codec.js b/encodings/sbcs-codec.js index 1d1ea6e4..a4e2f1b1 100644 --- a/encodings/sbcs-codec.js +++ b/encodings/sbcs-codec.js @@ -62,7 +62,9 @@ class SBCSEncoder { return str.length; } - get hasState() { return false; } + get hasState() { + return false; + } write(str) { const bytes = this.backend.allocBytes(str.length); @@ -74,7 +76,7 @@ class SBCSEncoder { return this.backend.bytesToResult(bytes, bytes.length); } - end() { } + end() {} } class SBCSDecoder { @@ -83,7 +85,9 @@ class SBCSDecoder { this.backend = backend; } - get hasState() { return false; } + get hasState() { + return false; + } write(buf) { // Strings are immutable in JS -> we use ucs2 buffer to speed up computations. @@ -96,5 +100,5 @@ class SBCSDecoder { return this.backend.rawCharsToResult(chars, chars.length); } - end() { } + end() {} } diff --git a/encodings/utf16.js b/encodings/utf16.js index ddfb4726..f7c2b289 100644 --- a/encodings/utf16.js +++ b/encodings/utf16.js @@ -26,7 +26,9 @@ class Utf16LEEncoder { return str.length * 2; } - get hasState() { return false; } + get hasState() { + return false; + } write(str) { const bytes = this.backend.allocBytes(str.length * 2); @@ -37,7 +39,7 @@ class Utf16LEEncoder { return this.backend.bytesToResult(bytes, bytes.length); } - end() { } + end() {} } class Utf16LEDecoder { @@ -172,7 +174,9 @@ class Utf16BEEncoder { return str.length * 2; } - get hasState() { return false; } + get hasState() { + return false; + } write(str) { const bytes = this.backend.allocBytes(str.length * 2); @@ -185,7 +189,7 @@ class Utf16BEEncoder { return this.backend.bytesToResult(bytes, bytesPos); } - end() { } + end() {} } class Utf16BEDecoder { From 8bbd3e4f0419b3c877b9e2d350a4c9267374e12d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ng=E1=BB=8Dc=20Tr=E1=BA=A7n?= Date: Wed, 16 Sep 2020 15:47:58 +0700 Subject: [PATCH 7/7] implement matchAll for older enviroments --- encodings/utf7.js | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/encodings/utf7.js b/encodings/utf7.js index 3a28f519..62f271ac 100644 --- a/encodings/utf7.js +++ b/encodings/utf7.js @@ -19,6 +19,18 @@ Utf7Codec.prototype.bomAware = true; const nonDirectChars = /[^A-Za-z0-9'(),-./:? \n\r\t]+/g; const segmentPattern = /([^A-Za-z0-9'(),-./:? \n\r\t]+)|([A-Za-z0-9'(),-./:? \n\r\t]+)/g; +function* matchAll(str, regExp) { + if (!regExp.global) { + throw new TypeError("Flag /g must be set!"); + } + const localCopy = new RegExp(regExp, regExp.flags); + let match = localCopy.exec(str); + while (match) { + yield match; + match = localCopy.exec(str); + } +} + function Utf7Encoder(options, codec) { this.iconv = codec.iconv; } @@ -26,7 +38,7 @@ function Utf7Encoder(options, codec) { Utf7Encoder.prototype.byteLength = function (str) { var byteLength = 0; - const segments = str.matchAll(segmentPattern); + const segments = matchAll(str, segmentPattern); for (const segment of segments) { if (segment[2] != null) // match group 2: direct chars