From 6068fb3b2008ed6e4cbbd01057bca280c91f021b Mon Sep 17 00:00:00 2001 From: Michael Kret <88898367+michael-radency@users.noreply.github.com> Date: Mon, 1 Dec 2025 10:59:07 +0200 Subject: [PATCH] fix(HTTP Request Node): Detect and handle non-UTF-8 response encodings (#20889) --- .../HttpRequest/V3/HttpRequestV3.node.ts | 13 +- .../HttpRequest/V3/utils/buffer-decoding.ts | 72 +++ .../V3/utils/test/buffer-decoding.test.ts | 439 ++++++++++++++++++ 3 files changed, 522 insertions(+), 2 deletions(-) create mode 100644 packages/nodes-base/nodes/HttpRequest/V3/utils/buffer-decoding.ts create mode 100644 packages/nodes-base/nodes/HttpRequest/V3/utils/test/buffer-decoding.test.ts diff --git a/packages/nodes-base/nodes/HttpRequest/V3/HttpRequestV3.node.ts b/packages/nodes-base/nodes/HttpRequest/V3/HttpRequestV3.node.ts index c1344320a82..4e0e144a6b0 100644 --- a/packages/nodes-base/nodes/HttpRequest/V3/HttpRequestV3.node.ts +++ b/packages/nodes-base/nodes/HttpRequest/V3/HttpRequestV3.node.ts @@ -44,6 +44,7 @@ import { import { setFilename } from './utils/binaryData'; import { mimeTypeFromResponse } from './utils/parse'; import { configureResponseOptimizer } from '../shared/optimizeResponse'; +import { binaryToStringWithEncodingDetection } from './utils/buffer-decoding'; function toText(data: T) { if (typeof data === 'object' && data !== null) { @@ -922,7 +923,11 @@ export class HttpRequestV3 implements INodeType { false, ) as boolean; - const data = await this.helpers.binaryToString(response.body as Buffer | Readable); + const data = await binaryToStringWithEncodingDetection( + response.body as Buffer | Readable, + responseContentType, + this.helpers, + ); response.body = jsonParse(data, { ...(neverError ? 
// packages/nodes-base/nodes/HttpRequest/V3/utils/buffer-decoding.ts
import type { IExecuteFunctions } from 'n8n-workflow';
import type { Readable } from 'stream';

/** Encodings tried, in this order, as a last resort when the UTF-8 decode looks wrong. */
const CHINESE_ENCODINGS = ['gb18030', 'gbk', 'gb2312'] as const;
/** U+FFFD; its presence in decoded text is treated as a sign that the wrong encoding was used. */
const REPLACEMENT_CHAR = '\uFFFD';
/** Three or more consecutive code points in U+0080–U+00FF, treated as a mis-decoding hint. */
const HIGH_ASCII_PATTERN = /[\x80-\xFF]{3,}/;
const DEFAULT_ENCODING = 'utf-8';
/** Charset labels that all mean "the default encoding" and therefore take the heuristic path. */
const DEFAULT_ENCODING_LABELS: readonly string[] = [DEFAULT_ENCODING, 'utf8'];

/** Returns true when `label` (already lower-cased) names the default encoding. */
function isDefaultEncoding(label: string): boolean {
	return DEFAULT_ENCODING_LABELS.includes(label);
}

/**
 * Extracts the charset from a Content-Type header value
 * (e.g. `text/html; charset="UTF-8"` → `utf-8`).
 *
 * @param contentType Raw Content-Type header value, if any.
 * @returns The lower-cased charset label with quotes removed, or `undefined`
 * when the header is missing, has no `charset=` parameter, or the parameter is empty.
 */
function detectEncoding(contentType?: string): BufferEncoding | undefined {
	if (!contentType) return undefined;

	// `charset=` is matched case-insensitively; the value runs up to the next
	// semicolon, comma or whitespace character.
	const charsetMatch = /charset=([^;,\s]+)/i.exec(contentType);
	if (!charsetMatch) return undefined;

	const charset = charsetMatch[1].toLowerCase().replace(/['"]/g, '');
	return charset ? (charset as BufferEncoding) : undefined;
}

/**
 * Decodes `data` with `encoding`, returning `undefined` instead of rejecting.
 *
 * Every re-decode in this module is a best-effort recovery step, so a failure
 * (for example an encoding label the decoder does not know) must never be worse
 * than the plain default decode; callers simply move on to the next candidate.
 */
async function tryDecode(
	helpers: IExecuteFunctions['helpers'],
	data: Buffer,
	encoding: string,
): Promise<string | undefined> {
	try {
		return await helpers.binaryToString(data, encoding as BufferEncoding);
	} catch {
		return undefined;
	}
}

/**
 * Converts a response body to a string, honouring non-UTF-8 encodings.
 *
 * Resolution order:
 * 1. A non-default charset declared in `contentType` is used as-is.
 * 2. Otherwise the body is decoded with the helper's default encoding.
 * 3. If that result contains U+FFFD or a run of three or more U+0080–U+00FF
 *    characters, the encoding reported by `helpers.detectBinaryEncoding` is tried,
 *    followed by each entry of `CHINESE_ENCODINGS`; the first candidate that yields
 *    a non-empty string (for the Chinese fallbacks: also free of U+FFFD) wins.
 * 4. If nothing better is found, the default decode from step 2 is returned.
 *
 * @param body Response body, either already buffered or a stream (which is
 * buffered through `helpers.binaryToBuffer`).
 * @param contentType Content-Type header value of the response.
 * @param helpers Node helpers providing `binaryToString`, `binaryToBuffer` and
 * `detectBinaryEncoding`.
 * @returns The decoded text.
 * @throws Whatever `helpers.binaryToBuffer` or the default-encoding
 * `helpers.binaryToString` call rejects with; failures of re-decode attempts
 * are swallowed.
 */
export async function binaryToStringWithEncodingDetection(
	body: Buffer | Readable,
	contentType: string,
	helpers: IExecuteFunctions['helpers'],
): Promise<string> {
	const bufferedData = body instanceof Buffer ? body : await helpers.binaryToBuffer(body);

	const declaredEncoding = detectEncoding(contentType);
	if (declaredEncoding && !isDefaultEncoding(declaredEncoding)) {
		const viaHeader = await tryDecode(helpers, bufferedData, declaredEncoding);
		// An unusable header charset falls through to the default decode below
		// instead of failing the whole request.
		if (viaHeader !== undefined) return viaHeader;
	}

	const decodedString = await helpers.binaryToString(bufferedData);

	const looksMisdecoded =
		decodedString.includes(REPLACEMENT_CHAR) || HIGH_ASCII_PATTERN.test(decodedString);
	if (!looksMisdecoded) return decodedString;

	// The detector may yield nothing (null/empty) when it cannot classify the
	// bytes, so guard before normalising the label.
	const detectedEncoding = helpers.detectBinaryEncoding(bufferedData)?.toLowerCase();
	if (detectedEncoding && !isDefaultEncoding(detectedEncoding)) {
		const viaDetection = await tryDecode(helpers, bufferedData, detectedEncoding);
		if (viaDetection !== undefined) return viaDetection;
	}

	for (const chineseEncoding of CHINESE_ENCODINGS) {
		const reDecoded = await tryDecode(helpers, bufferedData, chineseEncoding);
		if (reDecoded && !reDecoded.includes(REPLACEMENT_CHAR)) return reDecoded;
	}

	return decodedString;
}
// packages/nodes-base/nodes/HttpRequest/V3/utils/test/buffer-decoding.test.ts
import { Readable } from 'stream';
import type { IExecuteFunctions } from 'n8n-workflow';

import { binaryToStringWithEncodingDetection } from '../buffer-decoding';

type Helpers = IExecuteFunctions['helpers'];

/**
 * Unit tests for `binaryToStringWithEncodingDetection`, run against fully mocked
 * helpers so that only the decision logic (which encoding is tried, and when) is
 * exercised.
 */
describe('buffer-decoding utils', () => {
	// Shared fixtures
	const HTML = 'text/html';
	const LATIN1_HTML = 'text/html; charset=iso-8859-1';
	const PLAIN_TEXT = 'test content';
	const GARBLED_TEXT = 'test content with �';
	const GARBLED_STREAM_TEXT = 'content with �';
	const REPAIRED_STREAM_TEXT = 'content with proper chars';

	let mockBinaryToString: jest.MockedFunction<
		(body: Buffer | Readable, encoding?: BufferEncoding) => Promise<string>
	>;
	let mockBinaryToBuffer: jest.MockedFunction<(body: Buffer | Readable) => Promise<Buffer>>;
	let mockDetectBinaryEncoding: jest.MockedFunction<(buffer: Buffer) => string>;
	let mockHelpers: Helpers;

	/** Runs the function under test with the current set of mocked helpers. */
	const decode = async (body: Buffer | Readable, contentType: string) =>
		await binaryToStringWithEncodingDetection(body, contentType, mockHelpers);

	/** Builds a finished stream carrying `chunks` and registers `buffered` as its buffered form. */
	const stubStream = (chunks: string[], buffered: string) => {
		const stream = new Readable();
		for (const chunk of chunks) stream.push(chunk);
		stream.push(null);

		const buffer = Buffer.from(buffered);
		mockBinaryToBuffer.mockResolvedValue(buffer);
		return { stream, buffer };
	};

	/** Stream whose content holds a replacement character. */
	const stubGarbledStream = () => stubStream([GARBLED_STREAM_TEXT], GARBLED_STREAM_TEXT);

	/** Asserts that a charset declared in the header is passed straight to the decoder. */
	const expectHeaderEncoding = async (contentType: string, expectedEncoding: string) => {
		const buffer = Buffer.from(PLAIN_TEXT, 'utf8');
		mockBinaryToString.mockResolvedValue(PLAIN_TEXT);

		await decode(buffer, contentType);

		expect(mockBinaryToString).toHaveBeenCalledWith(buffer, expectedEncoding);
	};

	beforeEach(() => {
		jest.clearAllMocks();

		mockBinaryToString = jest.fn();
		mockBinaryToBuffer = jest.fn();
		mockDetectBinaryEncoding = jest.fn();

		mockHelpers = {
			binaryToString: mockBinaryToString,
			binaryToBuffer: mockBinaryToBuffer,
			detectBinaryEncoding: mockDetectBinaryEncoding,
		} as unknown as Helpers;
	});

	describe('binaryToStringWithEncodingDetection', () => {
		describe('Content-Type header encoding detection', () => {
			it('should use encoding from Content-Type header (lowercase)', async () => {
				await expectHeaderEncoding('text/html; charset=iso-8859-1', 'iso-8859-1');
			});

			it('should use encoding from Content-Type header (uppercase)', async () => {
				await expectHeaderEncoding('text/html; CHARSET=UTF-16', 'utf-16');
			});

			it('should remove quotes from charset value', async () => {
				const buffer = Buffer.from(PLAIN_TEXT, 'utf8');
				mockBinaryToString.mockResolvedValue(PLAIN_TEXT);

				await decode(buffer, 'text/html; charset="utf-8"');

				// utf-8 is the default, so no explicit encoding argument is expected
				expect(mockBinaryToString).toHaveBeenCalledWith(buffer);
			});

			it('should handle charset with single quotes', async () => {
				await expectHeaderEncoding("text/html; charset='iso-8859-1'", 'iso-8859-1');
			});

			it('should handle charset in complex Content-Type header', async () => {
				await expectHeaderEncoding(
					'text/html; boundary=something; charset=windows-1252; other=value',
					'windows-1252',
				);
			});

			it('should fall back to UTF-8 when no charset in Content-Type', async () => {
				const buffer = Buffer.from(PLAIN_TEXT, 'utf8');
				mockBinaryToString.mockResolvedValueOnce(PLAIN_TEXT);

				const result = await decode(buffer, HTML);

				expect(mockBinaryToString).toHaveBeenCalledWith(buffer);
				expect(result).toBe(PLAIN_TEXT);
			});
		});

		describe('UTF-8 fallback behavior', () => {
			it('should return UTF-8 decoded string when no encoding issues detected', async () => {
				mockBinaryToString.mockResolvedValue(PLAIN_TEXT);

				const result = await decode(Buffer.from(PLAIN_TEXT, 'utf8'), HTML);

				expect(result).toBe(PLAIN_TEXT);
				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
			});

			it('should not trigger re-encoding when UTF-8 content is clean', async () => {
				const greeting = 'Hello World! 🌍';
				mockBinaryToString.mockResolvedValue(greeting);

				const result = await decode(Buffer.from(greeting, 'utf8'), HTML);

				expect(result).toBe(greeting);
				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
			});
		});

		describe('Replacement character detection and re-encoding', () => {
			it('should detect replacement characters and try chardet for Buffer', async () => {
				const buffer = Buffer.from(GARBLED_TEXT, 'utf8');
				const repaired = 'test content with proper chars';

				mockBinaryToString
					.mockResolvedValueOnce(GARBLED_TEXT) // default decode
					.mockResolvedValueOnce(repaired); // decode with the detected encoding
				mockDetectBinaryEncoding.mockReturnValue('iso-8859-1');

				const result = await decode(buffer, HTML);

				expect(mockDetectBinaryEncoding).toHaveBeenCalledWith(buffer);
				expect(mockBinaryToString).toHaveBeenCalledTimes(2);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(1, buffer);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(2, buffer, 'iso-8859-1');
				expect(result).toBe(repaired);
			});

			it('should detect high ASCII pattern and try chardet for Buffer', async () => {
				const highBytes = [0x80, 0x81, 0x82, 0x83];
				const buffer = Buffer.from(highBytes);
				const highAsciiString = String.fromCharCode(...highBytes);

				mockBinaryToString
					.mockResolvedValueOnce(highAsciiString) // default decode
					.mockResolvedValueOnce('proper decoded content'); // decode with the detected encoding
				mockDetectBinaryEncoding.mockReturnValue('windows-1252');

				const result = await decode(buffer, HTML);

				expect(mockDetectBinaryEncoding).toHaveBeenCalledWith(buffer);
				expect(mockBinaryToString).toHaveBeenCalledTimes(2);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(2, buffer, 'windows-1252');
				expect(result).toBe('proper decoded content');
			});

			it('should try Chinese encodings for Readable streams with replacement chars', async () => {
				const { stream, buffer } = stubGarbledStream();

				mockBinaryToString
					.mockResolvedValueOnce(GARBLED_STREAM_TEXT) // default decode
					.mockResolvedValueOnce(REPAIRED_STREAM_TEXT); // gb18030 succeeds
				// An empty detection result pushes the code on to the Chinese encodings
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await decode(stream, HTML);

				expect(mockBinaryToString).toHaveBeenCalledTimes(2);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(1, buffer);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(2, buffer, 'gb18030');
				expect(result).toBe(REPAIRED_STREAM_TEXT);
			});

			it('should try all Chinese encodings for Readable streams if needed', async () => {
				const { stream, buffer } = stubGarbledStream();

				mockBinaryToString
					.mockResolvedValueOnce(GARBLED_STREAM_TEXT) // default decode
					.mockResolvedValueOnce(GARBLED_STREAM_TEXT) // gb18030 still garbled
					.mockResolvedValueOnce(REPAIRED_STREAM_TEXT); // gbk succeeds
				// An empty detection result pushes the code on to the Chinese encodings
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await decode(stream, HTML);

				expect(mockBinaryToString).toHaveBeenCalledTimes(3);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(3, buffer, 'gbk');
				expect(result).toBe(REPAIRED_STREAM_TEXT);
			});

			it('should return original string if all Chinese encodings fail for Readable', async () => {
				const { stream } = stubGarbledStream();

				mockBinaryToString
					.mockResolvedValueOnce(GARBLED_STREAM_TEXT) // default decode
					.mockResolvedValueOnce(GARBLED_STREAM_TEXT) // gb18030
					.mockResolvedValueOnce(GARBLED_STREAM_TEXT) // gbk
					.mockResolvedValueOnce(GARBLED_STREAM_TEXT); // gb2312
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await decode(stream, HTML);

				expect(result).toBe(GARBLED_STREAM_TEXT);
			});
		});

		describe('Error handling', () => {
			it('should handle chardet returning null for Buffer', async () => {
				const buffer = Buffer.from(GARBLED_TEXT, 'utf8');

				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString.mockResolvedValue(GARBLED_TEXT);
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await decode(buffer, HTML);

				expect(result).toBe(GARBLED_TEXT);
				// one default decode plus the three Chinese encodings
				expect(mockBinaryToString).toHaveBeenCalledTimes(4);
			});

			it('should handle chardet returning same encoding as default', async () => {
				const buffer = Buffer.from(GARBLED_TEXT, 'utf8');

				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString.mockResolvedValue(GARBLED_TEXT);
				mockDetectBinaryEncoding.mockReturnValue('utf-8');

				const result = await decode(buffer, HTML);

				expect(result).toBe(GARBLED_TEXT);
				// one default decode plus the three Chinese encodings
				expect(mockBinaryToString).toHaveBeenCalledTimes(4);
			});

			it('should handle errors in Chinese encoding attempts gracefully', async () => {
				const { stream } = stubGarbledStream();

				mockBinaryToString
					.mockResolvedValueOnce(GARBLED_STREAM_TEXT) // default decode
					.mockRejectedValueOnce(new Error('Encoding error')) // gb18030 throws
					.mockResolvedValueOnce(REPAIRED_STREAM_TEXT); // gbk succeeds
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await decode(stream, HTML);

				expect(result).toBe(REPAIRED_STREAM_TEXT);
			});

			it('should return original string if all Chinese encodings throw errors', async () => {
				const { stream } = stubGarbledStream();

				mockBinaryToString
					.mockResolvedValueOnce(GARBLED_STREAM_TEXT) // default decode
					.mockRejectedValue(new Error('Encoding error')); // every fallback throws
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await decode(stream, HTML);

				expect(result).toBe(GARBLED_STREAM_TEXT);
			});
		});

		describe('Edge cases', () => {
			it('should handle empty content', async () => {
				mockBinaryToString.mockResolvedValue('');

				const result = await decode(Buffer.from('', 'utf8'), HTML);

				expect(result).toBe('');
				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
			});

			it('should handle empty Readable stream', async () => {
				const { stream } = stubStream([], '');
				mockBinaryToString.mockResolvedValue('');

				const result = await decode(stream, HTML);

				expect(result).toBe('');
			});

			it('should not re-encode if Chinese encoding returns empty string', async () => {
				const { stream } = stubGarbledStream();

				mockBinaryToString
					.mockResolvedValueOnce(GARBLED_STREAM_TEXT) // default decode
					.mockResolvedValueOnce(''); // gb18030 yields nothing
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await decode(stream, HTML);

				expect(mockBinaryToString).toHaveBeenCalledTimes(4); // every encoding is attempted
				expect(result).toBe(GARBLED_STREAM_TEXT); // the default decode is kept
			});

			it('should handle very short high ASCII sequences (less than 3 chars)', async () => {
				const buffer = Buffer.from([0x80, 0x81]); // two bytes only: below the threshold
				const shortHighAsciiString = String.fromCharCode(0x80, 0x81);
				mockBinaryToString.mockResolvedValue(shortHighAsciiString);

				const result = await decode(buffer, HTML);

				expect(result).toBe(shortHighAsciiString);
				expect(mockBinaryToString).toHaveBeenCalledTimes(1); // no re-encoding attempt
				expect(mockDetectBinaryEncoding).not.toHaveBeenCalled();
			});

			it('should handle mixed content with both replacement chars and high ASCII', async () => {
				const mixed = 'test � content ' + String.fromCharCode(0x80, 0x81, 0x82);
				const buffer = Buffer.from(mixed, 'utf8');

				mockBinaryToString
					.mockResolvedValueOnce(mixed)
					.mockResolvedValueOnce('test proper content decoded');
				mockDetectBinaryEncoding.mockReturnValue('windows-1252');

				const result = await decode(buffer, HTML);

				expect(result).toBe('test proper content decoded');
				expect(mockDetectBinaryEncoding).toHaveBeenCalledWith(buffer);
			});
		});

		describe('Non-UTF-8 encoding specified in Content-Type', () => {
			it('should skip re-encoding when non-UTF-8 encoding is specified and used', async () => {
				const buffer = Buffer.from(PLAIN_TEXT, 'utf8');
				mockBinaryToString.mockResolvedValue(PLAIN_TEXT);

				const result = await decode(buffer, LATIN1_HTML);

				expect(result).toBe(PLAIN_TEXT);
				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
				expect(mockBinaryToString).toHaveBeenCalledWith(buffer, 'iso-8859-1');
				expect(mockDetectBinaryEncoding).not.toHaveBeenCalled();
			});

			it('should return result from specified encoding even if it contains replacement chars', async () => {
				const buffer = Buffer.from(GARBLED_TEXT, 'utf8');
				mockBinaryToString.mockResolvedValue(GARBLED_TEXT);

				const result = await decode(buffer, LATIN1_HTML);

				// An explicitly declared encoding is trusted; no detection or retry happens
				expect(result).toBe(GARBLED_TEXT);
				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
				expect(mockBinaryToString).toHaveBeenCalledWith(buffer, 'iso-8859-1');
				expect(mockDetectBinaryEncoding).not.toHaveBeenCalled();
			});
		});
	});
});