fix(HTTP Request Node): Detect and handle non-UTF-8 response encodings (#20889)

2026-05-26 06:17:21 +02:00 · 2025-12-01 10:59:07 +02:00 · 2025-12-01 10:59:07 +02:00 · 6068fb3b20
commit 6068fb3b20
parent 6e344f0f29
3 changed files with 522 additions and 2 deletions
--- a/packages/nodes-base/nodes/HttpRequest/V3/HttpRequestV3.node.ts
+++ b/packages/nodes-base/nodes/HttpRequest/V3/HttpRequestV3.node.ts
@ -44,6 +44,7 @@ import {
 import { setFilename } from './utils/binaryData';
 import { mimeTypeFromResponse } from './utils/parse';
 import { configureResponseOptimizer } from '../shared/optimizeResponse';
+import { binaryToStringWithEncodingDetection } from './utils/buffer-decoding';

 function toText<T>(data: T) {
 	if (typeof data === 'object' && data !== null) {
@ -922,7 +923,11 @@ export class HttpRequestV3 implements INodeType {
 									false,
 								) as boolean;

-								const data = await this.helpers.binaryToString(response.body as Buffer | Readable);
+								const data = await binaryToStringWithEncodingDetection(
+									response.body as Buffer | Readable,
+									responseContentType,
+									this.helpers,
+								);
 								response.body = jsonParse(data, {
 									...(neverError
 										? { fallbackValue: {} }
@ -934,7 +939,11 @@ export class HttpRequestV3 implements INodeType {
 						} else {
 							responseFormat = 'text';
 							if (!response.__bodyResolved) {
-								const data = await this.helpers.binaryToString(response.body as Buffer | Readable);
+								const data = await binaryToStringWithEncodingDetection(
+									response.body as Buffer | Readable,
+									responseContentType,
+									this.helpers,
+								);
 								response.body = !data ? undefined : data;
 							}
 						}
--- a/packages/nodes-base/nodes/HttpRequest/V3/utils/buffer-decoding.ts
+++ b/packages/nodes-base/nodes/HttpRequest/V3/utils/buffer-decoding.ts
@ -0,0 +1,72 @@
+import type { IExecuteFunctions } from 'n8n-workflow';
+import type { Readable } from 'stream';
+
+// Encodings tried, in this order, when charset detection gives nothing usable.
+const CHINESE_ENCODINGS = ['gb18030', 'gbk', 'gb2312'] as const;
+// U+FFFD REPLACEMENT CHARACTER: its presence in decoded text is used as a sign that
+// the bytes were decoded with the wrong encoding. Written as an escape sequence so the
+// value cannot be corrupted if this file itself is re-encoded.
+const REPLACEMENT_CHAR = '\uFFFD';
+// Three or more consecutive characters in the range U+0080–U+00FF.
+const HIGH_ASCII_PATTERN = /[\x80-\xFF]{3,}/;
+// Charset for which no explicit encoding is passed to the decoding helper.
+const DEFAULT_ENCODING = 'utf-8';
+
+/**
+ * Extracts the charset declared in a Content-Type header value
+ * (e.g. "text/html; charset=ISO-8859-1" → "iso-8859-1").
+ *
+ * The value is lower-cased and stripped of single/double quotes. The alias "utf8" is
+ * normalised to "utf-8" so both spellings compare equal to the default encoding.
+ *
+ * @param contentType Raw Content-Type header value; may be missing or empty.
+ * @returns The declared charset, or `undefined` when the header is missing, declares no
+ *   charset, or declares an empty one. The value is NOT checked against the list of
+ *   encodings the decoder actually supports.
+ */
+function detectEncoding(contentType?: string): BufferEncoding | undefined {
+	if (!contentType) return undefined;
+
+	// Case-insensitive "charset=", then everything up to the next `;`, `,` or whitespace.
+	const charsetMatch = contentType.match(/charset=([^;,\s]+)/i);
+	if (!charsetMatch) return undefined;
+
+	const charset = charsetMatch[1].toLowerCase().replace(/['"]/g, '');
+
+	// `charset=""` leaves nothing once the quotes are removed: treat as "not declared".
+	if (!charset) return undefined;
+
+	// "utf8" and "utf-8" name the same encoding; return one canonical spelling.
+	return (charset === 'utf8' ? 'utf-8' : charset) as BufferEncoding;
+}
+
+/**
+ * Decodes an HTTP response body to text, choosing the encoding in this order:
+ *
+ * 1. The charset declared in `contentType`, when one is declared and it differs from
+ *    the default. A declared charset is trusted: its result is returned unchecked.
+ * 2. The helper's default decoding (no explicit encoding passed). If that result contains
+ *    U+FFFD or a run of three or more characters in U+0080–U+00FF it is treated as a
+ *    probable mis-decode; `helpers.detectBinaryEncoding` is consulted first, then each
+ *    entry of `CHINESE_ENCODINGS` in order.
+ * 3. If no alternative yields a non-empty result free of U+FFFD, the result of step 2.
+ *
+ * @param body Response body; a stream is read into a buffer before decoding.
+ * @param contentType Value of the response's Content-Type header.
+ * @param helpers Execution helpers providing `binaryToBuffer`, `binaryToString` and
+ *   `detectBinaryEncoding`.
+ * @returns The decoded body.
+ */
+export async function binaryToStringWithEncodingDetection(
+	body: Buffer | Readable,
+	contentType: string,
+	helpers: IExecuteFunctions['helpers'],
+): Promise<string> {
+	// Buffer the body once: several decode attempts may be made on the same bytes.
+	const bufferedData = body instanceof Buffer ? body : await helpers.binaryToBuffer(body);
+
+	const encoding = detectEncoding(contentType);
+	if (encoding && encoding !== DEFAULT_ENCODING) {
+		return await helpers.binaryToString(bufferedData, encoding);
+	}
+
+	const decodedString = await helpers.binaryToString(bufferedData);
+
+	const looksMisdecoded =
+		decodedString.includes(REPLACEMENT_CHAR) || HIGH_ASCII_PATTERN.test(decodedString);
+	if (!looksMisdecoded) return decodedString;
+
+	// The helper is typed as returning a string, but a detector that cannot decide may
+	// hand back null/undefined; guard before lower-casing instead of throwing a TypeError.
+	const detectedRaw: string | null | undefined = helpers.detectBinaryEncoding(bufferedData);
+	const detected = detectedRaw?.toLowerCase();
+	if (detected && detected !== DEFAULT_ENCODING) {
+		try {
+			return await helpers.binaryToString(bufferedData, detected as BufferEncoding);
+		} catch {
+			// The detector may name an encoding the decoder rejects; fall through to the
+			// remaining candidates rather than failing the whole request.
+		}
+	}
+
+	for (const chinese of CHINESE_ENCODINGS) {
+		try {
+			const reDecoded = await helpers.binaryToString(bufferedData, chinese as BufferEncoding);
+			if (reDecoded.length > 0 && !reDecoded.includes(REPLACEMENT_CHAR)) return reDecoded;
+		} catch {
+			// Best effort: a candidate that cannot be decoded is simply skipped.
+		}
+	}
+
+	return decodedString;
+}
--- a/packages/nodes-base/nodes/HttpRequest/V3/utils/test/buffer-decoding.test.ts
+++ b/packages/nodes-base/nodes/HttpRequest/V3/utils/test/buffer-decoding.test.ts
@ -0,0 +1,439 @@
+import { Readable } from 'stream';
+import type { IExecuteFunctions } from 'n8n-workflow';
+
+import { binaryToStringWithEncodingDetection } from '../buffer-decoding';
+
+describe('buffer-decoding utils', () => {
+	// Shared fixtures, re-created before every test so call counts never leak between tests.
+	// `mockHelpers` is the object handed to the function under test; the three typed
+	// handles below are the same mocks it contains, kept separately so tests can program
+	// return values and assert on calls.
+	let mockHelpers: IExecuteFunctions['helpers'];
+	let mockBinaryToString: jest.MockedFunction<
+		(body: Buffer | Readable, encoding?: BufferEncoding) => Promise<string>
+	>;
+	let mockBinaryToBuffer: jest.MockedFunction<(body: Buffer | Readable) => Promise<Buffer>>;
+	let mockDetectBinaryEncoding: jest.MockedFunction<(buffer: Buffer) => string>;
+
+	beforeEach(() => {
+		jest.clearAllMocks();
+		mockBinaryToString = jest.fn();
+		mockBinaryToBuffer = jest.fn();
+		mockDetectBinaryEncoding = jest.fn();
+		// Only the three helpers used by the function under test are provided, hence the
+		// double cast to the full helpers type.
+		mockHelpers = {
+			binaryToString: mockBinaryToString,
+			binaryToBuffer: mockBinaryToBuffer,
+			detectBinaryEncoding: mockDetectBinaryEncoding,
+		} as unknown as IExecuteFunctions['helpers'];
+	});
+
+	describe('binaryToStringWithEncodingDetection', () => {
+		describe('Content-Type header encoding detection', () => {
+			// Every case decodes the same ASCII payload; only the header under test varies.
+			const payload = Buffer.from('test content', 'utf8');
+
+			// Programs the decoder mock and runs the function under test with `header`.
+			const decodeWith = async (header: string) => {
+				mockBinaryToString.mockResolvedValue('test content');
+				return await binaryToStringWithEncodingDetection(payload, header, mockHelpers);
+			};
+
+			it('should use encoding from Content-Type header (lowercase)', async () => {
+				await decodeWith('text/html; charset=iso-8859-1');
+
+				expect(mockBinaryToString).toHaveBeenCalledWith(payload, 'iso-8859-1');
+			});
+
+			it('should use encoding from Content-Type header (uppercase)', async () => {
+				await decodeWith('text/html; CHARSET=UTF-16');
+
+				expect(mockBinaryToString).toHaveBeenCalledWith(payload, 'utf-16');
+			});
+
+			it('should remove quotes from charset value', async () => {
+				await decodeWith('text/html; charset="utf-8"');
+
+				// utf-8 is the default encoding, so the decoder is called without an encoding argument
+				expect(mockBinaryToString).toHaveBeenCalledWith(payload);
+			});
+
+			it('should handle charset with single quotes', async () => {
+				await decodeWith("text/html; charset='iso-8859-1'");
+
+				expect(mockBinaryToString).toHaveBeenCalledWith(payload, 'iso-8859-1');
+			});
+
+			it('should handle charset in complex Content-Type header', async () => {
+				await decodeWith('text/html; boundary=something; charset=windows-1252; other=value');
+
+				expect(mockBinaryToString).toHaveBeenCalledWith(payload, 'windows-1252');
+			});
+
+			it('should fall back to UTF-8 when no charset in Content-Type', async () => {
+				mockBinaryToString.mockResolvedValueOnce('test content');
+
+				const decoded = await binaryToStringWithEncodingDetection(
+					payload,
+					'text/html',
+					mockHelpers,
+				);
+
+				expect(mockBinaryToString).toHaveBeenCalledWith(payload);
+				expect(decoded).toBe('test content');
+			});
+		});
+
+		describe('UTF-8 fallback behavior', () => {
+			// Clean UTF-8 text must come back unchanged after exactly one decode call.
+			it.each([
+				['should return UTF-8 decoded string when no encoding issues detected', 'test content'],
+				['should not trigger re-encoding when UTF-8 content is clean', 'Hello World! 🌍'],
+			])('%s', async (_title, text) => {
+				const payload = Buffer.from(text, 'utf8');
+				mockBinaryToString.mockResolvedValue(text);
+
+				const decoded = await binaryToStringWithEncodingDetection(
+					payload,
+					'text/html',
+					mockHelpers,
+				);
+
+				expect(decoded).toBe(text);
+				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
+			});
+		});
+
+		describe('Replacement character detection and re-encoding', () => {
+			// Fixtures use '\uFFFD' (U+FFFD REPLACEMENT CHARACTER) as an escape sequence so
+			// they cannot be corrupted if this file is re-encoded.
+			it('should detect replacement characters and try chardet for Buffer', async () => {
+				const buffer = Buffer.from('test content with \uFFFD', 'utf8');
+				const contentType = 'text/html';
+
+				mockBinaryToString
+					.mockResolvedValueOnce('test content with \uFFFD') // First UTF-8 attempt
+					.mockResolvedValueOnce('test content with proper chars'); // Second attempt with detected encoding
+
+				mockDetectBinaryEncoding.mockReturnValue('iso-8859-1');
+
+				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);
+
+				expect(mockDetectBinaryEncoding).toHaveBeenCalledWith(buffer);
+				expect(mockBinaryToString).toHaveBeenCalledTimes(2);
+				expect(mockBinaryToString).toHaveBeenNthCalledWith(1, buffer);
+				expect(mockBinaryToString).toHaveBeenNthCalledWith(2, buffer, 'iso-8859-1');
+				expect(result).toBe('test content with proper chars');
+			});
+
+			it('should detect high ASCII pattern and try chardet for Buffer', async () => {
+				const buffer = Buffer.from([0x80, 0x81, 0x82, 0x83]); // High ASCII bytes
+				const contentType = 'text/html';
+				const highAsciiString = String.fromCharCode(0x80, 0x81, 0x82, 0x83);
+
+				mockBinaryToString
+					.mockResolvedValueOnce(highAsciiString) // First UTF-8 attempt
+					.mockResolvedValueOnce('proper decoded content'); // Second attempt with detected encoding
+
+				mockDetectBinaryEncoding.mockReturnValue('windows-1252');
+
+				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);
+
+				expect(mockDetectBinaryEncoding).toHaveBeenCalledWith(buffer);
+				expect(mockBinaryToString).toHaveBeenCalledTimes(2);
+				expect(mockBinaryToString).toHaveBeenNthCalledWith(2, buffer, 'windows-1252');
+				expect(result).toBe('proper decoded content');
+			});
+
+			it('should try Chinese encodings for Readable streams with replacement chars', async () => {
+				const readable = new Readable();
+				readable.push('content with \uFFFD');
+				readable.push(null);
+				const contentType = 'text/html';
+				const buffer = Buffer.from('content with \uFFFD');
+
+				mockBinaryToBuffer.mockResolvedValue(buffer);
+				mockBinaryToString
+					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
+					.mockResolvedValueOnce('content with proper chars'); // gb18030 attempt (success)
+
+				// Mock chardet to return empty string so it tries Chinese encodings
+				mockDetectBinaryEncoding.mockReturnValue('');
+
+				const result = await binaryToStringWithEncodingDetection(
+					readable,
+					contentType,
+					mockHelpers,
+				);
+
+				expect(mockBinaryToString).toHaveBeenCalledTimes(2);
+				expect(mockBinaryToString).toHaveBeenNthCalledWith(1, buffer);
+				expect(mockBinaryToString).toHaveBeenNthCalledWith(2, buffer, 'gb18030');
+				expect(result).toBe('content with proper chars');
+			});
+
+			it('should try all Chinese encodings for Readable streams if needed', async () => {
+				const readable = new Readable();
+				readable.push('content with \uFFFD');
+				readable.push(null);
+				const contentType = 'text/html';
+				const buffer = Buffer.from('content with \uFFFD');
+
+				mockBinaryToBuffer.mockResolvedValue(buffer);
+				mockBinaryToString
+					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
+					.mockResolvedValueOnce('content with \uFFFD') // gb18030 attempt
+					.mockResolvedValueOnce('content with proper chars'); // gbk attempt (success)
+
+				// Mock chardet to return empty string so it tries Chinese encodings
+				mockDetectBinaryEncoding.mockReturnValue('');
+
+				const result = await binaryToStringWithEncodingDetection(
+					readable,
+					contentType,
+					mockHelpers,
+				);
+
+				expect(mockBinaryToString).toHaveBeenCalledTimes(3);
+				expect(mockBinaryToString).toHaveBeenNthCalledWith(3, buffer, 'gbk');
+				expect(result).toBe('content with proper chars');
+			});
+
+			it('should return original string if all Chinese encodings fail for Readable', async () => {
+				const readable = new Readable();
+				readable.push('content with \uFFFD');
+				readable.push(null);
+				const contentType = 'text/html';
+				const buffer = Buffer.from('content with \uFFFD');
+
+				mockBinaryToBuffer.mockResolvedValue(buffer);
+				mockBinaryToString
+					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
+					.mockResolvedValueOnce('content with \uFFFD') // gb18030 attempt
+					.mockResolvedValueOnce('content with \uFFFD') // gbk attempt
+					.mockResolvedValueOnce('content with \uFFFD'); // gb2312 attempt
+
+				mockDetectBinaryEncoding.mockReturnValue('');
+
+				const result = await binaryToStringWithEncodingDetection(
+					readable,
+					contentType,
+					mockHelpers,
+				);
+
+				expect(result).toBe('content with \uFFFD');
+			});
+		});
+
+		describe('Error handling', () => {
+			// Fixtures use '\uFFFD' (U+FFFD REPLACEMENT CHARACTER) as an escape sequence so
+			// they cannot be corrupted if this file is re-encoded.
+
+			// NOTE: the detector mock is typed as returning `string`, so "no result" is
+			// simulated with an empty string here rather than a literal null.
+			it('should handle chardet returning null for Buffer', async () => {
+				const buffer = Buffer.from('test content with \uFFFD', 'utf8');
+				const contentType = 'text/html';
+
+				mockBinaryToBuffer.mockResolvedValue(buffer);
+				mockBinaryToString.mockResolvedValue('test content with \uFFFD');
+				mockDetectBinaryEncoding.mockReturnValue('');
+
+				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);
+
+				expect(result).toBe('test content with \uFFFD');
+				expect(mockBinaryToString).toHaveBeenCalledTimes(4); // 1 initial + 3 Chinese encodings
+			});
+
+			it('should handle chardet returning same encoding as default', async () => {
+				const buffer = Buffer.from('test content with \uFFFD', 'utf8');
+				const contentType = 'text/html';
+
+				mockBinaryToBuffer.mockResolvedValue(buffer);
+				mockBinaryToString.mockResolvedValue('test content with \uFFFD');
+				mockDetectBinaryEncoding.mockReturnValue('utf-8');
+
+				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);
+
+				expect(result).toBe('test content with \uFFFD');
+				expect(mockBinaryToString).toHaveBeenCalledTimes(4); // 1 initial + 3 Chinese encodings
+			});
+
+			it('should handle errors in Chinese encoding attempts gracefully', async () => {
+				const readable = new Readable();
+				readable.push('content with \uFFFD');
+				readable.push(null);
+				const contentType = 'text/html';
+				const buffer = Buffer.from('content with \uFFFD');
+
+				mockBinaryToBuffer.mockResolvedValue(buffer);
+				mockBinaryToString
+					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
+					.mockRejectedValueOnce(new Error('Encoding error')) // gb18030 error
+					.mockResolvedValueOnce('content with proper chars'); // gbk success
+
+				mockDetectBinaryEncoding.mockReturnValue('');
+
+				const result = await binaryToStringWithEncodingDetection(
+					readable,
+					contentType,
+					mockHelpers,
+				);
+
+				expect(result).toBe('content with proper chars');
+			});
+
+			it('should return original string if all Chinese encodings throw errors', async () => {
+				const readable = new Readable();
+				readable.push('content with \uFFFD');
+				readable.push(null);
+				const contentType = 'text/html';
+				const buffer = Buffer.from('content with \uFFFD');
+
+				mockBinaryToBuffer.mockResolvedValue(buffer);
+				mockBinaryToString
+					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
+					.mockRejectedValue(new Error('Encoding error')); // All Chinese encodings fail
+
+				mockDetectBinaryEncoding.mockReturnValue('');
+
+				const result = await binaryToStringWithEncodingDetection(
+					readable,
+					contentType,
+					mockHelpers,
+				);
+
+				expect(result).toBe('content with \uFFFD');
+			});
+		});
+
+		describe('Edge cases', () => {
+			// Fixtures use '\uFFFD' (U+FFFD REPLACEMENT CHARACTER) as an escape sequence so
+			// they cannot be corrupted if this file is re-encoded.
+			it('should handle empty content', async () => {
+				const buffer = Buffer.from('', 'utf8');
+				const contentType = 'text/html';
+
+				mockBinaryToString.mockResolvedValue('');
+
+				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);
+
+				expect(result).toBe('');
+				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
+			});
+
+			it('should handle empty Readable stream', async () => {
+				const readable = new Readable();
+				readable.push(null); // Empty stream
+				const contentType = 'text/html';
+				const buffer = Buffer.from('');
+
+				mockBinaryToBuffer.mockResolvedValue(buffer);
+				mockBinaryToString.mockResolvedValue('');
+
+				const result = await binaryToStringWithEncodingDetection(
+					readable,
+					contentType,
+					mockHelpers,
+				);
+
+				expect(result).toBe('');
+			});
+
+			it('should not re-encode if Chinese encoding returns empty string', async () => {
+				const readable = new Readable();
+				readable.push('content with \uFFFD');
+				readable.push(null);
+				const contentType = 'text/html';
+				const buffer = Buffer.from('content with \uFFFD');
+
+				mockBinaryToBuffer.mockResolvedValue(buffer);
+				mockBinaryToString
+					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
+					.mockResolvedValueOnce(''); // gb18030 returns empty
+
+				mockDetectBinaryEncoding.mockReturnValue('');
+
+				const result = await binaryToStringWithEncodingDetection(
+					readable,
+					contentType,
+					mockHelpers,
+				);
+
+				expect(mockBinaryToString).toHaveBeenCalledTimes(4); // Should try all encodings
+				expect(result).toBe('content with \uFFFD'); // Should return original
+			});
+
+			it('should handle very short high ASCII sequences (less than 3 chars)', async () => {
+				const buffer = Buffer.from([0x80, 0x81]); // Only 2 high ASCII bytes
+				const contentType = 'text/html';
+				const shortHighAsciiString = String.fromCharCode(0x80, 0x81);
+
+				mockBinaryToString.mockResolvedValue(shortHighAsciiString);
+
+				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);
+
+				expect(result).toBe(shortHighAsciiString);
+				expect(mockBinaryToString).toHaveBeenCalledTimes(1); // Should not trigger re-encoding
+				expect(mockDetectBinaryEncoding).not.toHaveBeenCalled();
+			});
+
+			it('should handle mixed content with both replacement chars and high ASCII', async () => {
+				const buffer = Buffer.from(
+					'test \uFFFD content ' + String.fromCharCode(0x80, 0x81, 0x82),
+					'utf8',
+				);
+				const contentType = 'text/html';
+
+				mockBinaryToString
+					.mockResolvedValueOnce('test \uFFFD content ' + String.fromCharCode(0x80, 0x81, 0x82))
+					.mockResolvedValueOnce('test proper content decoded');
+
+				mockDetectBinaryEncoding.mockReturnValue('windows-1252');
+
+				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);
+
+				expect(result).toBe('test proper content decoded');
+				expect(mockDetectBinaryEncoding).toHaveBeenCalledWith(buffer);
+			});
+		});
+
+		describe('Non-UTF-8 encoding specified in Content-Type', () => {
+			// A charset declared in the header is trusted: one decode call, no detection.
+			it('should skip re-encoding when non-UTF-8 encoding is specified and used', async () => {
+				const buffer = Buffer.from('test content', 'utf8');
+				const contentType = 'text/html; charset=iso-8859-1';
+
+				mockBinaryToString.mockResolvedValue('test content');
+
+				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);
+
+				expect(result).toBe('test content');
+				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
+				expect(mockBinaryToString).toHaveBeenCalledWith(buffer, 'iso-8859-1');
+				expect(mockDetectBinaryEncoding).not.toHaveBeenCalled();
+			});
+
+			// '\uFFFD' is U+FFFD REPLACEMENT CHARACTER, written as an escape sequence so the
+			// fixture cannot be corrupted if this file is re-encoded.
+			it('should return result from specified encoding even if it contains replacement chars', async () => {
+				const buffer = Buffer.from('test content with \uFFFD', 'utf8');
+				const contentType = 'text/html; charset=iso-8859-1';
+
+				mockBinaryToString.mockResolvedValue('test content with \uFFFD');
+
+				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);
+
+				// When a specific encoding is provided, the function uses it directly without re-encoding
+				expect(result).toBe('test content with \uFFFD');
+				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
+				expect(mockBinaryToString).toHaveBeenCalledWith(buffer, 'iso-8859-1');
+				expect(mockDetectBinaryEncoding).not.toHaveBeenCalled();
+			});
+		});
+	});
+});