fix(HTTP Request Node): Detect and handle non-UTF-8 response encodings (#20889)

This commit is contained in:
Michael Kret 2025-12-01 10:59:07 +02:00 committed by GitHub
parent 6e344f0f29
commit 6068fb3b20
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 522 additions and 2 deletions

View File

@ -44,6 +44,7 @@ import {
import { setFilename } from './utils/binaryData';
import { mimeTypeFromResponse } from './utils/parse';
import { configureResponseOptimizer } from '../shared/optimizeResponse';
import { binaryToStringWithEncodingDetection } from './utils/buffer-decoding';
function toText<T>(data: T) {
if (typeof data === 'object' && data !== null) {
@ -922,7 +923,11 @@ export class HttpRequestV3 implements INodeType {
false,
) as boolean;
const data = await this.helpers.binaryToString(response.body as Buffer | Readable);
const data = await binaryToStringWithEncodingDetection(
response.body as Buffer | Readable,
responseContentType,
this.helpers,
);
response.body = jsonParse(data, {
...(neverError
? { fallbackValue: {} }
@ -934,7 +939,11 @@ export class HttpRequestV3 implements INodeType {
} else {
responseFormat = 'text';
if (!response.__bodyResolved) {
const data = await this.helpers.binaryToString(response.body as Buffer | Readable);
const data = await binaryToStringWithEncodingDetection(
response.body as Buffer | Readable,
responseContentType,
this.helpers,
);
response.body = !data ? undefined : data;
}
}

View File

@ -0,0 +1,72 @@
import type { IExecuteFunctions } from 'n8n-workflow';
import type { Readable } from 'stream';
/** Legacy Chinese charsets that are tried, in this order, as a last-resort re-decode. */
const CHINESE_ENCODINGS = ['gb18030', 'gbk', 'gb2312'] as const;
/**
 * U+FFFD REPLACEMENT CHARACTER, which decoders emit in place of byte sequences
 * that are invalid in the chosen encoding. Written as an escape sequence so the
 * literal cannot be corrupted if this file itself is re-encoded.
 */
const REPLACEMENT_CHAR = '\uFFFD';
/** Matches three or more consecutive code points in the U+0080–U+00FF range. */
const HIGH_ASCII_PATTERN = /[\x80-\xFF]{3,}/;
/** Charset name treated as "no special handling needed". */
const DEFAULT_ENCODING = 'utf-8';
/**
 * Reads the `charset` parameter out of a Content-Type header value.
 *
 * Example: `"text/html; charset=utf-8"` yields `"utf-8"`.
 *
 * @param contentType Raw Content-Type header value; may be missing or empty.
 * @returns The charset, lower-cased and with any single or double quotes removed,
 *          or `undefined` when the header is absent or carries no `charset=`.
 */
function detectEncoding(contentType?: string): BufferEncoding | undefined {
	if (!contentType) return undefined;

	// "charset=" (any letter case) followed by the value, which runs until the
	// next semicolon, comma or whitespace character.
	const found = /charset=([^;,\s]+)/i.exec(contentType);
	if (found === null) return undefined;

	// Group 1 is the raw value; it may still be wrapped in quotes.
	const rawCharset = found[1];
	const normalized = rawCharset.toLowerCase().replace(/['"]/g, '');
	return normalized as BufferEncoding;
}
/**
 * Converts a response body to a string, attempting to recover from non-UTF-8 content.
 *
 * Resolution order:
 * 1. A charset declared in `contentType` that is not `utf-8` is used directly;
 *    no further detection is performed on that result.
 * 2. Otherwise the body is decoded without an explicit encoding.
 * 3. If that result contains U+FFFD or a run of three or more code points in
 *    U+0080–U+00FF, `helpers.detectBinaryEncoding` is consulted and, when it
 *    names something other than `utf-8`, the body is re-decoded with it.
 * 4. Failing that, each entry of `CHINESE_ENCODINGS` is tried in order; the
 *    first non-empty result without U+FFFD wins.
 * 5. If nothing helps, the result of step 2 is returned unchanged.
 *
 * @param body Response body as a Buffer or a readable stream.
 * @param contentType Content-Type header value of the response.
 * @param helpers Execution helpers providing `binaryToBuffer`, `binaryToString`
 *                and `detectBinaryEncoding`.
 * @returns The decoded body.
 */
export async function binaryToStringWithEncodingDetection(
	body: Buffer | Readable,
	contentType: string,
	helpers: IExecuteFunctions['helpers'],
): Promise<string> {
	// Materialize the body once so that it can be decoded several times below.
	const bufferedData = body instanceof Buffer ? body : await helpers.binaryToBuffer(body);

	const encoding = detectEncoding(contentType);
	if (encoding && encoding !== DEFAULT_ENCODING) {
		return await helpers.binaryToString(bufferedData, encoding);
	}

	const decodedString = await helpers.binaryToString(bufferedData);
	const looksMisdecoded =
		decodedString.includes(REPLACEMENT_CHAR) || HIGH_ASCII_PATTERN.test(decodedString);
	if (!looksMisdecoded) return decodedString;

	// The detector may come back with no answer (null, undefined or an empty
	// string); optional chaining keeps that from throwing so the remaining
	// fallbacks still run.
	const detectedRaw: string | null | undefined = helpers.detectBinaryEncoding(bufferedData);
	const detected = detectedRaw?.toLowerCase() as BufferEncoding | undefined;
	if (detected && detected !== DEFAULT_ENCODING) {
		return await helpers.binaryToString(bufferedData, detected);
	}

	for (const chinese of CHINESE_ENCODINGS) {
		try {
			const reDecoded = await helpers.binaryToString(bufferedData, chinese as BufferEncoding);
			if (!reDecoded.includes(REPLACEMENT_CHAR) && reDecoded.length > 0) return reDecoded;
		} catch {
			// Best effort: a candidate that cannot be decoded is skipped and the
			// next one is tried.
		}
	}

	return decodedString;
}

View File

@ -0,0 +1,439 @@
import { Readable } from 'stream';
import type { IExecuteFunctions } from 'n8n-workflow';
import { binaryToStringWithEncodingDetection } from '../buffer-decoding';
/**
 * Unit tests for `binaryToStringWithEncodingDetection`.
 *
 * All helper functions are jest mocks, so these tests pin the decision flow
 * (which helper is called, with which encoding, and how often) rather than real
 * byte decoding. The replacement character is always written as the `\uFFFD`
 * escape so the fixtures survive any re-encoding of this file.
 */
describe('buffer-decoding utils', () => {
	let mockHelpers: IExecuteFunctions['helpers'];
	let mockBinaryToString: jest.MockedFunction<
		(body: Buffer | Readable, encoding?: BufferEncoding) => Promise<string>
	>;
	let mockBinaryToBuffer: jest.MockedFunction<(body: Buffer | Readable) => Promise<Buffer>>;
	let mockDetectBinaryEncoding: jest.MockedFunction<(buffer: Buffer) => string>;

	// Fresh mocks for every test so call counts never leak between cases.
	beforeEach(() => {
		jest.clearAllMocks();
		mockBinaryToString = jest.fn();
		mockBinaryToBuffer = jest.fn();
		mockDetectBinaryEncoding = jest.fn();
		mockHelpers = {
			binaryToString: mockBinaryToString,
			binaryToBuffer: mockBinaryToBuffer,
			detectBinaryEncoding: mockDetectBinaryEncoding,
		} as unknown as IExecuteFunctions['helpers'];
	});

	describe('binaryToStringWithEncodingDetection', () => {
		describe('Content-Type header encoding detection', () => {
			it('should use encoding from Content-Type header (lowercase)', async () => {
				const buffer = Buffer.from('test content', 'utf8');
				const contentType = 'text/html; charset=iso-8859-1';
				mockBinaryToString.mockResolvedValue('test content');

				await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(mockBinaryToString).toHaveBeenCalledWith(buffer, 'iso-8859-1');
			});

			it('should use encoding from Content-Type header (uppercase)', async () => {
				const buffer = Buffer.from('test content', 'utf8');
				const contentType = 'text/html; CHARSET=UTF-16';
				mockBinaryToString.mockResolvedValue('test content');

				await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(mockBinaryToString).toHaveBeenCalledWith(buffer, 'utf-16');
			});

			it('should remove quotes from charset value', async () => {
				const buffer = Buffer.from('test content', 'utf8');
				const contentType = 'text/html; charset="utf-8"';
				mockBinaryToString.mockResolvedValue('test content');

				await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				// Since utf-8 is the default encoding, it should call without encoding parameter
				expect(mockBinaryToString).toHaveBeenCalledWith(buffer);
			});

			it('should handle charset with single quotes', async () => {
				const buffer = Buffer.from('test content', 'utf8');
				const contentType = "text/html; charset='iso-8859-1'";
				mockBinaryToString.mockResolvedValue('test content');

				await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(mockBinaryToString).toHaveBeenCalledWith(buffer, 'iso-8859-1');
			});

			it('should handle charset in complex Content-Type header', async () => {
				const buffer = Buffer.from('test content', 'utf8');
				const contentType = 'text/html; boundary=something; charset=windows-1252; other=value';
				mockBinaryToString.mockResolvedValue('test content');

				await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(mockBinaryToString).toHaveBeenCalledWith(buffer, 'windows-1252');
			});

			it('should fall back to UTF-8 when no charset in Content-Type', async () => {
				const buffer = Buffer.from('test content', 'utf8');
				const contentType = 'text/html';
				mockBinaryToString.mockResolvedValueOnce('test content');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(mockBinaryToString).toHaveBeenCalledWith(buffer);
				expect(result).toBe('test content');
			});
		});

		describe('UTF-8 fallback behavior', () => {
			it('should return UTF-8 decoded string when no encoding issues detected', async () => {
				const buffer = Buffer.from('test content', 'utf8');
				const contentType = 'text/html';
				mockBinaryToString.mockResolvedValue('test content');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(result).toBe('test content');
				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
			});

			it('should not trigger re-encoding when UTF-8 content is clean', async () => {
				const buffer = Buffer.from('Hello World! 🌍', 'utf8');
				const contentType = 'text/html';
				mockBinaryToString.mockResolvedValue('Hello World! 🌍');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(result).toBe('Hello World! 🌍');
				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
			});
		});

		describe('Replacement character detection and re-encoding', () => {
			it('should detect replacement characters and try chardet for Buffer', async () => {
				const buffer = Buffer.from('test content with \uFFFD', 'utf8');
				const contentType = 'text/html';
				mockBinaryToString
					.mockResolvedValueOnce('test content with \uFFFD') // First UTF-8 attempt
					.mockResolvedValueOnce('test content with proper chars'); // Second attempt with detected encoding
				mockDetectBinaryEncoding.mockReturnValue('iso-8859-1');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(mockDetectBinaryEncoding).toHaveBeenCalledWith(buffer);
				expect(mockBinaryToString).toHaveBeenCalledTimes(2);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(1, buffer);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(2, buffer, 'iso-8859-1');
				expect(result).toBe('test content with proper chars');
			});

			it('should detect high ASCII pattern and try chardet for Buffer', async () => {
				const buffer = Buffer.from([0x80, 0x81, 0x82, 0x83]); // High ASCII bytes
				const contentType = 'text/html';
				const highAsciiString = String.fromCharCode(0x80, 0x81, 0x82, 0x83);
				mockBinaryToString
					.mockResolvedValueOnce(highAsciiString) // First UTF-8 attempt
					.mockResolvedValueOnce('proper decoded content'); // Second attempt with detected encoding
				mockDetectBinaryEncoding.mockReturnValue('windows-1252');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(mockDetectBinaryEncoding).toHaveBeenCalledWith(buffer);
				expect(mockBinaryToString).toHaveBeenCalledTimes(2);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(2, buffer, 'windows-1252');
				expect(result).toBe('proper decoded content');
			});

			it('should try Chinese encodings for Readable streams with replacement chars', async () => {
				const readable = new Readable();
				readable.push('content with \uFFFD');
				readable.push(null);
				const contentType = 'text/html';
				const buffer = Buffer.from('content with \uFFFD');
				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString
					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
					.mockResolvedValueOnce('content with proper chars'); // gb18030 attempt (success)
				// Mock chardet to return empty string so it tries Chinese encodings
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await binaryToStringWithEncodingDetection(
					readable,
					contentType,
					mockHelpers,
				);

				expect(mockBinaryToString).toHaveBeenCalledTimes(2);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(1, buffer);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(2, buffer, 'gb18030');
				expect(result).toBe('content with proper chars');
			});

			it('should try all Chinese encodings for Readable streams if needed', async () => {
				const readable = new Readable();
				readable.push('content with \uFFFD');
				readable.push(null);
				const contentType = 'text/html';
				const buffer = Buffer.from('content with \uFFFD');
				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString
					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
					.mockResolvedValueOnce('content with \uFFFD') // gb18030 attempt
					.mockResolvedValueOnce('content with proper chars'); // gbk attempt (success)
				// Mock chardet to return empty string so it tries Chinese encodings
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await binaryToStringWithEncodingDetection(
					readable,
					contentType,
					mockHelpers,
				);

				expect(mockBinaryToString).toHaveBeenCalledTimes(3);
				expect(mockBinaryToString).toHaveBeenNthCalledWith(3, buffer, 'gbk');
				expect(result).toBe('content with proper chars');
			});

			it('should return original string if all Chinese encodings fail for Readable', async () => {
				const readable = new Readable();
				readable.push('content with \uFFFD');
				readable.push(null);
				const contentType = 'text/html';
				const buffer = Buffer.from('content with \uFFFD');
				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString
					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
					.mockResolvedValueOnce('content with \uFFFD') // gb18030 attempt
					.mockResolvedValueOnce('content with \uFFFD') // gbk attempt
					.mockResolvedValueOnce('content with \uFFFD'); // gb2312 attempt
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await binaryToStringWithEncodingDetection(
					readable,
					contentType,
					mockHelpers,
				);

				expect(result).toBe('content with \uFFFD');
			});
		});

		describe('Error handling', () => {
			it('should handle chardet returning null for Buffer', async () => {
				const buffer = Buffer.from('test content with \uFFFD', 'utf8');
				const contentType = 'text/html';
				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString.mockResolvedValue('test content with \uFFFD');
				// An empty string stands in for "no encoding detected" here.
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(result).toBe('test content with \uFFFD');
				expect(mockBinaryToString).toHaveBeenCalledTimes(4); // 1 initial + 3 Chinese encodings
			});

			it('should handle chardet returning same encoding as default', async () => {
				const buffer = Buffer.from('test content with \uFFFD', 'utf8');
				const contentType = 'text/html';
				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString.mockResolvedValue('test content with \uFFFD');
				mockDetectBinaryEncoding.mockReturnValue('utf-8');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(result).toBe('test content with \uFFFD');
				expect(mockBinaryToString).toHaveBeenCalledTimes(4); // 1 initial + 3 Chinese encodings
			});

			it('should handle errors in Chinese encoding attempts gracefully', async () => {
				const readable = new Readable();
				readable.push('content with \uFFFD');
				readable.push(null);
				const contentType = 'text/html';
				const buffer = Buffer.from('content with \uFFFD');
				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString
					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
					.mockRejectedValueOnce(new Error('Encoding error')) // gb18030 error
					.mockResolvedValueOnce('content with proper chars'); // gbk success
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await binaryToStringWithEncodingDetection(
					readable,
					contentType,
					mockHelpers,
				);

				expect(result).toBe('content with proper chars');
			});

			it('should return original string if all Chinese encodings throw errors', async () => {
				const readable = new Readable();
				readable.push('content with \uFFFD');
				readable.push(null);
				const contentType = 'text/html';
				const buffer = Buffer.from('content with \uFFFD');
				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString
					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
					.mockRejectedValue(new Error('Encoding error')); // All Chinese encodings fail
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await binaryToStringWithEncodingDetection(
					readable,
					contentType,
					mockHelpers,
				);

				expect(result).toBe('content with \uFFFD');
			});
		});

		describe('Edge cases', () => {
			it('should handle empty content', async () => {
				const buffer = Buffer.from('', 'utf8');
				const contentType = 'text/html';
				mockBinaryToString.mockResolvedValue('');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(result).toBe('');
				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
			});

			it('should handle empty Readable stream', async () => {
				const readable = new Readable();
				readable.push(null); // Empty stream
				const contentType = 'text/html';
				const buffer = Buffer.from('');
				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString.mockResolvedValue('');

				const result = await binaryToStringWithEncodingDetection(
					readable,
					contentType,
					mockHelpers,
				);

				expect(result).toBe('');
			});

			it('should not re-encode if Chinese encoding returns empty string', async () => {
				const readable = new Readable();
				readable.push('content with \uFFFD');
				readable.push(null);
				const contentType = 'text/html';
				const buffer = Buffer.from('content with \uFFFD');
				mockBinaryToBuffer.mockResolvedValue(buffer);
				mockBinaryToString
					.mockResolvedValueOnce('content with \uFFFD') // First UTF-8 attempt
					.mockResolvedValueOnce(''); // gb18030 returns empty
				mockDetectBinaryEncoding.mockReturnValue('');

				const result = await binaryToStringWithEncodingDetection(
					readable,
					contentType,
					mockHelpers,
				);

				expect(mockBinaryToString).toHaveBeenCalledTimes(4); // Should try all encodings
				expect(result).toBe('content with \uFFFD'); // Should return original
			});

			it('should handle very short high ASCII sequences (less than 3 chars)', async () => {
				const buffer = Buffer.from([0x80, 0x81]); // Only 2 high ASCII bytes
				const contentType = 'text/html';
				const shortHighAsciiString = String.fromCharCode(0x80, 0x81);
				mockBinaryToString.mockResolvedValue(shortHighAsciiString);

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(result).toBe(shortHighAsciiString);
				expect(mockBinaryToString).toHaveBeenCalledTimes(1); // Should not trigger re-encoding
				expect(mockDetectBinaryEncoding).not.toHaveBeenCalled();
			});

			it('should handle mixed content with both replacement chars and high ASCII', async () => {
				const buffer = Buffer.from(
					'test \uFFFD content ' + String.fromCharCode(0x80, 0x81, 0x82),
					'utf8',
				);
				const contentType = 'text/html';
				mockBinaryToString
					.mockResolvedValueOnce('test \uFFFD content ' + String.fromCharCode(0x80, 0x81, 0x82))
					.mockResolvedValueOnce('test proper content decoded');
				mockDetectBinaryEncoding.mockReturnValue('windows-1252');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(result).toBe('test proper content decoded');
				expect(mockDetectBinaryEncoding).toHaveBeenCalledWith(buffer);
			});
		});

		describe('Non-UTF-8 encoding specified in Content-Type', () => {
			it('should skip re-encoding when non-UTF-8 encoding is specified and used', async () => {
				const buffer = Buffer.from('test content', 'utf8');
				const contentType = 'text/html; charset=iso-8859-1';
				mockBinaryToString.mockResolvedValue('test content');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				expect(result).toBe('test content');
				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
				expect(mockBinaryToString).toHaveBeenCalledWith(buffer, 'iso-8859-1');
				expect(mockDetectBinaryEncoding).not.toHaveBeenCalled();
			});

			it('should return result from specified encoding even if it contains replacement chars', async () => {
				const buffer = Buffer.from('test content with \uFFFD', 'utf8');
				const contentType = 'text/html; charset=iso-8859-1';
				mockBinaryToString.mockResolvedValue('test content with \uFFFD');

				const result = await binaryToStringWithEncodingDetection(buffer, contentType, mockHelpers);

				// When a specific encoding is provided, the function uses it directly without re-encoding
				expect(result).toBe('test content with \uFFFD');
				expect(mockBinaryToString).toHaveBeenCalledTimes(1);
				expect(mockBinaryToString).toHaveBeenCalledWith(buffer, 'iso-8859-1');
				expect(mockDetectBinaryEncoding).not.toHaveBeenCalled();
			});
		});
	});
});