diff --git a/packages/@n8n/ai-utilities/package.json b/packages/@n8n/ai-utilities/package.json index 14d528ca03f..668c0e14cb2 100644 --- a/packages/@n8n/ai-utilities/package.json +++ b/packages/@n8n/ai-utilities/package.json @@ -102,6 +102,7 @@ "tmp-promise": "3.0.3", "js-tiktoken": "catalog:", "https-proxy-agent": "catalog:", + "@thednp/dommatrix": "^2.0.12", "pdf-parse": "catalog:", "proxy-from-env": "^1.1.0", "undici": "^6.21.0" diff --git a/packages/@n8n/ai-utilities/src/__tests__/utils/loaders/n8n-pdf-loader.test.ts b/packages/@n8n/ai-utilities/src/__tests__/utils/loaders/n8n-pdf-loader.test.ts index c47cac30037..7775187e300 100644 --- a/packages/@n8n/ai-utilities/src/__tests__/utils/loaders/n8n-pdf-loader.test.ts +++ b/packages/@n8n/ai-utilities/src/__tests__/utils/loaders/n8n-pdf-loader.test.ts @@ -225,4 +225,35 @@ describe('N8nPdfLoader', () => { blobType: 'application/pdf', }); }); + + // `pdf-parse` v2 is backed by pdfjs-dist, which references the `DOMMatrix` + // global. Node.js does not provide it, so the loader must polyfill it before + // parsing — otherwise pdfjs throws "DOMMatrix is not defined" on PDFs that + // exercise that code path. + describe('DOMMatrix polyfill', () => { + const hadDomMatrix = 'DOMMatrix' in globalThis; + const originalDomMatrix: unknown = Reflect.get(globalThis, 'DOMMatrix'); + + afterAll(() => { + if (hadDomMatrix) { + Reflect.set(globalThis, 'DOMMatrix', originalDomMatrix); + } else { + Reflect.deleteProperty(globalThis, 'DOMMatrix'); + } + }); + + it('defines a usable DOMMatrix global before parsing when one is absent', async () => { + Reflect.deleteProperty(globalThis, 'DOMMatrix'); + mockGetText.mockResolvedValue({ + pages: [{ num: 1, text: 'page' }], + text: 'page', + total: 1, + }); + + const loader = new N8nPdfLoader(makeBlob()); + await loader.load(); + + expect(typeof Reflect.get(globalThis, 'DOMMatrix')).toBe('function'); + }); + }); }); diff --git a/packages/@n8n/ai-utilities/src/utils/loaders/n8n-pdf-loader.ts b/packages/@n8n/ai-utilities/src/utils/loaders/n8n-pdf-loader.ts index 2078a0a7b7f..587db72528c 100644 --- a/packages/@n8n/ai-utilities/src/utils/loaders/n8n-pdf-loader.ts +++ b/packages/@n8n/ai-utilities/src/utils/loaders/n8n-pdf-loader.ts @@ -25,6 +25,13 @@ export class N8nPdfLoader extends BufferLoader { } protected async parse(raw: Buffer, metadata: Record): Promise { + // pdf-parse v2 is backed by pdfjs-dist, which expects a `DOMMatrix` global + // that Node.js does not provide. Polyfill it before parsing. + if (typeof Reflect.get(globalThis, 'DOMMatrix') === 'undefined') { + const { default: DOMMatrix } = await import('@thednp/dommatrix'); + Reflect.set(globalThis, 'DOMMatrix', DOMMatrix); + } + const { PDFParse } = await import('pdf-parse'); // Buffer extends Uint8Array; PDFParse accepts it directly. diff --git a/packages/@n8n/instance-ai/package.json b/packages/@n8n/instance-ai/package.json index 3fdcfaba11d..4510af94057 100644 --- a/packages/@n8n/instance-ai/package.json +++ b/packages/@n8n/instance-ai/package.json @@ -61,6 +61,7 @@ "@n8n/utils": "workspace:*", "@n8n/workflow-sdk": "workspace:*", "@opentelemetry/api": "^1.9.0", + "@thednp/dommatrix": "^2.0.12", "csv-parse": "catalog:", "fast-glob": "catalog:", "flatted": "catalog:", diff --git a/packages/@n8n/instance-ai/src/parsers/__tests__/pdf-parser.test.ts b/packages/@n8n/instance-ai/src/parsers/__tests__/pdf-parser.test.ts index baf21ef4f61..5502bcc1037 100644 --- a/packages/@n8n/instance-ai/src/parsers/__tests__/pdf-parser.test.ts +++ b/packages/@n8n/instance-ai/src/parsers/__tests__/pdf-parser.test.ts @@ -97,4 +97,34 @@ describe('extractPdfText', () => { }), ).rejects.toThrow(/no extractable text/); }); + + // `pdf-parse` v2 is backed by pdfjs-dist, which references the `DOMMatrix` + // global. Node.js does not provide it, so the parser must polyfill it before + // parsing — otherwise pdfjs throws "DOMMatrix is not defined" on PDFs that + // exercise that code path. + describe('DOMMatrix polyfill', () => { + const hadDomMatrix = 'DOMMatrix' in globalThis; + const originalDomMatrix: unknown = Reflect.get(globalThis, 'DOMMatrix'); + + afterAll(() => { + if (hadDomMatrix) { + Reflect.set(globalThis, 'DOMMatrix', originalDomMatrix); + } else { + Reflect.deleteProperty(globalThis, 'DOMMatrix'); + } + }); + + it('defines a usable DOMMatrix global before parsing when one is absent', async () => { + Reflect.deleteProperty(globalThis, 'DOMMatrix'); + mockGetText.mockResolvedValue({ text: 'Hello world', total: 1 }); + + await extractPdfText({ + data: toBase64('pdf-bytes'), + mimeType: 'application/pdf', + fileName: 'doc.pdf', + }); + + expect(typeof Reflect.get(globalThis, 'DOMMatrix')).toBe('function'); + }); + }); }); diff --git a/packages/@n8n/instance-ai/src/parsers/pdf-parser.ts b/packages/@n8n/instance-ai/src/parsers/pdf-parser.ts index 574c1ae6502..c1a16cf95cf 100644 --- a/packages/@n8n/instance-ai/src/parsers/pdf-parser.ts +++ b/packages/@n8n/instance-ai/src/parsers/pdf-parser.ts @@ -22,6 +22,13 @@ export async function extractPdfText(attachment: AttachmentInfo): Promise