fix(core): Use pdf-parse v2 in Document Loader to fix PDF embedding (#30961)

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Arvin A 2026-05-26 17:13:39 +02:00 committed by GitHub
parent cf1a6fa18c
commit b5c53ff3ea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 516 additions and 86 deletions

View File

@ -124,6 +124,7 @@
"date-fns": "2.30.0",
"date-fns-tz": "2.0.0",
"form-data": "4.0.4",
"pdf-parse": "^2.4.5",
"tmp": "0.2.4",
"nodemailer": "7.0.11",
"validator": "13.15.26",

View File

@ -90,6 +90,7 @@
"tmp-promise": "3.0.3",
"js-tiktoken": "catalog:",
"https-proxy-agent": "catalog:",
"pdf-parse": "2.4.5",
"proxy-from-env": "^1.1.0",
"undici": "^6.21.0"
},

View File

@ -0,0 +1,228 @@
import { LoggerProxy } from 'n8n-workflow';
import { N8nPdfLoader } from 'src/utils/loaders/n8n-pdf-loader';
// Shared spies for the fake `pdf-parse` module below. Each test configures
// them (resolve / reject) to drive a specific code path in the loader.
// NOTE(review): jest hoists `jest.mock()` calls; the `mock` prefix on these
// names is presumably what permits referencing them from the factory —
// confirm before renaming any of them.
const mockGetText = jest.fn();
const mockGetInfo = jest.fn();
const mockDestroy = jest.fn();
// Records the options object handed to `new PDFParse(...)`.
const mockConstructor = jest.fn();
// Receives whatever the loader logs through `LoggerProxy` at debug level.
const mockLoggerDebug = jest.fn();
// Replace `pdf-parse` with a stub whose `PDFParse` constructor forwards its
// options to `mockConstructor` and returns an object exposing the shared
// `getText` / `getInfo` / `destroy` spies.
jest.mock('pdf-parse', () => ({
__esModule: true,
PDFParse: jest.fn().mockImplementation((options: unknown) => {
mockConstructor(options);
return {
getText: mockGetText,
getInfo: mockGetInfo,
destroy: mockDestroy,
};
}),
}));
// Install a logger whose `debug` channel is observable by the tests; the
// other levels are throwaway mocks.
LoggerProxy.init({
debug: mockLoggerDebug,
info: jest.fn(),
warn: jest.fn(),
error: jest.fn(),
});
/**
 * Builds an in-memory Blob tagged as a PDF for feeding into the loader.
 *
 * @param content - Text whose bytes become the Blob payload.
 * @returns A Blob with MIME type `application/pdf`.
 */
function makeBlob(content = 'fake-pdf-bytes'): Blob {
	const payload = Buffer.from(content);
	const blobOptions = { type: 'application/pdf' };
	return new Blob([payload], blobOptions);
}
describe('N8nPdfLoader', () => {
	/**
	 * Builds the object the mocked `getText()` resolves with: pages numbered
	 * from 1, `text` as the plain concatenation of the page texts, and
	 * `total` as the page count.
	 */
	const textResult = (...pageTexts: string[]) => ({
		pages: pageTexts.map((text, index) => ({ num: index + 1, text })),
		text: pageTexts.join(''),
		total: pageTexts.length,
	});

	beforeEach(() => {
		// Wipe call history and implementations on every shared spy first...
		for (const spy of [mockGetText, mockGetInfo, mockDestroy, mockConstructor, mockLoggerDebug]) {
			spy.mockReset();
		}
		// ...then restore the benign defaults most tests rely on.
		mockGetInfo.mockResolvedValue({ info: undefined, metadata: undefined });
		mockDestroy.mockResolvedValue(undefined);
	});

	it('produces one Document per page with loc.pageNumber and pdf.totalPages when splitPages is true', async () => {
		const pdfInfo = {
			info: { Title: 'Sample', Author: 'Test' },
			metadata: { fake: 'xmp' },
		};
		mockGetText.mockResolvedValue(
			textResult('Page one body', 'Page two body', 'Page three body'),
		);
		mockGetInfo.mockResolvedValue(pdfInfo);

		const docs = await new N8nPdfLoader(makeBlob(), { splitPages: true }).load();

		expect(docs.map((doc) => doc.pageContent)).toEqual([
			'Page one body',
			'Page two body',
			'Page three body',
		]);
		expect(docs[0].metadata).toMatchObject({
			loc: { pageNumber: 1 },
			pdf: { ...pdfInfo, totalPages: 3 },
		});
		expect(docs[2].metadata.loc).toEqual({ pageNumber: 3 });
	});

	it('concatenates pages into a single Document when splitPages is false', async () => {
		mockGetText.mockResolvedValue(textResult('first', 'second'));

		const docs = await new N8nPdfLoader(makeBlob(), { splitPages: false }).load();

		expect(docs).toHaveLength(1);
		const [merged] = docs;
		expect(merged.pageContent).toBe('first\n\nsecond');
		expect(merged.metadata.pdf).toEqual({
			info: undefined,
			metadata: undefined,
			totalPages: 2,
		});
		// The merged document carries no `loc.pageNumber`, in line with
		// LangChain's PDFLoader behavior when splitPages is false.
		expect(merged.metadata.loc).toBeUndefined();
	});

	it('defaults splitPages to true when options are omitted', async () => {
		mockGetText.mockResolvedValue(textResult('only page'));

		const docs = await new N8nPdfLoader(makeBlob()).load();

		expect(docs).toHaveLength(1);
		expect(docs[0].metadata.loc).toEqual({ pageNumber: 1 });
	});

	it('returns [] when the PDF has zero pages and splitPages is false', async () => {
		mockGetText.mockResolvedValue(textResult());

		const docs = await new N8nPdfLoader(makeBlob(), { splitPages: false }).load();

		expect(docs).toEqual([]);
	});

	it('disables the per-page joiner so extracted text is not polluted with separators', async () => {
		mockGetText.mockResolvedValue(textResult('clean text'));

		await new N8nPdfLoader(makeBlob()).load();

		expect(mockGetText).toHaveBeenCalledWith({ pageJoiner: '' });
	});

	it('passes binary contents to PDFParse as a Uint8Array', async () => {
		mockGetText.mockResolvedValue(textResult('t'));

		await new N8nPdfLoader(makeBlob('hello')).load();

		expect(mockConstructor).toHaveBeenCalledTimes(1);
		const { data } = mockConstructor.mock.calls[0][0];
		expect(data).toBeInstanceOf(Uint8Array);
		expect(Buffer.from(data).toString('utf-8')).toBe('hello');
	});

	it('calls destroy() exactly once when extraction succeeds', async () => {
		mockGetText.mockResolvedValue(textResult('ok'));

		await new N8nPdfLoader(makeBlob()).load();

		expect(mockDestroy).toHaveBeenCalledTimes(1);
	});

	it('calls destroy() even when getText() throws (worker cleanup on error path)', async () => {
		mockGetText.mockRejectedValue(new Error('Invalid PDF structure'));

		const pending = new N8nPdfLoader(makeBlob()).load();

		await expect(pending).rejects.toThrow('Invalid PDF structure');
		expect(mockDestroy).toHaveBeenCalledTimes(1);
	});

	it('tolerates getInfo() failure, logs at debug, and still returns documents with structural metadata', async () => {
		const infoError = new Error('info unavailable');
		mockGetText.mockResolvedValue(textResult('page'));
		mockGetInfo.mockRejectedValue(infoError);

		const docs = await new N8nPdfLoader(makeBlob()).load();

		expect(docs).toHaveLength(1);
		expect(docs[0].metadata.pdf).toEqual({
			info: undefined,
			metadata: undefined,
			totalPages: 1,
		});
		expect(mockDestroy).toHaveBeenCalledTimes(1);
		expect(mockLoggerDebug).toHaveBeenCalledWith(expect.stringContaining('getInfo() failed'), {
			error: infoError,
		});
	});

	it('preserves the parse error when destroy() rejects on the error path', async () => {
		const parseError = new Error('Invalid PDF structure');
		mockGetText.mockRejectedValue(parseError);
		mockDestroy.mockRejectedValue(new Error('worker hang on shutdown'));

		const pending = new N8nPdfLoader(makeBlob()).load();

		// The caller must see the original parse error; the rejection coming
		// from destroy() is expected to be swallowed.
		await expect(pending).rejects.toBe(parseError);
		expect(mockDestroy).toHaveBeenCalledTimes(1);
	});

	it('passes through source metadata from BufferLoader (blob shape)', async () => {
		mockGetText.mockResolvedValue(textResult('one'));

		const docs = await new N8nPdfLoader(makeBlob()).load();

		// BufferLoader.load() fills in metadata.source = 'blob' and
		// metadata.blobType before delegating to parse(); both have to
		// survive the metadata merge.
		expect(docs[0].metadata).toMatchObject({
			source: 'blob',
			blobType: 'application/pdf',
		});
	});
});

View File

@ -42,8 +42,8 @@ jest.mock('@langchain/community/document_loaders/fs/epub', () => ({
})),
}));
jest.mock('@langchain/community/document_loaders/fs/pdf', () => ({
PDFLoader: jest.fn().mockImplementation(() => ({
jest.mock('src/utils/loaders/n8n-pdf-loader', () => ({
N8nPdfLoader: jest.fn().mockImplementation(() => ({
load: jest.fn().mockResolvedValue([{ pageContent: 'pdf content', metadata: {} }]),
})),
}));

View File

@ -0,0 +1,79 @@
import { BufferLoader } from '@langchain/classic/document_loaders/fs/buffer';
import { Document } from '@langchain/core/documents';
import { LoggerProxy as Logger } from 'n8n-workflow';
import type { PDFParse as PDFParseClass } from 'pdf-parse';
/** Options accepted by the `N8nPdfLoader` constructor. */
export interface N8nPdfLoaderOptions {
/**
 * When `true`, the loader returns one `Document` per PDF page; when `false`,
 * the page texts are joined with a blank line into a single `Document`.
 * The `N8nPdfLoader` constructor treats an omitted value as `true`.
 */
splitPages?: boolean;
}
/**
 * PDF document loader backed by `pdf-parse` v2.
 *
 * Drop-in replacement for `@langchain/community`'s `PDFLoader`, which
 * hardcoded a v1-only deep import. Produces the same `Document[]` shape
 * (per-page `pageContent`, `metadata.loc.pageNumber`, `metadata.pdf.*`)
 * so downstream consumers (vector stores, summarization chains) remain
 * unaffected.
 *
 * Pages that yield no extractable text (blank or image-only pages) do not
 * produce a `Document`: an empty `pageContent` carries nothing to embed and
 * would only inject stray separators into the merged output. LangChain's
 * `PDFLoader` likewise omits pages without text items.
 * `metadata.pdf.totalPages` still reports the page count of the PDF itself.
 */
export class N8nPdfLoader extends BufferLoader {
	private readonly splitPages: boolean;

	/**
	 * @param filePathOrBlob - Path of the PDF on disk, or a Blob holding its bytes.
	 * @param options - Loader options; `splitPages` defaults to `true`.
	 */
	constructor(filePathOrBlob: string | Blob, { splitPages = true }: N8nPdfLoaderOptions = {}) {
		super(filePathOrBlob);
		this.splitPages = splitPages;
	}

	/**
	 * Extracts text from the raw PDF bytes and wraps it in LangChain documents.
	 *
	 * @param raw - The PDF file contents.
	 * @param metadata - Source metadata supplied by `BufferLoader`; copied onto
	 *   every returned document.
	 * @returns One document per text-bearing page when `splitPages` is `true`;
	 *   otherwise a single merged document, or `[]` when no page has text.
	 * @throws Whatever `getText()` rejects with. A `getInfo()` failure is only
	 *   logged at debug level and a `destroy()` failure is ignored.
	 */
	protected async parse(raw: Buffer, metadata: Record<string, unknown>): Promise<Document[]> {
		const { PDFParse } = await import('pdf-parse');
		// Buffer extends Uint8Array; PDFParse accepts it directly.
		const parser: PDFParseClass = new PDFParse({ data: raw });
		try {
			// pageJoiner default ('-- page X of Y --') would pollute the extracted
			// text — disable it so each page's `text` is purely its content.
			const result = await parser.getText({ pageJoiner: '' });

			// Document info is optional metadata: degrade gracefully if the
			// parser cannot provide it rather than failing the whole load.
			const info = await parser.getInfo().catch((error: unknown) => {
				Logger.debug('N8nPdfLoader: getInfo() failed; continuing without pdf.info metadata', {
					error,
				});
				return null;
			});
			const pdfMeta = {
				info: info?.info,
				metadata: info?.metadata,
				totalPages: result.total,
			};

			// Drop pages without any extractable text so downstream embedding
			// never receives empty content and the merged text has no empty
			// segments between separators.
			const pagesWithText = result.pages.filter((page) => page.text.trim().length > 0);

			if (this.splitPages) {
				return pagesWithText.map(
					(page) =>
						new Document({
							pageContent: page.text,
							metadata: {
								...metadata,
								pdf: pdfMeta,
								loc: { pageNumber: page.num },
							},
						}),
				);
			}

			if (pagesWithText.length === 0) return [];

			// Merged output deliberately carries no `loc.pageNumber`.
			return [
				new Document({
					pageContent: pagesWithText.map((page) => page.text).join('\n\n'),
					metadata: {
						...metadata,
						pdf: pdfMeta,
					},
				}),
			];
		} finally {
			// Best-effort cleanup — never let destroy() shadow a parse error.
			await parser.destroy().catch(() => undefined);
		}
	}
}

View File

@ -3,7 +3,6 @@ import { TextLoader } from '@langchain/classic/document_loaders/fs/text';
import { CSVLoader } from '@langchain/community/document_loaders/fs/csv';
import { DocxLoader } from '@langchain/community/document_loaders/fs/docx';
import { EPubLoader } from '@langchain/community/document_loaders/fs/epub';
import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf';
import type { Document } from '@langchain/core/documents';
import type { TextSplitter } from '@langchain/textsplitters';
import { createWriteStream } from 'fs';
@ -18,6 +17,7 @@ import { pipeline } from 'stream/promises';
import { file as tmpFile, type DirectoryResult } from 'tmp-promise';
import { getMetadataFiltersValues } from './helpers';
import { N8nPdfLoader } from './loaders/n8n-pdf-loader';
const SUPPORTED_MIME_TYPES = {
auto: ['*/*'],
@ -105,7 +105,7 @@ export class N8nBinaryLoader {
mimeType: string,
filePathOrBlob: string | Blob,
itemIndex: number,
): Promise<PDFLoader | CSVLoader | EPubLoader | DocxLoader | TextLoader | JSONLoader> {
): Promise<N8nPdfLoader | CSVLoader | EPubLoader | DocxLoader | TextLoader | JSONLoader> {
switch (mimeType) {
case 'application/pdf':
const splitPages = this.context.getNodeParameter(
@ -113,7 +113,7 @@ export class N8nBinaryLoader {
itemIndex,
false,
) as boolean;
return new PDFLoader(filePathOrBlob, { splitPages });
return new N8nPdfLoader(filePathOrBlob, { splitPages });
case 'text/csv':
const column = this.context.getNodeParameter(
`${this.optionsPrefix}column`,
@ -156,7 +156,7 @@ export class N8nBinaryLoader {
}
private async loadDocuments(
loader: PDFLoader | CSVLoader | EPubLoader | DocxLoader | TextLoader | JSONLoader,
loader: N8nPdfLoader | CSVLoader | EPubLoader | DocxLoader | TextLoader | JSONLoader,
): Promise<Document[]> {
return this.textSplitter
? await this.textSplitter.splitDocuments(await loader.load())

View File

@ -19,7 +19,6 @@ import {
// import 'd3-dsv'; // for csv
import 'mammoth'; // for docx
import 'epub2'; // for epub
import 'pdf-parse'; // for pdf
export class DocumentBinaryInputLoader implements INodeType {
description: INodeTypeDescription = {

View File

@ -16,7 +16,6 @@ import { logWrapper, N8nBinaryLoader, N8nJsonLoader, metadataFilterField } from
// import 'd3-dsv'; // for csv
import 'mammoth'; // for docx
import 'epub2'; // for epub
import 'pdf-parse'; // for pdf
/* istanbul ignore next */
function getInputs(parameters: IDataObject) {

View File

@ -2,8 +2,8 @@ import type { Document } from '@langchain/core/documents';
/**
* Metadata fields stored in the vector store per document.
* Includes 'loc' prefix to capture flattened LangChain PDF page location fields
* (e.g. "loc.pageNumber", "loc.lines.from") added by PDFLoader when splitPages is enabled.
* Includes 'loc' prefix to capture flattened PDF page location fields
* (e.g. "loc.pageNumber", "loc.lines.from") added by N8nPdfLoader when splitPages is enabled.
*/
const CHAT_HUB_INSERT_METADATA_KEYS = new Set(['loc', 'fileName', 'agentId', 'fileKnowledgeId']);

View File

@ -296,7 +296,6 @@
"n8n-nodes-base": "workspace:*",
"n8n-workflow": "workspace:*",
"openai": "^6.34.0",
"pdf-parse": "1.1.1",
"pg": "catalog:",
"redis": "4.6.14",
"sanitize-html": "2.12.1",

View File

@ -0,0 +1,37 @@
import { test } from '../../../fixtures/base';
/**
* Regression guard for AI-2505: PDF embedding via Default Data Loader →
* In-Memory Vector Store insert was throwing
* "Failed to load pdf-parse. This loader currently supports pdf-parse v1 only…"
* because @langchain/community's PDFLoader resolved pdf-parse@2 in the
* @n8n/ai-utilities install context.
*
* The fix replaces LangChain's PDFLoader with N8nPdfLoader (pdf-parse@2 backed).
* This test exercises the end-to-end path on the real n8n runtime, using
* FakeEmbeddings so the workflow can complete without external API keys.
*/
test.describe(
	'AI-2505 — PDF embed regression',
	{ annotation: [{ type: 'owner', description: 'AI' }] },
	() => {
		// Fixture workflow and the node the partial execution runs up to.
		const workflowFile = 'AI-2505_pdf_embed_fake_embeddings.json';
		const vectorStoreNode = 'Populate VS';
		// Toast shown on a successful run, and how long to wait for it.
		const successToast = 'Workflow executed successfully';
		const successTimeoutMs = 30000;

		test('embeds a PDF through Default Data Loader → In-Memory Vector Store without the pdf-parse v1 error', async ({
			n8n,
		}) => {
			await n8n.start.fromImportedWorkflow(workflowFile);
			await n8n.canvas.clickZoomToFitButton();
			await n8n.canvas.deselectAll();

			// Run only up to (and including) the Vector Store node — the same
			// pattern langchain-vectorstores.spec.ts uses — so the test does not
			// depend on full-workflow trigger plumbing.
			await n8n.canvas.executeNode(vectorStoreNode);

			// When PDF parsing fails (the AI-2505 regression) the success
			// notification never shows up and this wait times out.
			await n8n.notifications.waitForNotificationAndClose(successToast, {
				timeout: successTimeoutMs,
			});
		});
	},
);

View File

@ -0,0 +1,102 @@
{
"name": "AI-2505 PDF embed (fake embeddings)",
"nodes": [
{
"parameters": {},
"id": "f3f2c2a0-2505-4000-8000-000000000001",
"name": "Manual Trigger",
"type": "n8n-nodes-base.manualTrigger",
"typeVersion": 1,
"position": [240, 300]
},
{
"parameters": {
"jsCode": "// Minimal valid PDF with extractable text 'Hello World'\nconst pdfBase64 = 'JVBERi0xLjQKMSAwIG9iago8PCAvVHlwZSAvQ2F0YWxvZyAvUGFnZXMgMiAwIFIgPj4KZW5kb2JqCjIgMCBvYmoKPDwgL1R5cGUgL1BhZ2VzIC9LaWRzIFszIDAgUl0gL0NvdW50IDEgPj4KZW5kb2JqCjMgMCBvYmoKPDwgL1R5cGUgL1BhZ2UgL1BhcmVudCAyIDAgUiAvTWVkaWFCb3ggWzAgMCA2MTIgNzkyXSAvQ29udGVudHMgNCAwIFIgL1Jlc291cmNlcyA8PCAvRm9udCA8PCAvRjEgNSAwIFIgPj4gPj4gPj4KZW5kb2JqCjQgMCBvYmoKPDwgL0xlbmd0aCA0NCA+PgpzdHJlYW0KQlQKL0YxIDEyIFRmCjEwMCA3MDAgVGQKKEhlbGxvIFdvcmxkKSBUagpFVAplbmRzdHJlYW0KZW5kb2JqCjUgMCBvYmoKPDwgL1R5cGUgL0ZvbnQgL1N1YnR5cGUgL1R5cGUxIC9CYXNlRm9udCAvSGVsdmV0aWNhID4+CmVuZG9iagp4cmVmCjAgNgowMDAwMDAwMDAwIDY1NTM1IGYgCjAwMDAwMDAwMDkgMDAwMDAgbiAKMDAwMDAwMDA1OCAwMDAwMCBuIAowMDAwMDAwMTE1IDAwMDAwIG4gCjAwMDAwMDAyNjYgMDAwMDAgbiAKMDAwMDAwMDM1OSAwMDAwMCBuIAp0cmFpbGVyCjw8IC9TaXplIDYgL1Jvb3QgMSAwIFIgPj4Kc3RhcnR4cmVmCjQ0MQolJUVPRgo=';\n\nconst pdfBuffer = Buffer.from(pdfBase64, 'base64');\nconst data = await this.helpers.prepareBinaryData(pdfBuffer, 'sample.pdf', 'application/pdf');\n\nreturn [{ json: {}, binary: { data } }];"
},
"id": "f3f2c2a0-2505-4000-8000-000000000002",
"name": "Create PDF",
"type": "n8n-nodes-base.code",
"typeVersion": 2,
"position": [460, 300]
},
{
"parameters": {
"mode": "insert",
"memoryKey": "ai_2505_pdf"
},
"id": "f3f2c2a0-2505-4000-8000-000000000003",
"name": "Populate VS",
"type": "@n8n/n8n-nodes-langchain.vectorStoreInMemory",
"typeVersion": 1,
"position": [800, 300]
},
{
"parameters": {
"dataType": "binary",
"binaryMode": "allInputData",
"loader": "pdfLoader",
"binaryDataKey": "data",
"options": {}
},
"id": "f3f2c2a0-2505-4000-8000-000000000004",
"name": "Default Data Loader",
"type": "@n8n/n8n-nodes-langchain.documentDefaultDataLoader",
"typeVersion": 1,
"position": [820, 520]
},
{
"parameters": {
"chunkSize": 1000,
"chunkOverlap": 200
},
"id": "f3f2c2a0-2505-4000-8000-000000000006",
"name": "Character Text Splitter",
"type": "@n8n/n8n-nodes-langchain.textSplitterCharacterTextSplitter",
"typeVersion": 1,
"position": [820, 720]
},
{
"parameters": {
"code": {
"supplyData": {
"code": "const { FakeEmbeddings } = require('@langchain/core/utils/testing');\n\nreturn new FakeEmbeddings();"
}
},
"outputs": {
"output": [{ "type": "ai_embedding" }]
}
},
"id": "f3f2c2a0-2505-4000-8000-000000000005",
"name": "Fake Embeddings",
"type": "@n8n/n8n-nodes-langchain.code",
"typeVersion": 1,
"position": [580, 520]
}
],
"pinData": {},
"connections": {
"Manual Trigger": {
"main": [[{ "node": "Create PDF", "type": "main", "index": 0 }]]
},
"Create PDF": {
"main": [[{ "node": "Populate VS", "type": "main", "index": 0 }]]
},
"Default Data Loader": {
"ai_document": [[{ "node": "Populate VS", "type": "ai_document", "index": 0 }]]
},
"Character Text Splitter": {
"ai_textSplitter": [
[{ "node": "Default Data Loader", "type": "ai_textSplitter", "index": 0 }]
]
},
"Fake Embeddings": {
"ai_embedding": [[{ "node": "Populate VS", "type": "ai_embedding", "index": 0 }]]
}
},
"active": false,
"settings": { "executionOrder": "v1" },
"versionId": "ai-2505-pdf-embed-v2",
"meta": { "instanceId": "ai-2505-regression-fixture" },
"id": "AI2505PDFEmbed",
"tags": []
}

View File

@ -426,6 +426,7 @@ overrides:
date-fns: 2.30.0
date-fns-tz: 2.0.0
form-data: 4.0.4
pdf-parse: ^2.4.5
tmp: 0.2.4
nodemailer: 7.0.11
validator: 13.15.26
@ -750,6 +751,9 @@ importers:
langchain:
specifier: 'catalog:'
version: 1.2.30(@langchain/core@1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)))(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(react-dom@18.2.0(react@18.2.0))(react@18.2.0)(vue@3.5.26(typescript@6.0.2))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod-to-json-schema@3.23.3(zod@3.25.67))
pdf-parse:
specifier: ^2.4.5
version: 2.4.5
proxy-from-env:
specifier: ^1.1.0
version: 1.1.0
@ -1818,7 +1822,7 @@ importers:
specifier: ^3.1.0
version: 3.1.0
pdf-parse:
specifier: 2.4.5
specifier: ^2.4.5
version: 2.4.5
psl:
specifier: 1.9.0
@ -2133,7 +2137,7 @@ importers:
version: 1.0.1(@langchain/core@1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)))(encoding@0.1.13)
'@langchain/community':
specifier: 'catalog:'
version: 1.1.27(5c482d49b34634b95a0a4ffed759746c)
version: 1.1.27(e4c56db3f35187250d946cc3182ca959)
'@langchain/core':
specifier: 'catalog:'
version: 1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
@ -2305,9 +2309,6 @@ importers:
openai:
specifier: ^6.34.0
version: 6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67)
pdf-parse:
specifier: 1.1.1
version: 1.1.1
pg:
specifier: 'catalog:'
version: 8.17.0
@ -7397,7 +7398,7 @@ packages:
notion-to-md: ^3.1.0
officeparser: ^6.0.4
openai: '*'
pdf-parse: 2.4.5
pdf-parse: ^2.4.5
pg: ^8.11.0
pg-copy-streams: ^7.0.0
pickleparser: ^0.2.1
@ -16917,9 +16918,6 @@ packages:
engines: {node: '>=10.5.0'}
deprecated: Use your platform's native DOMException instead
node-ensure@0.0.0:
resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==}
node-fetch-h2@2.3.0:
resolution: {integrity: sha512-ofRW94Ab0T4AOh5Fk8t0h8OBWrmjb0SSB20xh1H8YnPV9EJ+f5AMoYSUQ2zgJ4Iq2HAK0I2l5/Nequ8YzFS3Hg==}
engines: {node: 4.x || >=6.0.0}
@ -17588,10 +17586,6 @@ packages:
resolution: {integrity: sha512-wfRLBZ0feWRhCIkoMB6ete7czJcnNnqRpcoWQBLqatqXXmelSRqfdDK4F3u9T2s2cXas/hQJcryI/4lAL+XTlA==}
engines: {node: '>=0.12'}
pdf-parse@1.1.1:
resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==}
engines: {node: '>=6.8.1'}
pdf-parse@2.4.5:
resolution: {integrity: sha512-mHU89HGh7v+4u2ubfnevJ03lmPgQ5WU4CxAVmTSh/sxVTEDYd1er/dKS/A6vg77NX47KTEoihq8jZBLr8Cxuwg==}
engines: {node: '>=20.16.0 <21 || >=22.3.0'}
@ -25014,7 +25008,58 @@ snapshots:
- aws-crt
- encoding
'@langchain/community@1.1.27(5c482d49b34634b95a0a4ffed759746c)':
'@langchain/community@1.1.27(9652038fd19705174b61722d4a08ab72)':
dependencies:
'@browserbasehq/stagehand': 1.14.0(@playwright/test@1.60.0)(bufferutil@4.0.9)(deepmerge@4.3.1)(dotenv@17.4.2)(encoding@0.1.13)(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(utf-8-validate@5.0.10)(zod@3.25.67)
'@ibm-cloud/watsonx-ai': 1.1.2
'@langchain/classic': 1.0.27(@langchain/core@1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)))(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(cheerio@1.0.0)(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
'@langchain/core': 1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
'@langchain/openai': 1.4.1(@langchain/core@1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
binary-extensions: 2.2.0
flat: 5.0.2
ibm-cloud-sdk-core: 5.3.2
js-yaml: 4.1.1
langsmith: 0.6.0(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
math-expression-evaluator: 2.0.7
openai: 6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67)
uuid: 13.0.1
zod: 3.25.67
optionalDependencies:
'@aws-crypto/sha256-js': 5.2.0
'@aws-sdk/credential-provider-node': 3.936.0
'@browserbasehq/sdk': 2.6.0(encoding@0.1.13)
'@getzep/zep-cloud': 1.0.6(d39584e65f36d18e3d2ee3120c56d312)
'@google-cloud/storage': 7.12.1(encoding@0.1.13)
'@libsql/client': 0.17.2(bufferutil@4.0.9)(encoding@0.1.13)(utf-8-validate@5.0.10)
'@mozilla/readability': 0.6.0
'@smithy/protocol-http': 5.3.12
'@smithy/util-utf8': 4.2.2
'@supabase/supabase-js': 2.50.0(bufferutil@4.0.9)(utf-8-validate@5.0.10)
'@zilliz/milvus2-sdk-node': 2.5.7
chromadb: 3.2.0
crypto-js: 4.2.0
epub2: 3.0.2(ts-toolbelt@9.6.0)
fast-xml-parser: 5.7.2
google-auth-library: 10.1.0
html-to-text: 9.0.5
ignore: 7.0.5
ioredis: 5.3.2
jsdom: 23.0.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)
jsonwebtoken: 9.0.3
lodash: 4.18.1
mammoth: 1.12.0
pdf-parse: 2.4.5
pg: 8.17.0
playwright: 1.60.0
puppeteer: 24.41.0(bufferutil@4.0.9)(typescript@6.0.2)(utf-8-validate@5.0.10)
ws: 8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)
transitivePeerDependencies:
- '@opentelemetry/api'
- '@opentelemetry/exporter-trace-otlp-proto'
- '@opentelemetry/sdk-trace-base'
- peggy
'@langchain/community@1.1.27(e4c56db3f35187250d946cc3182ca959)':
dependencies:
'@browserbasehq/stagehand': 1.14.0(@playwright/test@1.60.0)(bufferutil@4.0.9)(deepmerge@4.3.1)(dotenv@17.4.2)(encoding@0.1.13)(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(utf-8-validate@5.0.10)(zod@3.25.67)
'@ibm-cloud/watsonx-ai': 1.1.2
@ -25065,7 +25110,7 @@ snapshots:
mammoth: 1.12.0
mongodb: 6.21.0(@aws-sdk/credential-providers@3.808.0)(gcp-metadata@5.3.0)(socks@2.8.3)
mysql2: 3.17.0
pdf-parse: 1.1.1
pdf-parse: 2.4.5
pg: 8.17.0
playwright: 1.60.0
puppeteer: 24.41.0(bufferutil@4.0.9)(typescript@6.0.2)(utf-8-validate@5.0.10)
@ -25078,57 +25123,6 @@ snapshots:
- '@opentelemetry/sdk-trace-base'
- peggy
'@langchain/community@1.1.27(9652038fd19705174b61722d4a08ab72)':
dependencies:
'@browserbasehq/stagehand': 1.14.0(@playwright/test@1.60.0)(bufferutil@4.0.9)(deepmerge@4.3.1)(dotenv@17.4.2)(encoding@0.1.13)(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(utf-8-validate@5.0.10)(zod@3.25.67)
'@ibm-cloud/watsonx-ai': 1.1.2
'@langchain/classic': 1.0.27(@langchain/core@1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)))(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(cheerio@1.0.0)(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
'@langchain/core': 1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
'@langchain/openai': 1.4.1(@langchain/core@1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
binary-extensions: 2.2.0
flat: 5.0.2
ibm-cloud-sdk-core: 5.3.2
js-yaml: 4.1.1
langsmith: 0.6.0(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))
math-expression-evaluator: 2.0.7
openai: 6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67)
uuid: 13.0.1
zod: 3.25.67
optionalDependencies:
'@aws-crypto/sha256-js': 5.2.0
'@aws-sdk/credential-provider-node': 3.936.0
'@browserbasehq/sdk': 2.6.0(encoding@0.1.13)
'@getzep/zep-cloud': 1.0.6(d39584e65f36d18e3d2ee3120c56d312)
'@google-cloud/storage': 7.12.1(encoding@0.1.13)
'@libsql/client': 0.17.2(bufferutil@4.0.9)(encoding@0.1.13)(utf-8-validate@5.0.10)
'@mozilla/readability': 0.6.0
'@smithy/protocol-http': 5.3.12
'@smithy/util-utf8': 4.2.2
'@supabase/supabase-js': 2.50.0(bufferutil@4.0.9)(utf-8-validate@5.0.10)
'@zilliz/milvus2-sdk-node': 2.5.7
chromadb: 3.2.0
crypto-js: 4.2.0
epub2: 3.0.2(ts-toolbelt@9.6.0)
fast-xml-parser: 5.7.2
google-auth-library: 10.1.0
html-to-text: 9.0.5
ignore: 7.0.5
ioredis: 5.3.2
jsdom: 23.0.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)
jsonwebtoken: 9.0.3
lodash: 4.18.1
mammoth: 1.12.0
pdf-parse: 2.4.5
pg: 8.17.0
playwright: 1.60.0
puppeteer: 24.41.0(bufferutil@4.0.9)(typescript@6.0.2)(utf-8-validate@5.0.10)
ws: 8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)
transitivePeerDependencies:
- '@opentelemetry/api'
- '@opentelemetry/exporter-trace-otlp-proto'
- '@opentelemetry/sdk-trace-base'
- peggy
'@langchain/core@1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))':
dependencies:
'@cfworker/json-schema': 4.1.0
@ -36370,8 +36364,6 @@ snapshots:
node-domexception@1.0.0: {}
node-ensure@0.0.0: {}
node-fetch-h2@2.3.0:
dependencies:
http2-client: 1.3.5
@ -37154,13 +37146,6 @@ snapshots:
sha.js: 2.4.12
to-buffer: 1.2.1
pdf-parse@1.1.1:
dependencies:
debug: 3.2.7(supports-color@5.5.0)
node-ensure: 0.0.0
transitivePeerDependencies:
- supports-color
pdf-parse@2.4.5:
dependencies:
'@napi-rs/canvas': 0.1.80