diff --git a/package.json b/package.json index 590c6776ab0..ab78f636265 100644 --- a/package.json +++ b/package.json @@ -125,7 +125,7 @@ "date-fns": "2.30.0", "date-fns-tz": "2.0.0", "form-data": "4.0.4", - "pdf-parse": "^2.4.5", + "pdf-parse": "catalog:", "tmp": "0.2.4", "nodemailer": "7.0.11", "validator": "13.15.26", diff --git a/packages/@n8n/ai-utilities/package.json b/packages/@n8n/ai-utilities/package.json index bacb05ad4e2..2b7db4abcf8 100644 --- a/packages/@n8n/ai-utilities/package.json +++ b/packages/@n8n/ai-utilities/package.json @@ -98,7 +98,7 @@ "tmp-promise": "3.0.3", "js-tiktoken": "catalog:", "https-proxy-agent": "catalog:", - "pdf-parse": "2.4.5", + "pdf-parse": "catalog:", "proxy-from-env": "^1.1.0", "undici": "^6.21.0" }, diff --git a/packages/@n8n/api-types/src/agents/agent-files.constants.ts b/packages/@n8n/api-types/src/agents/agent-files.constants.ts new file mode 100644 index 00000000000..8a649b3ec82 --- /dev/null +++ b/packages/@n8n/api-types/src/agents/agent-files.constants.ts @@ -0,0 +1,4 @@ +export const MAX_AGENT_FILE_SIZE_MB = 50; +export const MAX_AGENT_FILE_SIZE_BYTES = MAX_AGENT_FILE_SIZE_MB * 1024 * 1024; +export const MAX_AGENT_FILES_PER_UPLOAD = 10; +export const ALLOWED_AGENT_FILE_EXTENSIONS = ['.csv', '.md', '.markdown', '.pdf', '.txt'] as const; diff --git a/packages/@n8n/api-types/src/agents/index.ts b/packages/@n8n/api-types/src/agents/index.ts index 8b44a675e4e..59ebd578eec 100644 --- a/packages/@n8n/api-types/src/agents/index.ts +++ b/packages/@n8n/api-types/src/agents/index.ts @@ -1,3 +1,4 @@ +export * from './agent-files.constants'; export * from './agent-integration.schema'; export * from './agent-json-config.schema'; export * from './dto'; diff --git a/packages/@n8n/api-types/src/agents/types.ts b/packages/@n8n/api-types/src/agents/types.ts index 347a99f0eb2..5bb3617b655 100644 --- a/packages/@n8n/api-types/src/agents/types.ts +++ b/packages/@n8n/api-types/src/agents/types.ts @@ -121,6 +121,15 @@ export interface 
AgentVersionDto { author: string; } +export interface AgentFileDto { + id: string; + agentId: string; + fileName: string; + mimeType: string; + fileSizeBytes: number; + createdAt: string; +} + export interface AgentVersionListItemDto { versionId: string; agentId: string; diff --git a/packages/@n8n/config/src/configs/agents.config.ts b/packages/@n8n/config/src/configs/agents.config.ts index 7212c7c42f0..1daa5aa0760 100644 --- a/packages/@n8n/config/src/configs/agents.config.ts +++ b/packages/@n8n/config/src/configs/agents.config.ts @@ -6,7 +6,7 @@ import { Config, Env } from '../decorators'; * `N8N_AGENTS_MODULES`. The backend fails fast on unknown tokens so typos * surface at startup instead of silently disabling a feature. */ -export const AGENTS_MODULE_NAMES = ['node-tools-searcher'] as const; +export const AGENTS_MODULE_NAMES = ['node-tools-searcher', 'knowledge-base'] as const; export type AgentsModuleName = (typeof AGENTS_MODULE_NAMES)[number]; @@ -36,6 +36,9 @@ export class AgentsConfig { * Currently known: * - `node-tools-searcher` — surfaces the "Built-in node tools" toggle in * the agent editor. + * - `knowledge-base` — enables the agent knowledge base: file upload/list/ + * delete endpoints, the files panel in the editor, and the + * `search_knowledge` runtime tool. * * Gates the UI surface only — existing agents persisted with a given * capability turned on continue to run even if its token is removed here. 
diff --git a/packages/@n8n/db/src/entities/binary-data-file.ts b/packages/@n8n/db/src/entities/binary-data-file.ts index 06e535379d3..1d0393b5a6b 100644 --- a/packages/@n8n/db/src/entities/binary-data-file.ts +++ b/packages/@n8n/db/src/entities/binary-data-file.ts @@ -3,7 +3,7 @@ import { z } from 'zod'; import { BinaryColumn, WithTimestamps } from './abstract-entity'; -export const SourceTypeSchema = z.enum(['execution', 'chat_message_attachment']); +export const SourceTypeSchema = z.enum(['execution', 'chat_message_attachment', 'agent_file']); export type SourceType = z.infer; diff --git a/packages/@n8n/db/src/migrations/common/1784000000018-CreateAgentFilesTable.ts b/packages/@n8n/db/src/migrations/common/1784000000018-CreateAgentFilesTable.ts new file mode 100644 index 00000000000..250b6c3a3ac --- /dev/null +++ b/packages/@n8n/db/src/migrations/common/1784000000018-CreateAgentFilesTable.ts @@ -0,0 +1,52 @@ +import type { MigrationContext, ReversibleMigration } from '../migration-types'; + +const binaryDataTableName = 'binary_data'; +const sourceTypeColumn = 'sourceType'; +const sourceTypesBefore = ['execution', 'chat_message_attachment']; +const sourceTypesAfter = [...sourceTypesBefore, 'agent_file']; + +export class CreateAgentFilesTable1784000000018 implements ReversibleMigration { + async up(ctx: MigrationContext) { + const { createTable, column } = ctx.schemaBuilder; + + await createTable('agent_files') + .withColumns( + column('id').varchar(16).primary.comment('Application-generated n8n nano ID'), + // FK to agents.id, which is declared varchar(36); the column type + // mirrors the referenced primary key. + column('agentId') + .varchar(36) + .notNull.comment('Agent that owns this uploaded file'), + column('binaryDataId').text.notNull.comment( + 'Opaque BinaryDataService reference (mode-prefixed, e.g. 
"filesystem-v2:"); not an FK to binary_data, which only has rows in DB storage mode', + ), + column('fileName').varchar(255).notNull, + column('mimeType').varchar(255).notNull, + column('fileSizeBytes').int.notNull.comment('Uploaded file size in bytes'), + ) + .withIndexOn(['agentId', 'createdAt']) + .withForeignKey('agentId', { + tableName: 'agents', + columnName: 'id', + onDelete: 'CASCADE', + }).withTimestamps; + + await this.replaceSourceTypeCheck(ctx, sourceTypesAfter); + } + + async down(ctx: MigrationContext) { + await ctx.runQuery( + `DELETE FROM ${ctx.escape.tableName(binaryDataTableName)} WHERE ${ctx.escape.columnName(sourceTypeColumn)} = 'agent_file'`, + ); + await this.replaceSourceTypeCheck(ctx, sourceTypesBefore); + await ctx.schemaBuilder.dropTable('agent_files'); + } + + private async replaceSourceTypeCheck( + { schemaBuilder: { addEnumCheck, dropEnumCheck } }: MigrationContext, + sourceTypes: string[], + ) { + await dropEnumCheck(binaryDataTableName, sourceTypeColumn); + await addEnumCheck(binaryDataTableName, sourceTypeColumn, sourceTypes); + } +} diff --git a/packages/@n8n/db/src/migrations/postgresdb/index.ts b/packages/@n8n/db/src/migrations/postgresdb/index.ts index 9b5dbe6799a..a0940659fb6 100644 --- a/packages/@n8n/db/src/migrations/postgresdb/index.ts +++ b/packages/@n8n/db/src/migrations/postgresdb/index.ts @@ -193,6 +193,7 @@ import { PersistInstanceAiPendingConfirmations1784000000014 } from '../common/17 import { AddSourceWorkflowIdToWorkflow1784000000015 } from '../common/1784000000015-AddSourceWorkflowIdToWorkflow'; import { UseSlugAsPrimaryKeyInMcpRegistryServer1784000000016 } from '../common/1784000000016-UseSlugAsPrimaryKeyInMcpRegistryServer'; import { AddLastUsedAtToApiKey1784000000017 } from '../common/1784000000017-AddLastUsedAtToApiKey'; +import { CreateAgentFilesTable1784000000018 } from '../common/1784000000018-CreateAgentFilesTable'; import type { Migration } from '../migration-types'; export const postgresMigrations: 
Migration[] = [ @@ -391,4 +392,5 @@ export const postgresMigrations: Migration[] = [ AddSourceWorkflowIdToWorkflow1784000000015, UseSlugAsPrimaryKeyInMcpRegistryServer1784000000016, AddLastUsedAtToApiKey1784000000017, + CreateAgentFilesTable1784000000018, ]; diff --git a/packages/@n8n/db/src/migrations/sqlite/index.ts b/packages/@n8n/db/src/migrations/sqlite/index.ts index 436d7c8aa98..d16b13fbfd2 100644 --- a/packages/@n8n/db/src/migrations/sqlite/index.ts +++ b/packages/@n8n/db/src/migrations/sqlite/index.ts @@ -186,6 +186,7 @@ import { PersistInstanceAiPendingConfirmations1784000000014 } from '../common/17 import { AddSourceWorkflowIdToWorkflow1784000000015 } from '../common/1784000000015-AddSourceWorkflowIdToWorkflow'; import { UseSlugAsPrimaryKeyInMcpRegistryServer1784000000016 } from '../common/1784000000016-UseSlugAsPrimaryKeyInMcpRegistryServer'; import { AddLastUsedAtToApiKey1784000000017 } from '../common/1784000000017-AddLastUsedAtToApiKey'; +import { CreateAgentFilesTable1784000000018 } from '../common/1784000000018-CreateAgentFilesTable'; import type { Migration } from '../migration-types'; const sqliteMigrations: Migration[] = [ @@ -377,6 +378,7 @@ const sqliteMigrations: Migration[] = [ AddSourceWorkflowIdToWorkflow1784000000015, UseSlugAsPrimaryKeyInMcpRegistryServer1784000000016, AddLastUsedAtToApiKey1784000000017, + CreateAgentFilesTable1784000000018, ]; export { sqliteMigrations }; diff --git a/packages/@n8n/instance-ai/package.json b/packages/@n8n/instance-ai/package.json index 490e55f6e2c..b94f9f5ec6f 100644 --- a/packages/@n8n/instance-ai/package.json +++ b/packages/@n8n/instance-ai/package.json @@ -69,7 +69,7 @@ "n8n-workflow": "workspace:*", "nanoid": "catalog:", "p-limit": "^3.1.0", - "pdf-parse": "2.4.5", + "pdf-parse": "catalog:", "psl": "1.9.0", "turndown": "^7.2.0", "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.2/xlsx-0.20.2.tgz", diff --git a/packages/cli/package.json b/packages/cli/package.json index af3c46b6d0f..7f69326be9c 100644 --- 
a/packages/cli/package.json +++ b/packages/cli/package.json @@ -170,6 +170,7 @@ "express-rate-limit": "7.5.0", "fast-glob": "catalog:", "fast-json-patch": "catalog:", + "fastest-levenshtein": "catalog:", "flat": "5.0.2", "flatted": "catalog:", "formidable": "3.5.4", @@ -201,6 +202,7 @@ "p-cancelable": "2.1.1", "p-lazy": "3.1.0", "p-limit": "^3.1.0", + "pdf-parse": "catalog:", "pg": "catalog:", "picocolors": "catalog:", "pkce-challenge": "5.0.0", diff --git a/packages/cli/src/modules/agents/__tests__/agent-knowledge-command.service.test.ts b/packages/cli/src/modules/agents/__tests__/agent-knowledge-command.service.test.ts new file mode 100644 index 00000000000..95915417e37 --- /dev/null +++ b/packages/cli/src/modules/agents/__tests__/agent-knowledge-command.service.test.ts @@ -0,0 +1,105 @@ +import { mkdtemp, rm, symlink, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; + +import { AgentKnowledgeCommandService } from '../agent-knowledge-command.service'; + +jest.unmock('node:fs/promises'); + +async function withTempWorkspace(operation: (workspaceRoot: string) => Promise) { + const workspaceRoot = await mkdtemp(path.join(tmpdir(), 'n8n-agent-knowledge-test-')); + try { + await operation(workspaceRoot); + } finally { + await rm(workspaceRoot, { recursive: true, force: true }); + } +} + +describe('AgentKnowledgeCommandService', () => { + let service: AgentKnowledgeCommandService; + + beforeEach(() => { + service = new AgentKnowledgeCommandService(); + }); + + it('searches text files with git grep', async () => { + await withTempWorkspace(async (workspaceRoot) => { + await writeFile(path.join(workspaceRoot, 'notes.txt'), 'alpha\nneedle\nomega\n'); + + const result = await service.run(workspaceRoot, { + command: 'git_grep', + pattern: 'needle', + fixedStrings: true, + }); + + expect(result.exitCode).toBe(0); + expect(result.stdout).toContain('notes.txt:2:needle'); + expect(result.truncated).toBe(false); + }); + }); + 
it('truncates command output to the byte budget for non-ASCII content', async () => { + await withTempWorkspace(async (workspaceRoot) => { + await writeFile(path.join(workspaceRoot, 'notes.txt'), 'é'.repeat(40_000)); + + const result = await service.run(workspaceRoot, { + command: 'cat', + file: 'notes.txt', + }); + + expect(result.truncated).toBe(true); + expect(Buffer.byteLength(result.stdout)).toBeLessThanOrEqual(64 * 1024); + }); + }); + + it('rejects parent path traversal and symlink escapes', async () => { + await withTempWorkspace(async (workspaceRoot) => { + const outsideDirectory = await mkdtemp(path.join(tmpdir(), 'n8n-agent-knowledge-outside-')); + try { + await writeFile(path.join(outsideDirectory, 'secret.txt'), 'secret\n'); + await symlink( + path.join(outsideDirectory, 'secret.txt'), + path.join(workspaceRoot, 'secret-link'), + ); + + await expect( + service.run(workspaceRoot, { command: 'cat', file: '../secret.txt' }), + ).rejects.toThrow('Parent path segments are not allowed'); + await expect( + service.run(workspaceRoot, { command: 'cat', file: 'secret-link' }), + ).rejects.toThrow('Path escapes the knowledge workspace'); + } finally { + await rm(outsideDirectory, { recursive: true, force: true }); + } + }); + }); + + it('rejects absolute paths and control characters', async () => { + await withTempWorkspace(async (workspaceRoot) => { + await expect( + service.run(workspaceRoot, { command: 'cat', file: '/etc/passwd' }), + ).rejects.toThrow('Absolute paths are not allowed'); + await expect( + service.run(workspaceRoot, { command: 'cat', file: 'notes\u0000.txt' }), + ).rejects.toThrow('Invalid path'); + }); + }); + + it('reuses a cached workspace for the same key and re-materializes for a new key', async () => { + let materializeCount = 0; + const materialize = async (root: string) => { + materializeCount++; + await writeFile(path.join(root, 'notes.txt'), 'needle\n'); + }; + const operation = async (root: string) => + await service.run(root, { 
command: 'git_grep', pattern: 'needle', fixedStrings: true }); + + const first = await service.withCachedWorkspace('key-a', materialize, operation); + const second = await service.withCachedWorkspace('key-a', materialize, operation); + expect(first.exitCode).toBe(0); + expect(second.exitCode).toBe(0); + expect(materializeCount).toBe(1); + + await service.withCachedWorkspace('key-b', materialize, operation); + expect(materializeCount).toBe(2); + }); +}); diff --git a/packages/cli/src/modules/agents/__tests__/agent-knowledge.service.test.ts b/packages/cli/src/modules/agents/__tests__/agent-knowledge.service.test.ts new file mode 100644 index 00000000000..3be823fc69d --- /dev/null +++ b/packages/cli/src/modules/agents/__tests__/agent-knowledge.service.test.ts @@ -0,0 +1,478 @@ +import type { BinaryDataService } from 'n8n-core'; +import { generateNanoId } from '@n8n/utils'; +import { mock } from 'jest-mock-extended'; +import { access, mkdtemp, readFile, rm, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; +import { Readable } from 'node:stream'; + +import { BadRequestError } from '@/errors/response-errors/bad-request.error'; +import { NotFoundError } from '@/errors/response-errors/not-found.error'; + +import { AgentKnowledgeService } from '../agent-knowledge.service'; +import type { AgentFileRepository } from '../repositories/agent-file.repository'; +import type { AgentRepository } from '../repositories/agent.repository'; + +jest.unmock('node:fs'); +jest.unmock('node:fs/promises'); + +const mockGetText = jest.fn, []>(); +const mockDestroy = jest.fn, []>(); + +jest.mock('pdf-parse', () => ({ + __esModule: true, + PDFParse: jest.fn().mockImplementation(() => ({ + getText: mockGetText, + destroy: mockDestroy, + })), +})); + +jest.mock('@n8n/utils', () => ({ + ...jest.requireActual('@n8n/utils'), + generateNanoId: jest.fn(() => 'file-1'), +})); + +const agentId = 'agent-1'; +const projectId = 'project-1'; + +function 
makeMulterFile(overrides: Partial = {}): Express.Multer.File { + return { + fieldname: 'files', + originalname: 'document.txt', + encoding: '7bit', + mimetype: 'text/plain', + buffer: Buffer.from('hello'), + size: 5, + stream: null as never, + destination: '', + filename: '', + path: '', + ...overrides, + }; +} + +describe('AgentKnowledgeService', () => { + let agentRepository: jest.Mocked; + let agentFileRepository: jest.Mocked; + let binaryDataService: jest.Mocked; + let service: AgentKnowledgeService; + + beforeEach(() => { + agentRepository = mock(); + agentFileRepository = mock(); + binaryDataService = mock(); + + agentFileRepository.create.mockImplementation((data?: Partial) => data as never); + binaryDataService.store.mockResolvedValue({ id: 'binary-1' } as never); + agentFileRepository.save.mockImplementation( + async (file) => + ({ + createdAt: new Date('2026-05-24T12:00:00.000Z'), + ...file, + }) as never, + ); + binaryDataService.getAsStream.mockImplementation(async () => + Readable.from(Buffer.from('stored text')), + ); + jest.mocked(generateNanoId).mockReset().mockReturnValue('file-1'); + mockGetText.mockReset(); + mockDestroy.mockReset().mockResolvedValue(undefined); + + service = new AgentKnowledgeService(agentRepository, agentFileRepository, binaryDataService); + }); + + it('rejects files for agents outside the project', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue(null); + + await expect(service.uploadFiles(agentId, projectId, [makeMulterFile()])).rejects.toThrow( + NotFoundError, + ); + + expect(binaryDataService.store).not.toHaveBeenCalled(); + expect(agentFileRepository.save).not.toHaveBeenCalled(); + }); + + it('rejects listing files for agents outside the project', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue(null); + + await expect(service.listFiles(agentId, projectId)).rejects.toThrow(NotFoundError); + + expect(agentFileRepository.findByAgentId).not.toHaveBeenCalled(); + }); + + it('rejects 
deleting files for agents outside the project', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue(null); + + await expect(service.deleteFile(agentId, projectId, 'file-1')).rejects.toThrow(NotFoundError); + + expect(agentFileRepository.findByIdAndAgentId).not.toHaveBeenCalled(); + expect(binaryDataService.deleteManyByBinaryDataId).not.toHaveBeenCalled(); + }); + + it('lists file rows for the agent', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + agentFileRepository.findByAgentId.mockResolvedValue([ + { + id: 'file-1', + agentId, + fileName: 'document.txt', + mimeType: 'text/plain', + fileSizeBytes: 5, + createdAt: new Date('2026-05-24T12:00:00.000Z'), + }, + ] as never); + + await expect(service.listFiles(agentId, projectId)).resolves.toEqual([ + { + id: 'file-1', + agentId, + fileName: 'document.txt', + mimeType: 'text/plain', + fileSizeBytes: 5, + createdAt: '2026-05-24T12:00:00.000Z', + }, + ]); + expect(agentFileRepository.findByAgentId).toHaveBeenCalledWith(agentId); + }); + it('stores binary data and creates file rows for the agent', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + + const [file] = await service.uploadFiles(agentId, projectId, [makeMulterFile()]); + + expect(binaryDataService.store).toHaveBeenCalledWith( + expect.objectContaining({ + sourceType: 'agent_file', + sourceId: 'file-1', + pathSegments: ['agents', agentId, 'files', 'file-1'], + }), + Buffer.from('hello'), + expect.objectContaining({ + fileName: 'document.txt', + mimeType: 'text/plain', + fileSize: '5', + bytes: 5, + }), + ); + expect(agentFileRepository.save).toHaveBeenCalledWith( + expect.objectContaining({ + id: 'file-1', + agentId, + binaryDataId: 'binary-1', + fileName: 'document.txt', + mimeType: 'text/plain', + fileSizeBytes: 5, + }), + ); + expect(file).toEqual({ + id: 'file-1', + agentId, + fileName: 'document.txt', + mimeType: 
'text/plain', + fileSizeBytes: 5, + createdAt: '2026-05-24T12:00:00.000Z', + }); + }); + + it('rolls back stored files and removes temp files when batch upload fails', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + jest.mocked(generateNanoId).mockReturnValueOnce('file-1').mockReturnValueOnce('file-2'); + binaryDataService.store + .mockResolvedValueOnce({ id: 'binary-1' } as never) + .mockRejectedValueOnce(new Error('disk full')); + const tempDirectory = await mkdtemp(path.join(tmpdir(), 'agent-knowledge-upload-')); + const firstPath = path.join(tempDirectory, 'first-upload'); + const secondPath = path.join(tempDirectory, 'second-upload'); + await writeFile(firstPath, 'first'); + await writeFile(secondPath, 'second'); + + try { + await expect( + service.uploadFiles(agentId, projectId, [ + makeMulterFile({ + originalname: 'first.txt', + buffer: undefined as never, + path: firstPath, + size: 5, + }), + makeMulterFile({ + originalname: 'second.txt', + buffer: undefined as never, + path: secondPath, + size: 6, + }), + ]), + ).rejects.toThrow('disk full'); + + expect(agentFileRepository.delete).toHaveBeenCalledWith(['file-1']); + expect(binaryDataService.deleteManyByBinaryDataId).toHaveBeenCalledWith(['binary-1']); + await expect(access(firstPath)).rejects.toThrow(); + await expect(access(secondPath)).rejects.toThrow(); + } finally { + await rm(tempDirectory, { recursive: true, force: true }); + } + }); + + it('rejects file names longer than the metadata column limit', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + + await expect( + service.uploadFiles(agentId, projectId, [ + makeMulterFile({ originalname: `${'a'.repeat(256)}.txt` }), + ]), + ).rejects.toThrow(BadRequestError); + + expect(binaryDataService.store).not.toHaveBeenCalled(); + expect(agentFileRepository.save).not.toHaveBeenCalled(); + }); + + it('rejects MIME types longer than the 
metadata column limit', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + + await expect( + service.uploadFiles(agentId, projectId, [ + makeMulterFile({ mimetype: 'text/'.concat('a'.repeat(256)) }), + ]), + ).rejects.toThrow(BadRequestError); + + expect(binaryDataService.store).not.toHaveBeenCalled(); + expect(agentFileRepository.save).not.toHaveBeenCalled(); + }); + + it('deletes the file row and stored binary data for the agent', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + agentFileRepository.findByIdAndAgentId.mockResolvedValue({ + id: 'file-1', + agentId, + binaryDataId: 'binary-1', + fileName: 'document.txt', + mimeType: 'text/plain', + fileSizeBytes: 5, + createdAt: new Date('2026-05-24T12:00:00.000Z'), + } as never); + + await service.deleteFile(agentId, projectId, 'file-1'); + + expect(agentFileRepository.delete).toHaveBeenCalledWith({ id: 'file-1', agentId }); + expect(binaryDataService.deleteManyByBinaryDataId).toHaveBeenCalledWith(['binary-1']); + expect(binaryDataService.deleteManyByBinaryDataId.mock.invocationCallOrder[0]).toBeLessThan( + agentFileRepository.delete.mock.invocationCallOrder[0], + ); + }); + + it('deletes all stored binary data before deleting agent file rows', async () => { + agentFileRepository.findByAgentId.mockResolvedValue([ + { + id: 'file-1', + agentId, + binaryDataId: 'binary-1', + fileName: 'document.txt', + mimeType: 'text/plain', + fileSizeBytes: 5, + createdAt: new Date('2026-05-24T12:00:00.000Z'), + }, + { + id: 'file-2', + agentId, + binaryDataId: 'binary-2', + fileName: 'notes.md', + mimeType: 'text/markdown', + fileSizeBytes: 9, + createdAt: new Date('2026-05-24T12:00:00.000Z'), + }, + ] as never); + + await service.deleteAllFilesForAgent(agentId); + + expect(binaryDataService.deleteManyByBinaryDataId).toHaveBeenCalledWith([ + 'binary-1', + 'binary-2', + ]); + 
expect(agentFileRepository.delete).toHaveBeenCalledWith({ agentId }); + expect(binaryDataService.deleteManyByBinaryDataId.mock.invocationCallOrder[0]).toBeLessThan( + agentFileRepository.delete.mock.invocationCallOrder[0], + ); + }); + + it('rejects deleting files that are not attached to the agent', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + agentFileRepository.findByIdAndAgentId.mockResolvedValue(null); + + await expect(service.deleteFile(agentId, projectId, 'file-1')).rejects.toThrow(NotFoundError); + + expect(agentFileRepository.delete).not.toHaveBeenCalled(); + expect(binaryDataService.deleteManyByBinaryDataId).not.toHaveBeenCalled(); + }); + + it('stores extracted PDF text as the binary payload while preserving the PDF filename', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + mockGetText.mockResolvedValue({ text: 'Extracted PDF text', total: 1 }); + + const [file] = await service.uploadFiles(agentId, projectId, [ + makeMulterFile({ + originalname: 'document.pdf', + mimetype: 'application/pdf', + buffer: Buffer.from('%PDF original bytes'), + size: 19, + }), + ]); + + expect(binaryDataService.store).toHaveBeenCalledWith( + expect.objectContaining({ + sourceType: 'agent_file', + sourceId: 'file-1', + }), + Buffer.from('Extracted PDF text', 'utf8'), + expect.objectContaining({ + fileName: 'document.pdf.txt', + mimeType: 'text/plain', + fileSize: '18', + bytes: 18, + fileExtension: 'txt', + }), + ); + expect(agentFileRepository.save).toHaveBeenCalledWith( + expect.objectContaining({ + fileName: 'document.pdf', + mimeType: 'text/plain', + fileSizeBytes: 19, + }), + ); + expect(file).toMatchObject({ + fileName: 'document.pdf', + mimeType: 'text/plain', + fileSizeBytes: 19, + }); + expect(mockDestroy).toHaveBeenCalledTimes(1); + }); + + it('rejects PDFs with no extractable text', async () => { + 
agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + mockGetText.mockResolvedValue({ text: ' ', total: 1 }); + + await expect( + service.uploadFiles(agentId, projectId, [ + makeMulterFile({ + originalname: 'empty.pdf', + mimetype: 'application/pdf', + buffer: Buffer.from('%PDF original bytes'), + }), + ]), + ).rejects.toThrow(BadRequestError); + + expect(binaryDataService.store).not.toHaveBeenCalled(); + expect(agentFileRepository.save).not.toHaveBeenCalled(); + expect(mockDestroy).toHaveBeenCalledTimes(1); + }); + + it('materializes stored PDF text as a text file', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + agentFileRepository.findByAgentId.mockResolvedValue([ + { + id: 'file-1', + agentId, + binaryDataId: 'binary-1', + fileName: 'document.pdf', + mimeType: 'text/plain', + fileSizeBytes: 19, + createdAt: new Date('2026-05-24T12:00:00.000Z'), + }, + ] as never); + binaryDataService.getAsStream.mockImplementation(async () => + Readable.from(Buffer.from('stored PDF text')), + ); + const workspaceRoot = await mkdtemp(path.join(tmpdir(), 'agent-knowledge-service-')); + try { + const files = await service.materializeWorkspace(agentId, projectId, workspaceRoot); + + expect(files).toEqual([ + expect.objectContaining({ + fileName: 'document.pdf', + mimeType: 'text/plain', + relativePath: 'file-1.pdf.txt', + }), + ]); + await expect(readFile(path.join(workspaceRoot, 'file-1.pdf.txt'), 'utf8')).resolves.toBe( + 'stored PDF text', + ); + } finally { + await rm(workspaceRoot, { recursive: true, force: true }); + } + }); + it('materializes only requested files when file references are provided', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + agentFileRepository.findByAgentId.mockResolvedValue([ + { + id: 'file-1', + agentId, + binaryDataId: 'binary-1', + fileName: 'data.csv', + mimeType: 'text/csv', + 
fileSizeBytes: 17, + createdAt: new Date('2026-05-24T12:00:00.000Z'), + }, + { + id: 'file-2', + agentId, + binaryDataId: 'binary-2', + fileName: 'notes.txt', + mimeType: 'text/plain', + fileSizeBytes: 10, + createdAt: new Date('2026-05-24T12:00:00.000Z'), + }, + ] as never); + binaryDataService.getAsStream.mockImplementation(async () => + Readable.from(Buffer.from('name,age\nAlice,30\n')), + ); + const workspaceRoot = await mkdtemp(path.join(tmpdir(), 'agent-knowledge-service-')); + try { + const files = await service.materializeWorkspace(agentId, projectId, workspaceRoot, { + fileReferences: ['file-1'], + }); + + expect(files).toEqual([expect.objectContaining({ id: 'file-1' })]); + expect(binaryDataService.getAsStream).toHaveBeenCalledTimes(1); + expect(binaryDataService.getAsStream).toHaveBeenCalledWith('binary-1'); + } finally { + await rm(workspaceRoot, { recursive: true, force: true }); + } + }); + + it('materializes files requested by display file name', async () => { + agentRepository.findByIdAndProjectId.mockResolvedValue({ id: agentId, projectId } as never); + agentFileRepository.findByAgentId.mockResolvedValue([ + { + id: 'file-1', + agentId, + binaryDataId: 'binary-1', + fileName: 'data.csv', + mimeType: 'text/csv', + fileSizeBytes: 17, + createdAt: new Date('2026-05-24T12:00:00.000Z'), + }, + { + id: 'file-2', + agentId, + binaryDataId: 'binary-2', + fileName: 'notes.txt', + mimeType: 'text/plain', + fileSizeBytes: 10, + createdAt: new Date('2026-05-24T12:00:00.000Z'), + }, + ] as never); + binaryDataService.getAsStream.mockImplementation(async () => + Readable.from(Buffer.from('name,age\nAlice,30\n')), + ); + const workspaceRoot = await mkdtemp(path.join(tmpdir(), 'agent-knowledge-service-')); + try { + const files = await service.materializeWorkspace(agentId, projectId, workspaceRoot, { + fileReferences: ['data.csv'], + }); + + expect(files).toEqual([expect.objectContaining({ id: 'file-1', fileName: 'data.csv' })]); + 
expect(binaryDataService.getAsStream).toHaveBeenCalledTimes(1); + expect(binaryDataService.getAsStream).toHaveBeenCalledWith('binary-1'); + } finally { + await rm(workspaceRoot, { recursive: true, force: true }); + } + }); +}); diff --git a/packages/cli/src/modules/agents/__tests__/agent-upload.middleware.test.ts b/packages/cli/src/modules/agents/__tests__/agent-upload.middleware.test.ts new file mode 100644 index 00000000000..037fd925b2a --- /dev/null +++ b/packages/cli/src/modules/agents/__tests__/agent-upload.middleware.test.ts @@ -0,0 +1,17 @@ +import { isAllowedAgentFile } from '../agent-upload.middleware'; + +describe('AgentUploadMiddleware', () => { + it.each(['data.csv', 'notes.md', 'notes.markdown', 'document.pdf', 'plain.txt'])( + 'allows %s', + (originalname) => { + expect(isAllowedAgentFile({ originalname })).toBe(true); + }, + ); + + it.each(['archive.zip', 'image.png', 'script.js', 'document.pdf.exe', 'README'])( + 'rejects %s', + (originalname) => { + expect(isAllowedAgentFile({ originalname })).toBe(false); + }, + ); +}); diff --git a/packages/cli/src/modules/agents/__tests__/agents-service-reconstruct-gating.test.ts b/packages/cli/src/modules/agents/__tests__/agents-service-reconstruct-gating.test.ts index 0922371d1c7..0ecfe0bd5e6 100644 --- a/packages/cli/src/modules/agents/__tests__/agents-service-reconstruct-gating.test.ts +++ b/packages/cli/src/modules/agents/__tests__/agents-service-reconstruct-gating.test.ts @@ -81,6 +81,8 @@ function makeService( mock(), mock(), mock(), + mock(), + mock(), ); } diff --git a/packages/cli/src/modules/agents/__tests__/agents-service-sync.test.ts b/packages/cli/src/modules/agents/__tests__/agents-service-sync.test.ts index 46d369441a0..65c777ab1e3 100644 --- a/packages/cli/src/modules/agents/__tests__/agents-service-sync.test.ts +++ b/packages/cli/src/modules/agents/__tests__/agents-service-sync.test.ts @@ -83,6 +83,8 @@ describe('AgentsService — updateName / updateDescription schema sync', () => { mock(), 
mock(), mock(), + mock(), + mock(), ); }); diff --git a/packages/cli/src/modules/agents/__tests__/agents.controller.test.ts b/packages/cli/src/modules/agents/__tests__/agents.controller.test.ts index 52c29839124..c0ec79a62e2 100644 --- a/packages/cli/src/modules/agents/__tests__/agents.controller.test.ts +++ b/packages/cli/src/modules/agents/__tests__/agents.controller.test.ts @@ -1,6 +1,7 @@ import { ControllerRegistryMetadata } from '@n8n/decorators'; import { Container } from '@n8n/di'; import { mock } from 'jest-mock-extended'; +import multer from 'multer'; import type { CredentialsService } from '@/credentials/credentials.service'; import { BadRequestError } from '@/errors/response-errors/bad-request.error'; @@ -13,6 +14,7 @@ import type { AgentScheduleService } from '../integrations/agent-schedule.servic import type { ChatIntegrationService } from '../integrations/chat-integration.service'; import type { SlackAppSetupService } from '../integrations/slack-app-setup.service'; import type { AgentExecutionService } from '../agent-execution.service'; +import type { AgentKnowledgeService } from '../agent-knowledge.service'; import type { AgentRepository } from '../repositories/agent.repository'; import { AgentsController } from '../agents.controller'; import { AgentsCredentialProvider } from '../adapters/agents-credential-provider'; @@ -43,6 +45,7 @@ function makeController({ agentRepository = mock(), chatIntegrationRegistry = mock(), slackAppSetupService = mock(), + agentKnowledgeService = mock(), }: { agentsService?: jest.Mocked; credentialsService?: jest.Mocked; @@ -51,6 +54,7 @@ function makeController({ agentRepository?: jest.Mocked; chatIntegrationRegistry?: jest.Mocked; slackAppSetupService?: jest.Mocked; + agentKnowledgeService?: jest.Mocked; } = {}) { if (!chatIntegrationRegistry.require.getMockImplementation()) { chatIntegrationRegistry.require.mockImplementation( @@ -63,6 +67,10 @@ function makeController({ ); } + // Default the knowledge-base module to 
enabled so file-endpoint tests pass; + // the disabled-gating test overrides this on the returned mock. + agentsService.isKnowledgeBaseModuleEnabled.mockReturnValue(true); + const controller = new AgentsController( agentsService, mock(), @@ -73,6 +81,7 @@ function makeController({ mock(), chatIntegrationRegistry, slackAppSetupService, + agentKnowledgeService, ); return { @@ -84,6 +93,7 @@ function makeController({ agentRepository, chatIntegrationRegistry, slackAppSetupService, + agentKnowledgeService, }; } @@ -110,6 +120,9 @@ describe('AgentsController route access scopes', () => { ['updateSkill', 'agent:update'], ['deleteSkill', 'agent:update'], ['revertToPublished', 'agent:update'], + ['listFiles', 'agent:read'], + ['uploadFiles', 'agent:update'], + ['deleteFile', 'agent:update'], ['revertToVersion', 'agent:update'], ['createSlackApp', 'agent:update'], ['getSlackAppManifest', 'agent:read'], @@ -119,6 +132,70 @@ describe('AgentsController route access scopes', () => { }); }); +describe('AgentsController file uploads', () => { + it('rejects empty uploads', async () => { + const { controller } = makeController(); + + await expect( + controller.uploadFiles( + { params: { projectId: 'project-1' }, files: [] } as never, + undefined as never, + 'project-1', + 'agent-1', + ), + ).rejects.toThrow(BadRequestError); + }); + + it('maps multer upload validation errors to bad requests', async () => { + const { controller } = makeController(); + + await expect( + controller.uploadFiles( + { + params: { projectId: 'project-1' }, + fileUploadError: new multer.MulterError('LIMIT_FILE_COUNT'), + } as never, + undefined as never, + 'project-1', + 'agent-1', + ), + ).rejects.toThrow(BadRequestError); + }); +}); + +describe('AgentsController knowledge base gating', () => { + it('returns not found for file endpoints when the knowledge-base module is disabled', async () => { + const { controller, agentsService } = makeController(); + 
agentsService.isKnowledgeBaseModuleEnabled.mockReturnValue(false); + + await expect( + controller.listFiles( + { params: { projectId: 'project-1' } } as never, + undefined as never, + 'project-1', + 'agent-1', + ), + ).rejects.toThrow(NotFoundError); + await expect( + controller.uploadFiles( + { params: { projectId: 'project-1' }, files: [] } as never, + undefined as never, + 'project-1', + 'agent-1', + ), + ).rejects.toThrow(NotFoundError); + await expect( + controller.deleteFile( + { params: { projectId: 'project-1' } } as never, + undefined as never, + 'project-1', + 'agent-1', + 'file-1', + ), + ).rejects.toThrow(NotFoundError); + }); +}); + describe('AgentsController publish history', () => { it('lists publish history with pagination forwarded from the query', async () => { const { controller, agentsService } = makeController(); @@ -215,6 +292,7 @@ describe('AgentsController integration credentials', () => { mock(), mock(), mock(), + mock(), ); await expect( @@ -766,6 +844,7 @@ describe('AgentsController agent resource', () => { mock(), mock(), mock(), + mock(), ); const result = await controller.get( @@ -810,6 +889,7 @@ describe('AgentsController agent resource', () => { mock(), mock(), mock(), + mock(), ); const result = await controller.get( @@ -843,6 +923,7 @@ describe('AgentsController chat message history', () => { mock(), mock(), mock(), + mock(), ); return { controller, agentsService }; diff --git a/packages/cli/src/modules/agents/__tests__/agents.service.test.ts b/packages/cli/src/modules/agents/__tests__/agents.service.test.ts index 6df6224caee..621497eabb9 100644 --- a/packages/cli/src/modules/agents/__tests__/agents.service.test.ts +++ b/packages/cli/src/modules/agents/__tests__/agents.service.test.ts @@ -34,6 +34,7 @@ import { import type { N8NCheckpointStorage } from '../integrations/n8n-checkpoint-storage'; import type { N8nMemory } from '../integrations/n8n-memory'; import type { AgentExecutionService } from '../agent-execution.service'; 
+import type { AgentKnowledgeService } from '../agent-knowledge.service'; import type { AgentHistoryRepository } from '../repositories/agent-history.repository'; import type { AgentRepository } from '../repositories/agent.repository'; @@ -83,6 +84,7 @@ describe('AgentsService', () => { let agentExecutionService: jest.Mocked; let scheduleService: jest.Mocked; let chatIntegrationService: jest.Mocked; + let agentKnowledgeService: jest.Mocked; let publisher: jest.Mocked; let agentsConfig: AgentsConfig; let globalConfig: jest.Mocked; @@ -101,6 +103,7 @@ describe('AgentsService', () => { agentExecutionService.recordMessage.mockResolvedValue('exec-id'); scheduleService = mock(); chatIntegrationService = mock(); + agentKnowledgeService = mock(); publisher = mock(); publisher.publishCommand.mockResolvedValue(); agentsConfig = { modules: [] } as unknown as AgentsConfig; @@ -134,6 +137,8 @@ describe('AgentsService', () => { globalConfig, telemetry, chatIntegrationService, + agentKnowledgeService, + mock(), mock(), ); }); @@ -2332,6 +2337,28 @@ describe('AgentsService', () => { expect(memoryBackend.deleteThread).toHaveBeenCalledWith(chatThreadId(agentId)); }); + it('deletes knowledge file content before removing the agent row', async () => { + const agent = makeAgent(); + agentRepository.findByIdAndProjectId.mockResolvedValue(agent); + + await service.delete(agentId, projectId); + + expect(agentKnowledgeService.deleteAllFilesForAgent).toHaveBeenCalledWith(agentId); + expect(agentKnowledgeService.deleteAllFilesForAgent.mock.invocationCallOrder[0]).toBeLessThan( + agentRepository.remove.mock.invocationCallOrder[0], + ); + }); + + it('still removes the agent when knowledge file cleanup fails', async () => { + const agent = makeAgent(); + agentRepository.findByIdAndProjectId.mockResolvedValue(agent); + agentKnowledgeService.deleteAllFilesForAgent.mockRejectedValueOnce(new Error('storage down')); + + await expect(service.delete(agentId, projectId)).resolves.toBe(true); + + 
expect(agentRepository.remove).toHaveBeenCalledWith(agent); + }); + it('stops the local schedule when deleting the agent', async () => { const agent = makeAgent(); agentRepository.findByIdAndProjectId.mockResolvedValue(agent); diff --git a/packages/cli/src/modules/agents/__tests__/execution-recorder.test.ts b/packages/cli/src/modules/agents/__tests__/execution-recorder.test.ts index 522a7273ee0..4a16016bf02 100644 --- a/packages/cli/src/modules/agents/__tests__/execution-recorder.test.ts +++ b/packages/cli/src/modules/agents/__tests__/execution-recorder.test.ts @@ -151,6 +151,31 @@ describe('ExecutionRecorder', () => { }); }); + it('pairs same-name flat tool calls by toolCallId when results arrive out of order', () => { + const recorder = new ExecutionRecorder(); + + recorder.record(makeToolCallChunk('search_knowledge', { file: 'first.md' }, 'call-1')); + recorder.record(makeToolCallChunk('search_knowledge', { file: 'second.md' }, 'call-2')); + recorder.record(makeToolResultChunk('search_knowledge', { fileName: 'second.md' }, 'call-2')); + recorder.record(makeToolResultChunk('search_knowledge', { fileName: 'first.md' }, 'call-1')); + recorder.record({ type: 'finish', finishReason: 'stop' } as StreamChunk); + + const record = recorder.getMessageRecord(); + + expect(record.toolCalls).toEqual([ + { + name: 'search_knowledge', + input: { file: 'first.md' }, + output: { fileName: 'first.md' }, + }, + { + name: 'search_knowledge', + input: { file: 'second.md' }, + output: { fileName: 'second.md' }, + }, + ]); + }); + it('still concatenates assistantResponse from all text deltas', () => { const recorder = new ExecutionRecorder(); diff --git a/packages/cli/src/modules/agents/agent-knowledge-command.service.ts b/packages/cli/src/modules/agents/agent-knowledge-command.service.ts new file mode 100644 index 00000000000..6dc5eec5bae --- /dev/null +++ b/packages/cli/src/modules/agents/agent-knowledge-command.service.ts @@ -0,0 +1,293 @@ +import { Service } from '@n8n/di'; 
import { spawn } from 'node:child_process';
import { mkdtemp, realpath, rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import path from 'node:path';
import pLimit from 'p-limit';

/** Maximum number of bytes of stdout (and, separately, of stderr) returned per command. */
const MAX_OUTPUT_BYTES = 64 * 1024;
/** A command that is still running after this long is killed with SIGKILL. */
const COMMAND_TIMEOUT_MS = 5_000;
/**
 * Cap concurrent knowledge workspaces per process. Each workspace reads files
 * off the binary store and spawns a child process, so unbounded concurrency
 * could saturate CPU/disk on a shared (multi-tenant) host.
 */
const MAX_CONCURRENT_WORKSPACES = 4;
/** Evict a cached workspace after this much idle time. */
const WORKSPACE_CACHE_TTL_MS = 10 * 60_000;
/** Hard cap on retained workspaces to bound temp-dir disk usage. */
const MAX_CACHED_WORKSPACES = 25;
export const AGENT_KNOWLEDGE_COMMANDS = ['git_grep', 'cat', 'sed'] as const;

/** Bounds concurrent workspace usage; queued calls run in FIFO order. */
const workspaceLimit = pLimit(MAX_CONCURRENT_WORKSPACES);

/** A materialized workspace directory and the last time it was handed out. */
interface CachedWorkspace {
	root: string;
	lastUsedAt: number;
}

export type AgentKnowledgeCommand = (typeof AGENT_KNOWLEDGE_COMMANDS)[number];

/** One allow-listed read-only command, discriminated by `command`. */
export type AgentKnowledgeCommandRequest =
	| {
			command: 'git_grep';
			pattern: string;
			outputMode?: 'count';
			caseInsensitive?: boolean;
			fixedStrings?: boolean;
			context?: number;
			files?: string[];
	  }
	| {
			command: 'cat';
			file: string;
	  }
	| {
			command: 'sed';
			file: string;
			startLine: number;
			endLine: number;
	  };

export interface AgentKnowledgeCommandResult {
	command: AgentKnowledgeCommand;
	/** `null` when the process was terminated by a signal (e.g. the timeout kill). */
	exitCode: number | null;
	stdout: string;
	stderr: string;
	/** True when output exceeded `MAX_OUTPUT_BYTES` or the command hit the timeout. */
	truncated: boolean;
}

type SafePathOptions = { allowRoot?: boolean };

/**
 * Runs a small allow-list of read-only commands (`git grep`, `cat`, `sed -n`)
 * inside a temp-directory workspace and manages a cache of such workspaces.
 */
@Service()
export class AgentKnowledgeCommandService {
	private readonly cachedWorkspaces = new Map<string, CachedWorkspace>();
	private readonly workspaceLocks = new Map<string, Promise<void>>();

	/**
	 * Translate `request` into an executable + argument vector and run it with
	 * the (symlink-resolved) workspace root as the working directory.
	 *
	 * @throws Error when the request names a path outside the workspace or has
	 *   an empty search pattern; rejects if the executable cannot be spawned.
	 */
	async run(workspaceRoot: string, request: AgentKnowledgeCommandRequest) {
		const root = await realpath(workspaceRoot);
		const { executable, args } = await this.toSpawnArgs(root, request);
		return await this.spawnCommand(root, executable, args, request.command);
	}

	/**
	 * Runs an operation against a materialized workspace, reusing it across
	 * calls keyed by `cacheKey` (which must encode the agent + exact file set +
	 * content). Calls for the same key are serialized so the shared directory is
	 * never materialized or read concurrently; idle workspaces are evicted by
	 * TTL/LRU rather than per call. This avoids re-writing the whole knowledge
	 * base to disk on every tool call within a conversation.
	 */
	async withCachedWorkspace<T>(
		cacheKey: string,
		materialize: (workspaceRoot: string) => Promise<unknown>,
		operation: (workspaceRoot: string) => Promise<T>,
	): Promise<T> {
		return await this.serializeByKey(
			cacheKey,
			async () =>
				await workspaceLimit(async () => {
					const workspaceRoot = await this.ensureCachedWorkspace(cacheKey, materialize);
					return await operation(workspaceRoot);
				}),
		);
	}

	/** Run `fn`s sharing a key strictly one at a time (FIFO). */
	private async serializeByKey<T>(key: string, fn: () => Promise<T>): Promise<T> {
		const previous = this.workspaceLocks.get(key) ?? Promise.resolve();
		// Start after the previous call settles, whether it succeeded or failed.
		const run = previous.then(fn, fn);
		// The stored tail never rejects, so one failure cannot poison the queue.
		const tail = run.then(
			() => undefined,
			() => undefined,
		);
		this.workspaceLocks.set(key, tail);
		try {
			return await run;
		} finally {
			// Only the last queued call removes the entry; earlier ones leave it
			// for whoever chained after them.
			if (this.workspaceLocks.get(key) === tail) this.workspaceLocks.delete(key);
		}
	}

	/**
	 * Return the cached workspace directory for `cacheKey`, creating and
	 * materializing a fresh temp directory when there is none (or when the
	 * cached directory no longer exists on disk).
	 */
	private async ensureCachedWorkspace(
		cacheKey: string,
		materialize: (workspaceRoot: string) => Promise<unknown>,
	): Promise<string> {
		const existing = this.cachedWorkspaces.get(cacheKey);
		if (existing && (await this.directoryExists(existing.root))) {
			existing.lastUsedAt = Date.now();
			return existing.root;
		}
		if (existing) this.cachedWorkspaces.delete(cacheKey);

		const workspaceRoot = await mkdtemp(path.join(tmpdir(), 'n8n-agent-knowledge-'));
		try {
			await materialize(workspaceRoot);
		} catch (error) {
			// Do not leave a half-written workspace behind.
			await rm(workspaceRoot, { recursive: true, force: true }).catch(() => {});
			throw error;
		}
		this.cachedWorkspaces.set(cacheKey, { root: workspaceRoot, lastUsedAt: Date.now() });
		await this.evictStaleWorkspaces();
		return workspaceRoot;
	}

	/**
	 * Drop workspaces idle for longer than `WORKSPACE_CACHE_TTL_MS`, then the
	 * least recently used ones beyond `MAX_CACHED_WORKSPACES`. Directory removal
	 * is best-effort.
	 */
	private async evictStaleWorkspaces() {
		const now = Date.now();
		const evictable: Array<[string, CachedWorkspace]> = [];
		const fresh: Array<[string, CachedWorkspace]> = [];
		for (const entry of this.cachedWorkspaces) {
			(now - entry[1].lastUsedAt > WORKSPACE_CACHE_TTL_MS ? evictable : fresh).push(entry);
		}
		if (fresh.length > MAX_CACHED_WORKSPACES) {
			fresh.sort((left, right) => left[1].lastUsedAt - right[1].lastUsedAt);
			evictable.push(...fresh.slice(0, fresh.length - MAX_CACHED_WORKSPACES));
		}
		for (const [key, workspace] of evictable) {
			this.cachedWorkspaces.delete(key);
			await rm(workspace.root, { recursive: true, force: true }).catch(() => {});
		}
	}

	/** True when `directory` can be resolved with `realpath` (i.e. still exists). */
	private async directoryExists(directory: string) {
		try {
			await realpath(directory);
			return true;
		} catch {
			return false;
		}
	}

	/**
	 * Build the executable name and argument vector for a request. Every file
	 * argument is validated with {@link safePath} and passed relative to `root`.
	 */
	private async toSpawnArgs(
		root: string,
		request: AgentKnowledgeCommandRequest,
	): Promise<{ executable: string; args: string[] }> {
		switch (request.command) {
			case 'git_grep': {
				if (request.pattern.trim() === '') throw new Error('Search pattern is required');
				// --no-index: search plain files; -n: line numbers; -I: skip binary files.
				const args = ['grep', '--no-index', '-n', '-I'];
				if (request.caseInsensitive) args.push('-i');
				if (request.fixedStrings) args.push('-F');
				if (request.fixedStrings === false) args.push('-E');
				if (request.outputMode === 'count') args.push('-c');
				if (request.context !== undefined) {
					// Context lines are clamped to 0..5.
					args.push('-C', String(Math.min(Math.max(request.context, 0), 5)));
				}
				// `--` keeps a pattern that starts with `-` from being read as an option.
				args.push('--', request.pattern);
				const files = await Promise.all(
					(request.files ?? ['.']).map(
						async (file) => await this.safePath(root, file, { allowRoot: true }),
					),
				);
				args.push(...files.map((file) => path.relative(root, file) || '.'));
				return { executable: 'git', args };
			}
			case 'cat': {
				const file = await this.safePath(root, request.file);
				return { executable: 'cat', args: [path.relative(root, file)] };
			}
			case 'sed': {
				const file = await this.safePath(root, request.file);
				const startLine = Math.max(1, request.startLine);
				const endLine = Math.max(startLine, request.endLine);
				return {
					executable: 'sed',
					args: [
						'-n',
						// Print at most 501 lines per call, starting at startLine.
						`${startLine},${Math.min(endLine, startLine + 500)}p`,
						path.relative(root, file),
					],
				};
			}
		}
	}

	/**
	 * Resolve `requestedPath` against `root` and return its real path.
	 *
	 * @throws Error for control characters, absolute paths, `..` segments, a
	 *   path whose real location is outside `root`, or `root` itself unless
	 *   `options.allowRoot` is set. `realpath` rejects if the path does not exist.
	 */
	private async safePath(root: string, requestedPath: string, options: SafePathOptions = {}) {
		if (this.hasControlCharacters(requestedPath)) throw new Error('Invalid path');
		if (path.isAbsolute(requestedPath)) throw new Error('Absolute paths are not allowed');
		if (requestedPath.split(/[\\/]/).includes('..')) {
			throw new Error('Parent path segments are not allowed');
		}
		const resolved = path.resolve(root, requestedPath);
		// Resolve symlinks before the containment check.
		const actual = await realpath(resolved);
		const relative = path.relative(root, actual);
		if (
			(!options.allowRoot && relative === '') ||
			relative.startsWith('..') ||
			path.isAbsolute(relative)
		) {
			throw new Error('Path escapes the knowledge workspace');
		}
		return actual;
	}

	/** True when `value` contains a C0 control character (<= 0x1f) or DEL (0x7f). */
	private hasControlCharacters(value: string) {
		for (const character of value) {
			const code = character.charCodeAt(0);
			if (code <= 0x1f || code === 0x7f) return true;
		}
		return false;
	}

	/**
	 * Spawn `executable` without a shell, collect bounded stdout/stderr and
	 * resolve once the process has closed. Rejects only if the process cannot be
	 * spawned; a non-zero exit code is reported in the result.
	 */
	private async spawnCommand(
		cwd: string,
		executable: string,
		args: string[],
		command: AgentKnowledgeCommand,
	): Promise<AgentKnowledgeCommandResult> {
		return await new Promise<AgentKnowledgeCommandResult>((resolve, reject) => {
			const child = spawn(executable, args, {
				cwd,
				shell: false,
				// Minimal env: PATH so the allow-listed binaries resolve, plus git
				// isolation so no host/user gitconfig or credential prompt can
				// influence `git grep`. No n8n secrets are exposed to the child.
				env: {
					PATH: process.env.PATH,
					HOME: cwd,
					GIT_CONFIG_NOSYSTEM: '1',
					GIT_CONFIG_GLOBAL: '/dev/null',
					GIT_TERMINAL_PROMPT: '0',
				},
			});
			const stdout = new BoundedOutput(MAX_OUTPUT_BYTES);
			const stderr = new BoundedOutput(MAX_OUTPUT_BYTES);
			let timedOut = false;
			const timer = setTimeout(() => {
				child.kill('SIGKILL');
				timedOut = true;
			}, COMMAND_TIMEOUT_MS);

			// Output is kept as raw bytes and decoded once at the end: decoding
			// per chunk would corrupt a multi-byte character that straddles two
			// `data` events.
			child.stdout.on('data', (chunk: Buffer) => stdout.append(chunk));
			child.stderr.on('data', (chunk: Buffer) => stderr.append(chunk));
			child.on('error', (error) => {
				// Spawn failed: do not leave the kill timer pending.
				clearTimeout(timer);
				reject(error);
			});
			child.on('close', (exitCode) => {
				clearTimeout(timer);
				resolve({
					command,
					exitCode,
					stdout: stdout.toString(),
					stderr: stderr.toString(),
					truncated: timedOut || stdout.truncated || stderr.truncated,
				});
			});
		});
	}
}

/** Collects stream output as bytes, retaining no more than a fixed cap. */
class BoundedOutput {
	private readonly chunks: Buffer[] = [];
	private retainedBytes = 0;

	/** Set once more than `maxBytes` bytes have been received. */
	truncated = false;

	constructor(private readonly maxBytes: number) {}

	append(chunk: Buffer) {
		// Retain one byte beyond the cap so the final cut can tell whether it
		// would land in the middle of a character.
		const room = Math.max(this.maxBytes + 1 - this.retainedBytes, 0);
		const kept = chunk.length > room ? chunk.subarray(0, room) : chunk;
		if (kept.length > 0) {
			this.chunks.push(kept);
			this.retainedBytes += kept.length;
		}
		if (this.retainedBytes > this.maxBytes) this.truncated = true;
	}

	toString() {
		return truncateBufferToUtf8String(
			Buffer.concat(this.chunks, this.retainedBytes),
			this.maxBytes,
		);
	}
}

/**
 * Decode at most `maxBytes` bytes of `buffer` as UTF-8 without splitting a
 * multi-byte character. The returned string never encodes to more than
 * `maxBytes` bytes, even when `buffer` holds invalid UTF-8 (each invalid byte
 * decodes to a 3-byte U+FFFD, so the decoded text is cut a second time).
 */
function truncateBufferToUtf8String(buffer: Buffer, maxBytes: number) {
	const limit = Math.max(0, maxBytes);
	const decoded = buffer.subarray(0, utf8CutPoint(buffer, limit)).toString('utf8');
	if (Buffer.byteLength(decoded) <= limit) return decoded;

	// Replacement characters expanded the text past the cap. The re-encoded
	// bytes are valid UTF-8, so cutting them on a boundary is exact.
	const reencoded = Buffer.from(decoded, 'utf8');
	return reencoded.subarray(0, utf8CutPoint(reencoded, limit)).toString('utf8');
}

/**
 * Largest offset <= `limit` that does not fall inside a UTF-8 sequence. Looks
 * back at most 3 bytes, the maximum number of continuation bytes in one
 * character, so malformed input cannot cause a long scan.
 */
function utf8CutPoint(buffer: Buffer, limit: number) {
	if (limit >= buffer.length) return buffer.length;

	let end = limit;
	// `buffer[end]` is the first excluded byte; a continuation byte (10xxxxxx)
	// there means the cut is mid-character, so back up to that character's start.
	for (let steps = 0; steps < 3 && end > 0 && (buffer.readUInt8(end) & 0xc0) === 0x80; steps++) {
		end--;
	}
	return end;
}
import type { AgentFileDto } from '@n8n/api-types';
import { Service } from '@n8n/di';
import { generateNanoId, sanitizeFilename } from '@n8n/utils';
import { BinaryDataService, FileLocation } from 'n8n-core';
import { UnexpectedError, type IBinaryData } from 'n8n-workflow';
import { createWriteStream } from 'node:fs';
import { mkdir, readFile, unlink } from 'node:fs/promises';
import path from 'node:path';
import { pipeline } from 'node:stream/promises';

import { BadRequestError } from '@/errors/response-errors/bad-request.error';
import { NotFoundError } from '@/errors/response-errors/not-found.error';

import { AgentFile } from './entities/agent-file.entity';
import { AgentFileRepository } from './repositories/agent-file.repository';
import { AgentRepository } from './repositories/agent.repository';

/**
 * A knowledge file as seen by the agent runtime's `search_knowledge` tool.
 * Carries the stored metadata plus `relativePath`, the path the file is
 * written to inside the materialized workspace (see {@link
 * AgentKnowledgeService.materializeWorkspace}). This is distinct from the
 * API-facing `AgentFileDto`, which instead exposes `createdAt` for the UI.
 */
export interface KnowledgeWorkspaceFile {
	id: string;
	fileName: string;
	mimeType: string;
	fileSizeBytes: number;
	relativePath: string;
}

interface MaterializeWorkspaceOptions {
	/** When set, only files matching one of these ids, workspace paths or file names are written. */
	fileReferences?: string[];
}

/** What is actually handed to the binary store for one upload. */
interface StoredFileContent {
	buffer: Buffer;
	mimeType: string;
	fileName: string;
	fileExtension: string | undefined;
}

type StoredAgentFile = AgentFile & { binaryDataId: string };

const MAX_AGENT_FILE_METADATA_LENGTH = 255;

/**
 * Abuse guardrails for a single materialization. Deliberately generous so
 * normal knowledge bases never hit them — they exist to stop a pathological
 * corpus from writing unbounded data to the shared temp dir per call.
 */
const MAX_WORKSPACE_FILES = 2_000;
const MAX_WORKSPACE_BYTES = 2 * 1024 * 1024 * 1024;

/** Stores, lists, deletes and materializes the knowledge files attached to an agent. */
@Service()
export class AgentKnowledgeService {
	constructor(
		private readonly agentRepository: AgentRepository,
		private readonly agentFileRepository: AgentFileRepository,
		private readonly binaryDataService: BinaryDataService,
	) {}

	/**
	 * Store every uploaded file for the agent. If any file fails, the files
	 * already stored by this call are removed again (best-effort) and the error
	 * is rethrown. Multer's temp files are removed in every case.
	 *
	 * @throws NotFoundError when the agent is not in the project.
	 */
	async uploadFiles(
		agentId: string,
		projectId: string,
		files: Express.Multer.File[],
	): Promise<AgentFileDto[]> {
		const storedFiles: StoredAgentFile[] = [];

		try {
			// Inside the `try` so temp files are also removed when the agent
			// does not exist.
			await this.ensureAgentBelongsToProject(agentId, projectId);

			// Process sequentially to bound peak memory: each file is read into
			// a buffer and PDFs are parsed in-process, so storing the whole
			// batch in parallel could spike RSS for large uploads.
			for (const file of files) {
				storedFiles.push(await this.storeFile(agentId, file));
			}
		} catch (error) {
			await this.cleanupStoredFiles(storedFiles).catch(() => {});
			throw error;
		} finally {
			await this.cleanupUploadTempFiles(files);
		}

		return storedFiles.map((file) => this.toDto(file));
	}

	/**
	 * List files for the UI/API. Returns `AgentFileDto`s (with `createdAt`,
	 * no workspace path) for the Agent Builder and REST responses.
	 */
	async listFiles(agentId: string, projectId: string): Promise<AgentFileDto[]> {
		await this.ensureAgentBelongsToProject(agentId, projectId);

		const files = await this.agentFileRepository.findByAgentId(agentId);
		return files.map((file) => this.toDto(file));
	}

	/**
	 * List files for the agent runtime's `search_knowledge` tool. Returns
	 * `KnowledgeWorkspaceFile`s, which add the on-disk `relativePath` used
	 * inside the materialized workspace and omit API-only fields like
	 * `createdAt`.
	 */
	async listWorkspaceFiles(agentId: string, projectId: string) {
		await this.ensureAgentBelongsToProject(agentId, projectId);

		const files = await this.agentFileRepository.findByAgentId(agentId);
		return files.map((file) => this.toWorkspaceFile(file));
	}

	/**
	 * Delete one file: its stored content first, then its database row.
	 *
	 * @throws NotFoundError when the agent or the file does not exist.
	 */
	async deleteFile(agentId: string, projectId: string, fileId: string): Promise<void> {
		await this.ensureAgentBelongsToProject(agentId, projectId);

		const file = await this.agentFileRepository.findByIdAndAgentId(fileId, agentId);
		if (!file) {
			throw new NotFoundError(`Agent file "${fileId}" not found`);
		}

		await this.binaryDataService.deleteManyByBinaryDataId([file.binaryDataId]);
		await this.agentFileRepository.delete({ id: fileId, agentId });
	}

	/** Delete the stored content and rows of every file of the agent. No project check is made. */
	async deleteAllFilesForAgent(agentId: string): Promise<void> {
		const files = await this.agentFileRepository.findByAgentId(agentId);
		if (files.length === 0) return;

		await this.binaryDataService.deleteManyByBinaryDataId(files.map((file) => file.binaryDataId));
		await this.agentFileRepository.delete({ agentId });
	}

	/**
	 * Resolve the workspace-file metadata that {@link materializeWorkspace}
	 * would write for these references, without touching the binary store. Used
	 * to build a stable workspace cache key and to drive operations against a
	 * reused workspace.
	 */
	async resolveWorkspaceFiles(
		agentId: string,
		projectId: string,
		fileReferences?: string[],
	): Promise<KnowledgeWorkspaceFile[]> {
		await this.ensureAgentBelongsToProject(agentId, projectId);
		const files = this.filterFilesForWorkspace(
			await this.agentFileRepository.findByAgentId(agentId),
			fileReferences,
		);
		this.assertWorkspaceWithinLimits(files);
		return files.map((file) => this.toWorkspaceFile(file));
	}

	/**
	 * Write the agent's (optionally filtered) knowledge files into
	 * `workspaceRoot`, one file per entry named by
	 * {@link getWorkspaceRelativePath}, and return their workspace metadata.
	 *
	 * @throws NotFoundError when the agent is not in the project.
	 * @throws BadRequestError when the selection exceeds the workspace limits.
	 */
	async materializeWorkspace(
		agentId: string,
		projectId: string,
		workspaceRoot: string,
		options: MaterializeWorkspaceOptions = {},
	) {
		await this.ensureAgentBelongsToProject(agentId, projectId);
		await mkdir(workspaceRoot, { recursive: true });

		const files = this.filterFilesForWorkspace(
			await this.agentFileRepository.findByAgentId(agentId),
			options.fileReferences,
		);
		this.assertWorkspaceWithinLimits(files);
		const materializedFiles: KnowledgeWorkspaceFile[] = [];

		for (const file of files) {
			const relativePath = this.getWorkspaceRelativePath(file);
			const targetPath = path.join(workspaceRoot, relativePath);

			// Stream the stored content straight to the workspace file rather
			// than buffering the whole file in memory — knowledge files can be
			// up to the upload size limit.
			const contentStream = await this.binaryDataService.getAsStream(file.binaryDataId);
			await pipeline(contentStream, createWriteStream(targetPath));

			materializedFiles.push(this.toWorkspaceFile(file));
		}

		return materializedFiles;
	}

	/** @throws NotFoundError when no agent with this id exists in the project. */
	private async ensureAgentBelongsToProject(agentId: string, projectId: string) {
		const agent = await this.agentRepository.findByIdAndProjectId(agentId, projectId);
		if (!agent) {
			throw new NotFoundError(`Agent "${agentId}" not found`);
		}
	}

	/**
	 * Store one upload in the binary store and create its database row. PDFs
	 * are stored as their extracted text. If anything fails after the content
	 * was stored, the stored content is removed again before rethrowing.
	 */
	private async storeFile(agentId: string, file: Express.Multer.File): Promise<StoredAgentFile> {
		let storedBinaryDataId: string | undefined;
		try {
			const fileId = generateNanoId();
			const fileName = sanitizeFilename(
				// `originalname` is re-read as UTF-8 from its latin1 form.
				Buffer.from(file.originalname, 'latin1').toString('utf8'),
				MAX_AGENT_FILE_METADATA_LENGTH + 1,
			);
			this.validateMetadataLength('File name', fileName);
			const buffer = file.buffer ?? (await readFile(file.path));
			const storedContent = await this.prepareStoredContent(fileName, file.mimetype, buffer);
			this.validateMetadataLength('MIME type', storedContent.mimeType);
			const binaryData: IBinaryData = {
				data: '',
				mimeType: storedContent.mimeType,
				fileName: storedContent.fileName,
				fileSize: `${storedContent.buffer.length}`,
				bytes: storedContent.buffer.length,
				fileExtension: storedContent.fileExtension,
			};

			const storedBinaryData = await this.binaryDataService.store(
				FileLocation.ofCustom({
					sourceType: 'agent_file',
					sourceId: fileId,
					pathSegments: ['agents', agentId, 'files', fileId],
				}),
				storedContent.buffer,
				binaryData,
			);

			if (!storedBinaryData.id) {
				throw new UnexpectedError('Agent file upload requires persisted binary data');
			}
			storedBinaryDataId = storedBinaryData.id;

			const agentFile = this.agentFileRepository.create({
				id: fileId,
				agentId,
				binaryDataId: storedBinaryDataId,
				// The row keeps the uploaded name and size, not those of the
				// stored (possibly text-extracted) content.
				fileName,
				mimeType: storedContent.mimeType,
				fileSizeBytes: buffer.length,
			});

			return await this.agentFileRepository.save(agentFile);
		} catch (error) {
			if (storedBinaryDataId) {
				// Best-effort: a cleanup failure must not replace the error that
				// actually caused the upload to fail.
				await this.binaryDataService
					.deleteManyByBinaryDataId([storedBinaryDataId])
					.catch(() => {});
			}
			throw error;
		} finally {
			if (file.path) {
				await unlink(file.path).catch(() => {});
			}
		}
	}

	private toDto(file: AgentFile): AgentFileDto {
		return {
			id: file.id,
			agentId: file.agentId,
			fileName: file.fileName,
			mimeType: file.mimeType,
			fileSizeBytes: file.fileSizeBytes,
			createdAt: file.createdAt.toISOString(),
		};
	}

	private toWorkspaceFile(file: AgentFile): KnowledgeWorkspaceFile {
		return {
			id: file.id,
			fileName: file.fileName,
			mimeType: file.mimeType,
			fileSizeBytes: file.fileSizeBytes,
			relativePath: this.getWorkspaceRelativePath(file),
		};
	}

	/** @throws BadRequestError when the file count or summed `fileSizeBytes` exceeds the limits. */
	private assertWorkspaceWithinLimits(files: AgentFile[]) {
		if (files.length > MAX_WORKSPACE_FILES) {
			throw new BadRequestError(
				`Cannot materialize ${files.length} knowledge files at once (limit ${MAX_WORKSPACE_FILES}). Pass file references to narrow the operation.`,
			);
		}
		const totalBytes = files.reduce((total, file) => total + file.fileSizeBytes, 0);
		if (totalBytes > MAX_WORKSPACE_BYTES) {
			throw new BadRequestError(
				`Cannot materialize ${totalBytes} bytes of knowledge files at once (limit ${MAX_WORKSPACE_BYTES}). Pass file references to narrow the operation.`,
			);
		}
	}

	/**
	 * Keep the files whose id, workspace-relative path or file name is listed
	 * in `fileReferences`. `undefined` keeps everything; an empty list keeps
	 * nothing.
	 */
	private filterFilesForWorkspace(files: AgentFile[], fileReferences: string[] | undefined) {
		if (!fileReferences) return files;

		const requested = new Set(fileReferences);
		return files.filter(
			(file) =>
				requested.has(file.id) ||
				requested.has(this.getWorkspaceRelativePath(file)) ||
				requested.has(file.fileName),
		);
	}

	/**
	 * Name of the file inside a workspace: `<id><original extension>`, or
	 * `<id>.pdf.txt` for a PDF that was stored as extracted text.
	 */
	private getWorkspaceRelativePath(file: AgentFile) {
		const extension = path.extname(file.fileName).toLowerCase();
		if (extension === '.pdf' && file.mimeType === 'text/plain') {
			return `${file.id}.pdf.txt`;
		}
		return `${file.id}${path.extname(file.fileName)}`;
	}

	/**
	 * Decide what goes into the binary store: the upload as-is, or — for PDFs —
	 * the extracted text as `text/plain` under `<fileName>.txt`.
	 */
	private async prepareStoredContent(
		fileName: string,
		mimeType: string,
		buffer: Buffer,
	): Promise<StoredFileContent> {
		if (!this.isPdf(fileName, mimeType)) {
			return {
				buffer,
				mimeType: mimeType || 'application/octet-stream',
				fileName,
				// `undefined` (not the whole name) when the name has no extension.
				fileExtension: path.extname(fileName).slice(1) || undefined,
			};
		}

		const extractedText = await this.extractPdfText(fileName, buffer);
		const extractedBuffer = Buffer.from(extractedText, 'utf8');

		return {
			buffer: extractedBuffer,
			mimeType: 'text/plain',
			fileName: `${fileName}.txt`,
			fileExtension: 'txt',
		};
	}

	private isPdf(fileName: string, mimeType: string) {
		return path.extname(fileName).toLowerCase() === '.pdf' || mimeType === 'application/pdf';
	}

	/**
	 * Extract the text of a PDF with `pdf-parse` (loaded lazily).
	 *
	 * @throws BadRequestError when parsing fails or the PDF has no extractable text.
	 */
	private async extractPdfText(fileName: string, buffer: Buffer) {
		const { PDFParse } = await import('pdf-parse');
		const parser = new PDFParse({ data: buffer });
		try {
			const result = await parser.getText();
			const text = result.text.trim();
			if (!text) {
				throw new BadRequestError(
					`PDF "${fileName}" contains no extractable text and cannot be added to knowledge`,
				);
			}
			return text;
		} catch (error) {
			if (error instanceof BadRequestError) throw error;
			const message = error instanceof Error ? error.message : 'unknown error';
			throw new BadRequestError(`Failed to extract text from PDF "${fileName}": ${message}`);
		} finally {
			await parser.destroy();
		}
	}

	/** @throws BadRequestError when `value` is longer than `MAX_AGENT_FILE_METADATA_LENGTH`. */
	private validateMetadataLength(label: string, value: string) {
		if (value.length <= MAX_AGENT_FILE_METADATA_LENGTH) return;

		throw new BadRequestError(
			`${label} must be ${MAX_AGENT_FILE_METADATA_LENGTH} characters or less`,
		);
	}

	/** Roll back files stored earlier in a failed batch: rows first, then content. */
	private async cleanupStoredFiles(files: StoredAgentFile[]) {
		if (files.length === 0) return;

		await this.agentFileRepository.delete(files.map((file) => file.id));
		await this.binaryDataService.deleteManyByBinaryDataId(files.map((file) => file.binaryDataId));
	}

	private async cleanupUploadTempFiles(files: Express.Multer.File[]) {
		await Promise.all(files.map(async (file) => await this.cleanupUploadTempFile(file)));
	}

	/** Best-effort removal of one multer temp file; memory-stored uploads have no `path`. */
	private async cleanupUploadTempFile(file: Express.Multer.File) {
		if (!file.path) return;

		await unlink(file.path).catch(() => {});
	}
}
import {
	ALLOWED_AGENT_FILE_EXTENSIONS,
	MAX_AGENT_FILE_SIZE_BYTES,
	MAX_AGENT_FILES_PER_UPLOAD,
} from '@n8n/api-types';
import { Service } from '@n8n/di';
import type { RequestHandler } from 'express';
import multer from 'multer';
import { unlink } from 'node:fs/promises';
import path from 'node:path';

import { BadRequestError } from '@/errors/response-errors/bad-request.error';

// Typed as Set<string> so `has()` accepts an arbitrary extension string rather
// than only the literal members of ALLOWED_AGENT_FILE_EXTENSIONS.
const allowedAgentFileExtensions = new Set<string>(ALLOWED_AGENT_FILE_EXTENSIONS);

/**
 * True when the upload's original file name ends in one of the allowed
 * extensions. The comparison is case-insensitive; only the name is inspected,
 * not the file content or MIME type.
 */
export function isAllowedAgentFile(file: Pick<Express.Multer.File, 'originalname'>) {
	const extension = path.extname(file.originalname).toLowerCase();

	return allowedAgentFileExtensions.has(extension);
}

/**
 * Best-effort removal of multer's on-disk temp files. The upload handler hands
 * successful uploads to AgentKnowledgeService (which cleans up its own temp
 * files), but early bail-outs (knowledge base disabled, upload error, no files)
 * return before that, so the controller calls this to avoid leaking temp files.
 */
export async function cleanupUploadedTempFiles(files: Express.Multer.File[]) {
	await Promise.all(
		files.map(async (file) => {
			if (!file.path) return;
			await unlink(file.path).catch(() => {});
		}),
	);
}

/**
 * Multipart upload middleware for agent knowledge files. Files are written to
 * disk by multer, limited to `MAX_AGENT_FILE_SIZE_BYTES` each and filtered by
 * extension.
 */
@Service()
export class AgentUploadMiddleware {
	private readonly upload: multer.Multer = multer({
		storage: multer.diskStorage({}),
		limits: { fileSize: MAX_AGENT_FILE_SIZE_BYTES },
		fileFilter: (_req, file, done) => {
			if (!isAllowedAgentFile(file)) {
				done(new BadRequestError('Only CSV, PDF, Markdown, and TXT files are allowed'));
				return;
			}

			done(null, true);
		},
	});

	/**
	 * Build a request handler that accepts up to `MAX_AGENT_FILES_PER_UPLOAD`
	 * files from the multipart field `fieldName`.
	 *
	 * Upload errors are not passed to `next`: they are attached to the request
	 * as `fileUploadError` and the chain continues, leaving it to the route
	 * handler to turn them into a response.
	 */
	array(fieldName: string): RequestHandler {
		return (req, res, next) => {
			void this.upload.array(fieldName, MAX_AGENT_FILES_PER_UPLOAD)(req, res, (error) => {
				if (error) {
					(req as typeof req & { fileUploadError?: Error }).fileUploadError = error as Error;
				}
				next();
			});
		};
	}
}
'./adapters/agents-credential-provider'; import { AgentExecutionService, threadBelongsTo } from './agent-execution.service'; +import { AgentKnowledgeService } from './agent-knowledge.service'; import { messagesToDto } from './agent-message-mapper'; +import { AgentUploadMiddleware, cleanupUploadedTempFiles } from './agent-upload.middleware'; import { type FlushableResponse, initSseStream, @@ -66,6 +70,8 @@ import { AgentRepository } from './repositories/agent.repository'; import { draftChatMemoryResourceId } from './utils/agent-memory-scope'; import type { Agent } from './entities/agent.entity'; +const agentUploadMiddleware = Container.get(AgentUploadMiddleware); + /** * Builder side-effects: when the LLM streams arguments for `build_custom_tool` * we re-emit each delta as a `code-delta` event so the FE editor can render @@ -114,6 +120,7 @@ export class AgentsController { private readonly agentExecutionService: AgentExecutionService, private readonly chatIntegrationRegistry: ChatIntegrationRegistry, private readonly slackAppSetupService: SlackAppSetupService, + private readonly agentKnowledgeService: AgentKnowledgeService, ) {} private async validateIntegration(dto: unknown) { @@ -387,6 +394,77 @@ export class AgentsController { return await this.withRunnableState(agent, req.params.projectId, req.user); } + /** Knowledge base endpoints are gated behind the `knowledge-base` agents module. 
*/ + private assertKnowledgeBaseEnabled() { + if (!this.agentsService.isKnowledgeBaseModuleEnabled()) { + throw new NotFoundError('Agent knowledge base is not enabled'); + } + } + + @Get('/:agentId/files') + @ProjectScope('agent:read') + async listFiles( + _req: AuthenticatedRequest<{ projectId: string }>, + _res: Response, + @Param('projectId') projectId: string, + @Param('agentId') agentId: string, + ) { + this.assertKnowledgeBaseEnabled(); + return await this.agentKnowledgeService.listFiles(agentId, projectId); + } + + @Post('/:agentId/files', { + middlewares: [agentUploadMiddleware.array('files')], + }) + @ProjectScope('agent:update') + async uploadFiles( + req: AuthenticatedRequest<{ projectId: string }> & { + files?: Express.Multer.File[]; + fileUploadError?: Error; + }, + _res: Response, + @Param('projectId') projectId: string, + @Param('agentId') agentId: string, + ) { + const files = req.files ?? []; + try { + this.assertKnowledgeBaseEnabled(); + if (req.fileUploadError) { + const error = req.fileUploadError; + if (error instanceof multer.MulterError) { + throw new BadRequestError(`File upload error: ${error.message}`); + } + throw error; + } + + if (files.length === 0) { + throw new BadRequestError('No files uploaded'); + } + + return await this.agentKnowledgeService.uploadFiles(agentId, projectId, files); + } catch (error) { + // Multer wrote temp files to disk before this handler ran. The success + // path hands them to AgentKnowledgeService (which cleans up its own temp + // files), but these early bail-outs return first, so clean up here. 
+ await cleanupUploadedTempFiles(files); + throw error; + } + } + + @Delete('/:agentId/files/:fileId') + @ProjectScope('agent:update') + async deleteFile( + _req: AuthenticatedRequest<{ projectId: string }>, + _res: Response, + @Param('projectId') projectId: string, + @Param('agentId') agentId: string, + @Param('fileId') fileId: string, + ) { + this.assertKnowledgeBaseEnabled(); + await this.agentKnowledgeService.deleteFile(agentId, projectId, fileId); + return { success: true }; + } + @Delete('/:agentId') @ProjectScope('agent:delete') async delete( diff --git a/packages/cli/src/modules/agents/agents.module.ts b/packages/cli/src/modules/agents/agents.module.ts index 34be5318678..3a9d8a71212 100644 --- a/packages/cli/src/modules/agents/agents.module.ts +++ b/packages/cli/src/modules/agents/agents.module.ts @@ -86,6 +86,7 @@ export class AgentsModule implements ModuleInterface { async entities() { const { Agent } = await import('./entities/agent.entity'); + const { AgentFile } = await import('./entities/agent-file.entity'); const { AgentCheckpoint } = await import('./entities/agent-checkpoint.entity'); const { AgentResourceEntity } = await import('./entities/agent-resource.entity'); const { AgentThreadEntity } = await import('./entities/agent-thread.entity'); @@ -111,6 +112,7 @@ export class AgentsModule implements ModuleInterface { return [ Agent, + AgentFile, AgentCheckpoint, AgentResourceEntity, AgentThreadEntity, diff --git a/packages/cli/src/modules/agents/agents.service.ts b/packages/cli/src/modules/agents/agents.service.ts index 57b24c2760e..b77dd47e9ad 100644 --- a/packages/cli/src/modules/agents/agents.service.ts +++ b/packages/cli/src/modules/agents/agents.service.ts @@ -104,6 +104,8 @@ import { AgentRepository } from './repositories/agent.repository'; import { AgentSecureRuntime } from './runtime/agent-secure-runtime'; import { buildToolRegistry, type ToolRegistry } from './tool-registry'; import { ChatIntegrationService } from 
'./integrations/chat-integration.service'; +import { AgentKnowledgeCommandService } from './agent-knowledge-command.service'; +import { AgentKnowledgeService } from './agent-knowledge.service'; type AgentToolEntries = Agent['tools']; @@ -310,6 +312,8 @@ export class AgentsService { private readonly globalConfig: GlobalConfig, private readonly telemetry: Telemetry, private readonly chatIntegrationService: ChatIntegrationService, + private readonly agentKnowledgeService: AgentKnowledgeService, + private readonly agentKnowledgeCommandService: AgentKnowledgeCommandService, private readonly oauthService: OauthService, ) {} @@ -317,6 +321,15 @@ export class AgentsService { return this.agentsConfig.modules.includes('node-tools-searcher'); } + /** + * Whether the agent knowledge base sub-feature is enabled via + * `N8N_AGENTS_MODULES`. Gates the file endpoints and the `search_knowledge` + * runtime tool. Public so the controller can guard its file endpoints. + */ + isKnowledgeBaseModuleEnabled(): boolean { + return this.agentsConfig.modules.includes('knowledge-base'); + } + /** * Best-effort close of an agent instance. Delegates to `agent.close()` * which disposes the runtime and disconnects any attached MCP clients. @@ -753,6 +766,19 @@ export class AgentsService { return false; } + // Best-effort, non-transactional cleanup: deleteAllFilesForAgent removes + // binary blobs from the filesystem/object store, which a DB transaction + // can't roll back. The agent_files rows are removed via the agentId FK's + // ON DELETE CASCADE when the agent is removed below, so a failure here + // only risks orphaned blobs (logged) and must not block agent deletion. + try { + await this.agentKnowledgeService.deleteAllFilesForAgent(agentId); + } catch (error) { + this.logger.warn('Failed to delete knowledge files on agent delete', { + agentId, + error: error instanceof Error ? 
error.message : error, + }); + } await this.agentRepository.remove(agent); this.clearRuntimes(agentId); @@ -933,6 +959,30 @@ export class AgentsService { // per request don't bust system-prompt prompt caching. agent.tool(createGetEnvironmentTool()); + // search_knowledge is gated behind the `knowledge-base` agents module. + // It's also an optional capability: if wiring it up fails (e.g. dynamic + // import or service construction error), degrade gracefully and keep the + // rest of the runtime usable rather than failing the whole agent. The + // failure is logged so it stays observable. + if (this.isKnowledgeBaseModuleEnabled()) { + try { + const { createSearchKnowledgeTool } = await import('./tools/knowledge/tool'); + agent.tool( + createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: this.agentKnowledgeService, + commandService: this.agentKnowledgeCommandService, + }), + ); + } catch (toolError) { + this.logger.warn('Failed to inject search_knowledge tool', { + agentId, + error: toolError instanceof Error ? toolError.message : String(toolError), + }); + } + } + // Inject the rich_interaction tool only for platforms that can actually // render its suspend/resume HITL cards. Two gates: // - A registered integration in ChatIntegrationRegistry. 
The in-app diff --git a/packages/cli/src/modules/agents/entities/agent-file.entity.ts b/packages/cli/src/modules/agents/entities/agent-file.entity.ts new file mode 100644 index 00000000000..69263c82595 --- /dev/null +++ b/packages/cli/src/modules/agents/entities/agent-file.entity.ts @@ -0,0 +1,38 @@ +import { WithTimestampsAndStringId } from '@n8n/db'; +import { Column, Entity, Index, JoinColumn, ManyToOne, type Relation } from '@n8n/typeorm'; + +import { Agent } from './agent.entity'; + +@Entity({ name: 'agent_files' }) +@Index(['agentId', 'createdAt']) +export class AgentFile extends WithTimestampsAndStringId { + @ManyToOne(() => Agent, { onDelete: 'CASCADE' }) + @JoinColumn({ name: 'agentId' }) + agent: Relation; + + @Column({ type: 'varchar', length: 36 }) + agentId: string; + + /** + * Opaque BinaryDataService reference (mode-prefixed, e.g. + * `filesystem-v2:` or `s3:`). Not a DB FK: in filesystem/object- + * store modes there is no `binary_data` row to reference. + */ + @Column({ type: 'text' }) + binaryDataId: string; + + // fileName/mimeType/fileSizeBytes are intentionally denormalized rather than + // joined from binary_data: (1) binaryDataId is an opaque storage reference, + // not an FK, and binary_data only holds rows in DB storage mode; (2) we keep + // the original user-facing values, which differ from the stored binary for + // converted uploads (a PDF is stored as extracted `*.pdf.txt` text/plain with + // a different byte size). 
+ @Column({ type: 'varchar', length: 255 }) + fileName: string; + + @Column({ type: 'varchar', length: 255 }) + mimeType: string; + + @Column({ type: 'int' }) + fileSizeBytes: number; +} diff --git a/packages/cli/src/modules/agents/execution-recorder.ts b/packages/cli/src/modules/agents/execution-recorder.ts index 5e259895357..e9d69cdd30f 100644 --- a/packages/cli/src/modules/agents/execution-recorder.ts +++ b/packages/cli/src/modules/agents/execution-recorder.ts @@ -150,6 +150,8 @@ export interface RecordedToolCall { output: unknown; } +type PendingRecordedToolCall = RecordedToolCall & { toolCallId?: string }; + export type TimelineEvent = | { type: 'text'; content: string; timestamp: number; endTime?: number } | { @@ -220,7 +222,7 @@ export class ExecutionRecorder { private totalCost: number | null = null; - private toolCalls: RecordedToolCall[] = []; + private toolCalls: PendingRecordedToolCall[] = []; private timeline: TimelineEvent[] = []; @@ -297,7 +299,7 @@ export class ExecutionRecorder { finishReason: this.finishReason, usage: this.usage, totalCost: this.totalCost, - toolCalls: this.toolCalls, + toolCalls: this.toolCalls.map(({ toolCallId: _toolCallId, ...toolCall }) => toolCall), timeline: this.timeline, startTime: this.startTime, duration: Date.now() - this.startTime, @@ -332,7 +334,7 @@ export class ExecutionRecorder { private recordToolCall(toolCallId: string, name: string, input: unknown): void { this.flushTextBuffer(); - this.toolCalls.push({ name, input, output: undefined }); + this.toolCalls.push({ name, input, output: undefined, toolCallId }); const entry = this.registry.get(name); // Resolve both `$fromAI(...)` placeholders and simple `={{ $json.x }}` @@ -365,6 +367,19 @@ export class ExecutionRecorder { }); } + /** + * Find the still-open flat tool-call entry to attach a result to. 
Prefers + * an exact match on `toolCallId`; when the stream omits the id (empty + * string), falls back to the most recent open entry (`output === undefined`) + * with the same tool name. + */ + private findOpenToolCall(toolCallId: string, name: string): PendingRecordedToolCall | undefined { + if (toolCallId !== '') { + return this.toolCalls.find((tc) => tc.toolCallId === toolCallId && tc.output === undefined); + } + return [...this.toolCalls].reverse().find((tc) => tc.name === name && tc.output === undefined); + } + /** * Record a discrete `tool-result` chunk from the stream. Closes the * matching open timeline entry by `toolCallId` (preferred) or by name as @@ -383,9 +398,7 @@ export class ExecutionRecorder { ): void { const recordedOutput = isError ? normaliseToolErrorOutput(output) : output; - const pendingFlat = [...this.toolCalls] - .reverse() - .find((tc) => tc.name === name && tc.output === undefined); + const pendingFlat = this.findOpenToolCall(toolCallId, name); if (pendingFlat) { pendingFlat.output = recordedOutput; } else { diff --git a/packages/cli/src/modules/agents/repositories/agent-file.repository.ts b/packages/cli/src/modules/agents/repositories/agent-file.repository.ts new file mode 100644 index 00000000000..92de49de6d1 --- /dev/null +++ b/packages/cli/src/modules/agents/repositories/agent-file.repository.ts @@ -0,0 +1,22 @@ +import { Service } from '@n8n/di'; +import { DataSource, Repository } from '@n8n/typeorm'; + +import { AgentFile } from '../entities/agent-file.entity'; + +@Service() +export class AgentFileRepository extends Repository { + constructor(dataSource: DataSource) { + super(AgentFile, dataSource.manager); + } + + async findByAgentId(agentId: string): Promise { + return await this.find({ + where: { agentId }, + order: { createdAt: 'DESC' }, + }); + } + + async findByIdAndAgentId(fileId: string, agentId: string): Promise { + return await this.findOne({ where: { id: fileId, agentId } }); + } +} diff --git 
a/packages/cli/src/modules/agents/tools/__tests__/knowledge-tool.test.ts b/packages/cli/src/modules/agents/tools/__tests__/knowledge-tool.test.ts new file mode 100644 index 00000000000..48b49902705 --- /dev/null +++ b/packages/cli/src/modules/agents/tools/__tests__/knowledge-tool.test.ts @@ -0,0 +1,1396 @@ +import { AgentKnowledgeCommandService } from '../../agent-knowledge-command.service'; +import type { AgentKnowledgeService } from '../../agent-knowledge.service'; +import { createSearchKnowledgeTool } from '../knowledge/tool'; +import { searchKnowledgeInputSchema, searchKnowledgeParsingSchema } from '../knowledge/schemas'; +import type { JSONSchema7 } from 'json-schema'; + +jest.unmock('node:fs'); +jest.unmock('node:fs/promises'); + +const agentId = 'agent-1'; +const projectId = 'project-1'; + +describe('search_knowledge tool', () => { + let commandService: AgentKnowledgeCommandService; + let knowledgeService: jest.Mocked< + Pick< + AgentKnowledgeService, + 'listWorkspaceFiles' | 'materializeWorkspace' | 'resolveWorkspaceFiles' + > + >; + + function mockKnowledgeService() { + return knowledgeService as unknown as AgentKnowledgeService; + } + + beforeEach(() => { + commandService = new AgentKnowledgeCommandService(); + knowledgeService = { + listWorkspaceFiles: jest.fn(), + materializeWorkspace: jest.fn(), + // The real method does a metadata-only DB query. For tests we mirror + // whatever materializeWorkspace is configured to produce (using a + // throwaway dir) so the tool's cache key reflects the same file set. 
+ resolveWorkspaceFiles: jest.fn(async (resolveAgentId, resolveProjectId, fileReferences) => { + const { mkdtemp, rm } = await import('node:fs/promises'); + const { tmpdir } = await import('node:os'); + const nodePath = await import('node:path'); + const dir = await mkdtemp(nodePath.join(tmpdir(), 'resolve-')); + try { + return await knowledgeService.materializeWorkspace( + resolveAgentId, + resolveProjectId, + dir, + { + fileReferences, + }, + ); + } finally { + await rm(dir, { recursive: true, force: true }); + } + }), + }; + }); + + it('describes a top-level object input schema for providers', () => { + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + expect(tool.inputSchema).toMatchObject({ + type: 'object', + properties: expect.objectContaining({ + operation: expect.objectContaining({ type: 'string' }), + where: expect.any(Object), + select: expect.any(Object), + }), + }); + expect((tool.inputSchema as JSONSchema7).properties).not.toHaveProperty('request'); + expect(tool.inputSchema).not.toHaveProperty('oneOf'); + const properties = (tool.inputSchema as JSONSchema7).properties as Record< + string, + { default?: unknown; description?: string } + >; + expect(properties.output_mode.default).toBe('files_with_matches'); + expect(properties.head_limit.default).toBe(250); + expect(properties.match_mode.default).toBe('any'); + expect(String(properties.queries.description)).toContain('multiple literal search terms'); + expect((tool.inputSchema as JSONSchema7).properties).not.toHaveProperty('mode'); + expect((tool.inputSchema as JSONSchema7).properties).not.toHaveProperty('maxResults'); + expect(String(properties.file.description)).toContain( + 'cite the returned fileName and lineRange instead', + ); + }); + + it('keeps the provider JSON schema property names in sync with the Zod parsing schema', () => { + // The flat JSON schema (steered at the LLM) and the strict Zod discriminated 
+ // union (used for validation) are maintained by hand. Guard against drift by + // asserting they expose the exact same set of field names. + const zodKeys = new Set( + searchKnowledgeParsingSchema.options.flatMap((option) => Object.keys(option.shape)), + ); + const jsonKeys = new Set(Object.keys(searchKnowledgeInputSchema.properties ?? {})); + + expect(jsonKeys).toEqual(zodKeys); + }); + + it('lists uploaded knowledge files', async () => { + knowledgeService.listWorkspaceFiles.mockResolvedValue([ + { + id: 'file-1', + fileName: 'notes.txt', + mimeType: 'text/plain', + fileSizeBytes: 12, + relativePath: 'file-1-notes.txt', + }, + ]); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect(tool.handler?.({ operation: 'list' }, {} as never)).resolves.toMatchObject({ + operation: 'list', + files: [ + { + id: 'file-1', + relativePath: 'file-1-notes.txt', + }, + ], + }); + expect(knowledgeService.materializeWorkspace).not.toHaveBeenCalled(); + }); + + it('returns a tool error when workspace materialization fails', async () => { + knowledgeService.materializeWorkspace.mockRejectedValue(new Error('storage unavailable')); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.({ operation: 'search', query: 'needle' }, {} as never), + ).resolves.toMatchObject({ + operation: 'search', + files: [], + error: 'storage unavailable', + }); + }); + + it('searches materialized text files', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile(path.join(workspaceRoot, 'file-1-notes.txt'), 'hello\nneedle\n'); + return [ + { + id: 'file-1', + fileName: 'notes.txt', + mimeType: 'text/plain', 
+ fileSizeBytes: 13, + relativePath: 'file-1-notes.txt', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + const result = await tool.handler?.({ operation: 'search', query: 'needle' }, {} as never); + + expect(result).toMatchObject({ + operation: 'search', + result: { + command: 'git_grep', + exitCode: 0, + }, + search: { + mode: 'files_with_matches', + files: [expect.objectContaining({ relativePath: 'file-1-notes.txt' })], + matches: [], + }, + }); + const stdout = (result as { result: { stdout: string } }).result.stdout; + expect(stdout).toContain('notes.txt'); + expect(stdout).not.toContain('needle'); + }); + + it('accepts a singular file reference for search', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile(path.join(workspaceRoot, 'file-1-notes.txt'), 'needle\n'); + await writeFile(path.join(workspaceRoot, 'file-2-notes.txt'), 'needle\n'); + return [ + { + id: 'file-1', + fileName: 'notes.txt', + mimeType: 'text/plain', + fileSizeBytes: 7, + relativePath: 'file-1-notes.txt', + }, + { + id: 'file-2', + fileName: 'other-notes.txt', + mimeType: 'text/plain', + fileSizeBytes: 7, + relativePath: 'file-2-notes.txt', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + const result = await tool.handler?.( + { operation: 'search', query: 'needle', file: 'notes.txt' }, + {} as never, + ); + + expect(result).toMatchObject({ + operation: 'search', + search: { + files: [expect.objectContaining({ fileName: 'notes.txt' })], + }, + }); + expect((result as { search: { files: unknown[] } }).search.files).toHaveLength(1); + 
expect(knowledgeService.materializeWorkspace).toHaveBeenCalledWith( + agentId, + projectId, + expect.any(String), + { fileReferences: ['notes.txt'] }, + ); + }); + + it('rejects search file references when the file alias would exceed the cap', async () => { + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'search', + query: 'needle', + file: 'extra.md', + files: Array.from({ length: 10 }, (_, index) => `file-${index + 1}.md`), + }, + {} as never, + ), + ).resolves.toMatchObject({ + operation: 'search', + files: [], + error: expect.stringContaining('Search can target at most 10 files.'), + }); + expect(knowledgeService.materializeWorkspace).not.toHaveBeenCalled(); + }); + + it('limits content results with head_limit', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + const repeatedNeedles = Array.from( + { length: 12 }, + (_, index) => `needle ${index + 1}`, + ).join('\n'); + await writeFile(path.join(workspaceRoot, 'file-1.md'), repeatedNeedles); + await writeFile(path.join(workspaceRoot, 'file-2.md'), repeatedNeedles); + return [ + { + id: 'file-1', + fileName: 'book-one.md', + mimeType: 'text/markdown', + fileSizeBytes: 120, + relativePath: 'file-1.md', + }, + { + id: 'file-2', + fileName: 'book-two.md', + mimeType: 'text/markdown', + fileSizeBytes: 120, + relativePath: 'file-2.md', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + const result = await tool.handler?.( + { + operation: 'search', + query: 'needle', + output_mode: 'content', + files: ['file-1', 'file-2'], + head_limit: 20, + }, + {} as never, + ); + const stdout = (result as { 
result: { stdout: string } }).result.stdout; + + expect(result).toMatchObject({ + operation: 'search', + result: { + truncated: true, + }, + search: { + mode: 'content', + matches: expect.any(Array), + appliedLimit: 20, + nextOffset: 20, + hint: expect.stringContaining('Continue with offset=20 and head_limit=20'), + }, + }); + expect(stdout).toContain('book-one.md:12:needle 12'); + expect(stdout).toContain('book-two.md:8:needle 8'); + expect(stdout).not.toContain('book-two.md:9:needle 9'); + expect(stdout).toContain('Continue with offset=20 and head_limit=20'); + }); + it('returns content matches only when requested', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile(path.join(workspaceRoot, 'file-1.md'), 'first\nneedle\n'); + return [ + { + id: 'file-1', + fileName: 'book-one.md', + mimeType: 'text/markdown', + fileSizeBytes: 20, + relativePath: 'file-1.md', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.({ operation: 'search', query: 'needle', output_mode: 'content' }, {} as never), + ).resolves.toMatchObject({ + operation: 'search', + search: { + mode: 'content', + totalMatchingFiles: 1, + totalMatchingLines: 1, + matches: [ + expect.objectContaining({ + fileId: 'file-1', + lineNumber: 2, + text: 'needle', + readRange: { start: 1, end: 8 }, + }), + ], + }, + }); + }); + + it('defaults broad searches to matching files without line dumps', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile( + path.join(workspaceRoot, 'file-1.md'), + Array.from({ 
length: 30 }, (_, index) => `needle ${index + 1}`).join('\n'), + ); + await writeFile( + path.join(workspaceRoot, 'file-2.md'), + Array.from({ length: 5 }, (_, index) => `needle ${index + 1}`).join('\n'), + ); + return [ + { + id: 'file-1', + fileName: 'book-one.md', + mimeType: 'text/markdown', + fileSizeBytes: 300, + relativePath: 'file-1.md', + }, + { + id: 'file-2', + fileName: 'book-two.md', + mimeType: 'text/markdown', + fileSizeBytes: 50, + relativePath: 'file-2.md', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + const result = await tool.handler?.({ operation: 'search', query: 'needle' }, {} as never); + + expect(result).toMatchObject({ + operation: 'search', + result: { + truncated: false, + stdout: expect.stringContaining('book-one.md'), + }, + search: { + mode: 'files_with_matches', + totalMatchingFiles: 2, + totalMatchingLines: 35, + files: expect.arrayContaining([ + expect.objectContaining({ + id: 'file-1', + matchCount: 30, + }), + ]), + matches: [], + hint: expect.stringContaining('Use read'), + }, + }); + expect((result as { result: { stdout: string } }).result.stdout).not.toContain('needle'); + expect((result as { result: { stdout: string } }).result.stdout).not.toContain('file-1.md'); + }); + it('returns per-file counts', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile(path.join(workspaceRoot, 'file-1.md'), 'needle\nneedle\n'); + await writeFile(path.join(workspaceRoot, 'file-2.md'), 'needle\n'); + return [ + { + id: 'file-1', + fileName: 'book-one.md', + mimeType: 'text/markdown', + fileSizeBytes: 20, + relativePath: 'file-1.md', + }, + { + id: 'file-2', + fileName: 'book-two.md', + mimeType: 'text/markdown', + fileSizeBytes: 10, + relativePath: 
'file-2.md', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.({ operation: 'search', query: 'needle', output_mode: 'count' }, {} as never), + ).resolves.toMatchObject({ + search: { + mode: 'count', + totalMatchingFiles: 2, + totalMatchingLines: 3, + files: [ + expect.objectContaining({ id: 'file-1', matchCount: 2 }), + expect.objectContaining({ id: 'file-2', matchCount: 1 }), + ], + }, + }); + }); + it('uses extended regex for non-fixed search patterns', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile(path.join(workspaceRoot, 'file-1.md'), 'freedom\nnecessity\n'); + return [ + { + id: 'file-1', + fileName: 'book-one.md', + mimeType: 'text/markdown', + fileSizeBytes: 20, + relativePath: 'file-1.md', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'search', + query: 'freedom|necessity', + output_mode: 'content', + fixedStrings: false, + }, + {} as never, + ), + ).resolves.toMatchObject({ + search: { + totalMatchingLines: 2, + matches: [ + expect.objectContaining({ text: 'freedom' }), + expect.objectContaining({ text: 'necessity' }), + ], + }, + }); + }); + + it('trims very long content match lines while preserving read ranges', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile(path.join(workspaceRoot, 'file-1.md'), `needle ${'x'.repeat(700)}\n`); + return [ + { + id: 'file-1', + fileName: 
'book-one.md', + mimeType: 'text/markdown', + fileSizeBytes: 720, + relativePath: 'file-1.md', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.({ operation: 'search', query: 'needle', output_mode: 'content' }, {} as never), + ).resolves.toMatchObject({ + search: { + matches: [ + expect.objectContaining({ + lineNumber: 1, + readRange: { start: 1, end: 7 }, + text: expect.stringContaining('[line truncated; use read for full text]'), + truncated: true, + }), + ], + }, + result: { + stdout: expect.stringContaining('[line truncated; use read for full text]'), + }, + }); + }); + + it('supports multi-query any search without hand-written regex', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile(path.join(workspaceRoot, 'file-1.md'), 'necessity\nfreedom\n'); + return [ + { + id: 'file-1', + fileName: 'book-one.md', + mimeType: 'text/markdown', + fileSizeBytes: 20, + relativePath: 'file-1.md', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'search', + queries: ['necessity', 'freedom'], + output_mode: 'content', + match_mode: 'any', + }, + {} as never, + ), + ).resolves.toMatchObject({ + search: { + query: 'necessity', + queries: ['necessity', 'freedom'], + matchMode: 'any', + totalMatchingFiles: 1, + totalMatchingLines: 2, + matches: [ + expect.objectContaining({ text: 'necessity' }), + expect.objectContaining({ text: 'freedom' }), + ], + }, + }); + }); + + it('filters multi-query matches using full line text before display truncation', async () => { + 
knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile( + path.join(workspaceRoot, 'file-1.md'), + `needle ${'x'.repeat(700)} tailterm\n`, + ); + return [ + { + id: 'file-1', + fileName: 'book-one.md', + mimeType: 'text/markdown', + fileSizeBytes: 720, + relativePath: 'file-1.md', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'search', + queries: ['needle', 'tailterm'], + output_mode: 'content', + match_mode: 'all_on_same_line', + }, + {} as never, + ), + ).resolves.toMatchObject({ + search: { + totalMatchingLines: 1, + matches: [ + expect.objectContaining({ + lineNumber: 1, + text: expect.stringContaining('[line truncated; use read for full text]'), + truncated: true, + }), + ], + }, + }); + }); + + it('supports multi-query all_within_lines search without hand-written regex', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile( + path.join(workspaceRoot, 'file-1.md'), + ['necessity governs history', 'bridge line', 'free will is constrained'].join('\n'), + ); + await writeFile( + path.join(workspaceRoot, 'file-2.md'), + [ + 'necessity appears here', + 'many lines later', + 'still later', + 'more distance', + 'free will appears', + ].join('\n'), + ); + return [ + { + id: 'file-1', + fileName: 'book-one.md', + mimeType: 'text/markdown', + fileSizeBytes: 80, + relativePath: 'file-1.md', + }, + { + id: 'file-2', + fileName: 'book-two.md', + mimeType: 'text/markdown', + fileSizeBytes: 80, + relativePath: 'file-2.md', + }, + ]; + }, + ); + const tool = 
createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'search', + queries: ['necessity', 'free will'], + output_mode: 'content', + match_mode: 'all_within_lines', + }, + {} as never, + ), + ).resolves.toMatchObject({ + search: { + matchMode: 'all_within_lines', + totalMatchingFiles: 1, + totalMatchingLines: 2, + matches: [ + expect.objectContaining({ fileId: 'file-1', lineNumber: 1 }), + expect.objectContaining({ fileId: 'file-1', lineNumber: 3 }), + ], + }, + }); + }); + it('rejects public command operations without materializing files', async () => { + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'command', + request: { command: 'cat', file: 'file-1' }, + }, + {} as never, + ), + ).resolves.toMatchObject({ + files: [], + error: expect.stringContaining('Invalid discriminator value'), + }); + expect(knowledgeService.materializeWorkspace).not.toHaveBeenCalled(); + }); + + it('reads extracted PDF text when materialized as text', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile(path.join(workspaceRoot, 'file-1.pdf.txt'), 'extracted PDF text\n'); + return [ + { + id: 'file-1', + fileName: 'document.pdf', + mimeType: 'text/plain', + fileSizeBytes: 200, + relativePath: 'file-1.pdf.txt', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.({ operation: 'read', file: 'file-1' }, {} as never), + ).resolves.toMatchObject({ + operation: 'read', + files: [ + expect.objectContaining({ + fileName: 
'document.pdf', + relativePath: 'file-1.pdf.txt', + }), + ], + result: { + command: 'cat', + stdout: 'extracted PDF text\n', + citation: { + fileName: 'document.pdf', + instruction: expect.stringContaining('Do not cite file ids'), + }, + }, + }); + expect(knowledgeService.materializeWorkspace).toHaveBeenCalledWith( + agentId, + projectId, + expect.any(String), + { fileReferences: ['file-1'] }, + ); + }); + + it('reads materialized files by display file name', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile(path.join(workspaceRoot, 'file-1.md'), 'book text\n'); + return [ + { + id: 'file-1', + fileName: 'Moby Dick.md', + mimeType: 'text/markdown', + fileSizeBytes: 10, + relativePath: 'file-1.md', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.({ operation: 'read', file: 'Moby Dick.md' }, {} as never), + ).resolves.toMatchObject({ + operation: 'read', + result: { + command: 'cat', + stdout: 'book text\n', + citation: { + fileName: 'Moby Dick.md', + }, + }, + }); + expect(knowledgeService.materializeWorkspace).toHaveBeenCalledWith( + agentId, + projectId, + expect.any(String), + { fileReferences: ['Moby Dick.md'] }, + ); + }); + + it('queries CSV rows with selected columns in one operation', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile( + path.join(workspaceRoot, 'file-1.csv'), + [ + 'country,year,population,co2,co2_per_capita', + 'Germany,2022,84086227,667.843,7.942', + 'France,2022,66277412,295.304,4.456', + 
'Germany,2021,83196078,677.998,8.149', + ].join('\n'), + ); + return [ + { + id: 'file-1', + fileName: 'owid-co2-data.csv', + mimeType: 'text/csv', + fileSizeBytes: 200, + relativePath: 'file-1.csv', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'csv_query', + file: 'file-1', + where: [ + { column: 'country', op: 'in', value: ['Germany', 'France'] }, + { column: 'year', op: 'eq', value: '2022' }, + ], + select: ['country', 'year', 'population', 'co2', 'co2_per_capita'], + limit: 10, + }, + {} as never, + ), + ).resolves.toMatchObject({ + operation: 'csv_query', + csv: { + fileName: 'owid-co2-data.csv', + relativePath: 'file-1.csv', + columns: ['country', 'year', 'population', 'co2', 'co2_per_capita'], + rowNumbers: [2, 3], + rows: [ + ['Germany', '2022', '84086227', '667.843', '7.942'], + ['France', '2022', '66277412', '295.304', '4.456'], + ], + rowCount: 2, + truncated: false, + }, + }); + }); + it('queries CSV columns with quoted commas in their header names', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile( + path.join(workspaceRoot, 'file-1.csv'), + ['"country,name",year', '"Germany,Federal Republic",2022'].join('\n'), + ); + return [ + { + id: 'file-1', + fileName: 'quoted.csv', + mimeType: 'text/csv', + fileSizeBytes: 53, + relativePath: 'file-1.csv', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'csv_query', + file: 'file-1', + select: ['country,name', 'year'], + }, + {} as never, + ), + ).resolves.toMatchObject({ + operation: 'csv_query', + csv: { 
+ columns: ['country,name', 'year'], + rows: [['Germany,Federal Republic', '2022']], + }, + }); + }); + it('profiles CSV schemas with sample rows, inferred types, and disambiguating columns', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile( + path.join(workspaceRoot, 'file-1.csv'), + [ + 'Source,Year,Mean,Reviewed,Date,Notes', + 'GCAG,1880,-0.12,true,1880-01-01,', + 'GCAG,1881,-0.09,false,1881-01-01,estimated', + 'GISTEMP,1880,-0.2,true,1880-01-01,', + ].join('\n'), + ); + return [ + { + id: 'file-1', + fileName: 'temperature.csv', + mimeType: 'text/csv', + fileSizeBytes: 180, + relativePath: 'file-1.csv', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.({ operation: 'csv_profile', file: 'file-1', sampleSize: 2 }, {} as never), + ).resolves.toMatchObject({ + operation: 'csv_profile', + csvProfile: { + fileName: 'temperature.csv', + columns: ['Source', 'Year', 'Mean', 'Reviewed', 'Date', 'Notes'], + rowCount: 3, + sampleRows: [ + { + Source: 'GCAG', + Year: '1880', + Mean: '-0.12', + }, + { + Source: 'GCAG', + Year: '1881', + Mean: '-0.09', + }, + ], + columnProfiles: expect.arrayContaining([ + expect.objectContaining({ + name: 'Year', + inferredType: 'integer', + emptyCount: 0, + distinctCount: 2, + }), + expect.objectContaining({ + name: 'Mean', + inferredType: 'number', + }), + expect.objectContaining({ + name: 'Reviewed', + inferredType: 'boolean', + }), + expect.objectContaining({ + name: 'Date', + inferredType: 'date', + }), + expect.objectContaining({ + name: 'Notes', + inferredType: 'string', + emptyCount: 2, + }), + ]), + likelyDisambiguatingColumns: expect.arrayContaining(['Year', 'Source']), + }, + }); + }); + + it('returns 
CSV row metadata and ambiguity guidance for repeated filters', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile( + path.join(workspaceRoot, 'file-1.csv'), + [ + 'Source,Year,Mean', + 'GCAG,1880,-0.12', + 'GCAG,1881,-0.09', + 'GCAG,1882,-0.1', + 'GISTEMP,1880,-0.2', + ].join('\n'), + ); + return [ + { + id: 'file-1', + fileName: 'temperature.csv', + mimeType: 'text/csv', + fileSizeBytes: 120, + relativePath: 'file-1.csv', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'csv_query', + file: 'file-1', + where: [{ column: 'Source', op: 'eq', value: 'GCAG' }], + select: ['Mean'], + limit: 1, + }, + {} as never, + ), + ).resolves.toMatchObject({ + operation: 'csv_query', + csv: { + columns: ['Mean'], + rows: [['-0.12']], + rowNumbers: [2], + records: [ + { + rowNumber: 2, + fileLineNumber: 2, + values: { Mean: '-0.12' }, + }, + ], + rowCount: 3, + truncated: true, + ambiguity: { + matchedRows: 3, + suggestedColumns: expect.arrayContaining(['Year']), + sampleDistinctValues: { + Year: ['1880', '1881', '1882'], + }, + }, + }, + }); + }); + + it('fetches exact CSV rows by row number with file-line metadata', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile( + path.join(workspaceRoot, 'file-1.csv'), + ['Source,Year,Mean', 'GCAG,1880,-0.12', 'GCAG,1881,-0.09'].join('\n'), + ); + return [ + { + id: 'file-1', + fileName: 'temperature.csv', + mimeType: 'text/csv', + fileSizeBytes: 80, + relativePath: 'file-1.csv', + }, + ]; + }, + ); + const 
tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'csv_query', + file: 'file-1', + rowNumber: 3, + }, + {} as never, + ), + ).resolves.toMatchObject({ + operation: 'csv_query', + csv: { + columns: ['Source', 'Year', 'Mean'], + rowNumbers: [3], + rows: [['GCAG', '1881', '-0.09']], + records: [ + { + rowNumber: 3, + fileLineNumber: 3, + values: { + Source: 'GCAG', + Year: '1881', + Mean: '-0.09', + }, + }, + ], + rowCount: 1, + truncated: false, + }, + }); + }); + + it('returns distinct CSV values for filtered rows', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile( + path.join(workspaceRoot, 'file-1.csv'), + [ + 'Source,Year,Mean', + 'GCAG,1880,-0.12', + 'GCAG,1881,-0.09', + 'GCAG,1881,-0.08', + 'GISTEMP,1880,-0.2', + ].join('\n'), + ); + return [ + { + id: 'file-1', + fileName: 'temperature.csv', + mimeType: 'text/csv', + fileSizeBytes: 120, + relativePath: 'file-1.csv', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'csv_distinct', + file: 'file-1', + column: 'Year', + where: [{ column: 'Source', op: 'eq', value: 'GCAG' }], + }, + {} as never, + ), + ).resolves.toMatchObject({ + operation: 'csv_distinct', + csvDistinct: { + column: 'Year', + values: ['1880', '1881'], + distinctCount: 2, + truncated: false, + }, + }); + }); + + it('computes CSV aggregates from all streamed matches', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await 
import('node:path'); + await writeFile( + path.join(workspaceRoot, 'file-1.csv'), + [ + 'Source,Year,Mean', + 'GCAG,1880,-0.12', + 'GCAG,1881,-0.09', + 'GISTEMP,1880,-0.2', + 'GISTEMP,1881,n/a', + 'GISTEMP,1882, ', + ].join('\n'), + ); + return [ + { + id: 'file-1', + fileName: 'temperature.csv', + mimeType: 'text/csv', + fileSizeBytes: 120, + relativePath: 'file-1.csv', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'csv_aggregate', + file: 'file-1', + metric: 'Mean', + functions: ['count', 'min', 'max', 'sum', 'avg'], + groupBy: ['Source'], + orderBy: { column: 'Source', direction: 'asc' }, + }, + {} as never, + ), + ).resolves.toMatchObject({ + operation: 'csv_aggregate', + csvAggregate: { + rowCount: 5, + functions: ['count', 'min', 'max', 'sum', 'avg'], + metrics: ['Mean'], + groupBy: ['Source'], + results: [ + { + Source: 'GCAG', + count: 2, + min_Mean: -0.12, + max_Mean: -0.09, + sum_Mean: -0.21, + avg_Mean: -0.105, + }, + { + Source: 'GISTEMP', + count: 3, + min_Mean: -0.2, + max_Mean: -0.2, + sum_Mean: -0.2, + avg_Mean: -0.2, + }, + ], + skippedNonNumeric: { + Mean: 2, + }, + }, + }); + }); + + it('suggests close CSV column names for bad column requests', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + await writeFile(path.join(workspaceRoot, 'file-1.csv'), 'country,year\nGermany,2022\n'); + return [ + { + id: 'file-1', + fileName: 'owid-co2-data.csv', + mimeType: 'text/csv', + fileSizeBytes: 27, + relativePath: 'file-1.csv', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + 
operation: 'csv_query', + file: 'file-1', + select: ['coutry'], + }, + {} as never, + ), + ).resolves.toMatchObject({ + operation: 'csv_query', + error: expect.stringContaining('Did you mean "country"?'), + }); + }); + it('continues streaming CSV queries past ten thousand rows', async () => { + knowledgeService.materializeWorkspace.mockImplementation( + async (_agentId, _projectId, workspaceRoot) => { + const { writeFile } = await import('node:fs/promises'); + const path = await import('node:path'); + const rows = ['country,year']; + for (let index = 0; index < 10_000; index++) { + rows.push(`Other ${index},2022`); + } + rows.push('Germany,2022'); + await writeFile(path.join(workspaceRoot, 'file-1.csv'), rows.join('\n')); + return [ + { + id: 'file-1', + fileName: 'large.csv', + mimeType: 'text/csv', + fileSizeBytes: 50 * 1024 * 1024, + relativePath: 'file-1.csv', + }, + ]; + }, + ); + const tool = createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService: mockKnowledgeService(), + commandService, + }); + + await expect( + tool.handler?.( + { + operation: 'csv_query', + file: 'file-1', + where: [{ column: 'country', op: 'eq', value: 'Germany' }], + select: ['country', 'year'], + }, + {} as never, + ), + ).resolves.toMatchObject({ + operation: 'csv_query', + csv: { + fileName: 'large.csv', + rows: [['Germany', '2022']], + rowNumbers: [10002], + rowCount: 1, + truncated: false, + }, + }); + }); +}); diff --git a/packages/cli/src/modules/agents/tools/knowledge/csv-helpers.ts b/packages/cli/src/modules/agents/tools/knowledge/csv-helpers.ts new file mode 100644 index 00000000000..c73c1bd5439 --- /dev/null +++ b/packages/cli/src/modules/agents/tools/knowledge/csv-helpers.ts @@ -0,0 +1,372 @@ +import { createReadStream } from 'node:fs'; +import path from 'node:path'; + +import { distance } from 'fastest-levenshtein'; + +import { resolveFileReference, type WorkspaceFiles } from './file-references'; +import type { CsvAggregateInput, CsvFilter } from 
'./schemas';

/** Maximum number of example values kept per column when sampling distinct values. */
export const CSV_SAMPLE_VALUE_LIMIT = 5;
/** Default cap on distinct values tracked per column while profiling. */
export const CSV_PROFILE_DISTINCT_LIMIT = 100;
/** Cap on distinct values tracked by a single distinct-values operation. */
export const CSV_DISTINCT_TRACK_LIMIT = 10_000;
/** Cap distinct aggregate groups to bound memory on high-cardinality group-by. */
export const CSV_MAX_AGGREGATE_GROUPS = 50_000;
/** Wall-clock safety net for a single CSV operation (files are upload-size-capped). */
const CSV_OPERATION_TIMEOUT_MS = 15_000;

/** A file counts as CSV when its MIME type is `text/csv` or its relative path ends in `.csv`. */
function isCsvFile(file: WorkspaceFiles[number]) {
	return file.mimeType === 'text/csv' || file.relativePath.toLowerCase().endsWith('.csv');
}

/**
 * Resolves a file reference to a workspace file and checks that it is a CSV.
 *
 * @throws If the reference does not resolve to exactly one file (the resolver's error
 *   message is re-thrown) or the resolved file is not a CSV.
 */
export function resolveCsvFile(files: WorkspaceFiles, reference: string) {
	const resolvedFile = resolveFileReference(files, reference);
	if (resolvedFile.status !== 'found') {
		throw new Error(resolvedFile.error);
	}
	const { file } = resolvedFile;
	if (!isCsvFile(file)) {
		throw new Error(`File "${file.fileName}" is not queryable as CSV.`);
	}
	return file;
}

/**
 * Streams a workspace CSV file record by record through `csv-parse`, so callers never
 * hold the whole file in memory.
 *
 * @param workspaceRoot - Root directory that `file.relativePath` is resolved against.
 * @param file - Workspace file entry to read.
 * @param handlers - `onHeaders` receives the parsed header row; `onRecord` receives each
 *   record together with the parser's `info.lines` value as `fileLineNumber`.
 * @throws If reading or parsing fails, `onRecord` throws, or the operation runs longer
 *   than `CSV_OPERATION_TIMEOUT_MS`.
 */
export async function streamCsvRecords(
	workspaceRoot: string,
	file: WorkspaceFiles[number],
	handlers: {
		onHeaders?: (headers: string[]) => void;
		onRecord: (record: { record: Record<string, unknown>; fileLineNumber: number }) => void;
	},
) {
	const filePath = path.join(workspaceRoot, file.relativePath);
	const { parse } = await import('csv-parse');
	const readStream = createReadStream(filePath);
	const parser = readStream.pipe(
		parse({
			columns: (parsedHeaders: string[]) => {
				handlers.onHeaders?.(parsedHeaders);
				return parsedHeaders;
			},
			skip_empty_lines: true,
			bom: true,
			info: true,
			relax_column_count: true,
		}),
	);
	// `pipe()` does not forward source errors to its destination. Without this listener a
	// read failure would surface as an unhandled 'error' event instead of rejecting the
	// loop below.
	readStream.on('error', (error) => {
		parser.destroy(error);
	});
	// Safety net: destroying the parser rejects the async iterator below so a
	// pathologically slow file can't keep the operation running indefinitely.
	const timeout = setTimeout(() => {
		parser.destroy(new Error('CSV operation exceeded the time limit'));
		readStream.destroy();
	}, CSV_OPERATION_TIMEOUT_MS);
	try {
		for await (const { record, info } of parser as AsyncIterable<{
			record: Record<string, unknown>;
			info: { lines: number };
		}>) {
			handlers.onRecord({ record, fileLineNumber: info.lines });
		}
	} finally {
		// Always release the timer and both streams, on success and on failure.
		clearTimeout(timeout);
		readStream.destroy();
		parser.destroy();
	}
}

/**
 * Ensures every requested column exists in the header row.
 *
 * @throws On the first missing column, with the available columns and close matches listed.
 */
export function validateCsvColumns(headers: string[], fileName: string, columns: string[]) {
	for (const column of columns) {
		if (!headers.includes(column)) {
			throw new Error(formatMissingCsvColumnError(fileName, column, headers));
		}
	}
}

/**
 * Returns true when the record satisfies every filter (an empty filter list matches all).
 * `eq` is exact string equality, `contains` is a substring test, and any other operator is
 * treated as membership of the cell value in `filter.value`.
 */
export function matchesFilters(record: Record<string, unknown>, filters: CsvFilter[]) {
	return filters.every((filter) => {
		const value = normaliseCsvValue(record[filter.column]);
		if (filter.op === 'eq') return value === filter.value;
		if (filter.op === 'contains') return value.includes(filter.value);
		return filter.value.includes(value);
	});
}

/** Converts a parsed cell to a string; `null` and `undefined` become the empty string. */
export function normaliseCsvValue(value: unknown) {
	if (value === null || value === undefined) return '';
	return String(value);
}

/** Projects a record onto `columns`, normalising every cell to a string. */
export function toCsvRecordValues(record: Record<string, unknown>, columns: string[]) {
	return Object.fromEntries(columns.map((column) => [column, normaliseCsvValue(record[column])]));
}

/** Builds the "column not found" message, including up to three close header matches. */
function formatMissingCsvColumnError(fileName: string, requestedColumn: string, headers: string[]) {
	const suggestions = getClosestColumnMatches(requestedColumn, headers);
	const didYouMean =
		suggestions.length > 0
			? ` Did you mean ${suggestions.map((value) => `"${value}"`).join(', ')}?`
			: '';
	return `CSV column "${requestedColumn}" not found in "${fileName}". Available columns: ${headers.join(', ')}.${didYouMean} Run csv_profile if you are uncertain about the schema.`;
}

/**
 * Returns up to three headers that contain the requested name (case-insensitively) or are
 * within an edit distance of 3 from it, closest first and then alphabetically.
 */
function getClosestColumnMatches(requestedColumn: string, headers: string[]) {
	const requested = requestedColumn.toLowerCase();
	return headers
		.map((header) => ({ header, distance: distance(requested, header.toLowerCase()) }))
		.filter(({ header, distance: editDistance }) => {
			return header.toLowerCase().includes(requested) || editDistance <= 3;
		})
		.sort(
			(left, right) => left.distance - right.distance || left.header.localeCompare(right.header),
		)
		.slice(0, 3)
		.map(({ header }) => header);
}

export type CsvDistinctTracker = ReturnType<typeof createCsvDistinctTracker>;

/**
 * Creates a tracker that collects at most `limit` distinct values for each of `columns`.
 * Missing cells are recorded as the empty string.
 */
export function createCsvDistinctTracker(columns: string[], limit: number) {
	const values = new Map(columns.map((column) => [column, new Set<string>()]));
	return {
		/** Records this row's value for every tracked column that is still below the limit. */
		add(record: Record<string, unknown>) {
			for (const [column, distinctValues] of values) {
				if (distinctValues.size < limit) distinctValues.add(normaliseCsvValue(record[column]));
			}
		},
		/** Returns the collected values keyed by column; columns with no values are omitted. */
		toOutput() {
			return Object.fromEntries(
				Array.from(values.entries()).flatMap(([column, distinctValues]) =>
					distinctValues.size > 0 ? [[column, Array.from(distinctValues)]] : [],
				),
			);
		},
		columns: Array.from(values.keys()),
	};
}

/**
 * Builds the guidance attached to a query result that is not unique.
 *
 * @param matchedRows - Total number of rows that matched.
 * @param limit - Maximum number of rows that were returned.
 * @param tracker - Distinct-value tracker for the suggested columns, if one was created.
 */
export function buildCsvAmbiguity(
	matchedRows: number,
	limit: number,
	tracker: CsvDistinctTracker | undefined,
) {
	return {
		matchedRows,
		message:
			matchedRows > limit
				? `Matched ${matchedRows} rows and returned only the first ${limit}. This is not a unique result. Refine filters before answering.`
				: `Matched ${matchedRows} rows. This is not a unique result. Refine filters before answering.`,
		suggestedColumns: tracker?.columns ?? [],
		sampleDistinctValues: tracker?.toOutput(),
	};
}

/**
 * Picks up to five headers that are neither filtered on nor selected, ordered by how
 * likely their name is to disambiguate rows (see `preferenceScore`).
 */
export function getSuggestedDisambiguatingColumns(
	headers: string[],
	filters: CsvFilter[],
	selectedColumns: string[],
) {
	const alreadyUsed = new Set([...filters.map((filter) => filter.column), ...selectedColumns]);
	return headers
		.filter((header) => !alreadyUsed.has(header))
		.sort((left, right) => preferenceScore(left) - preferenceScore(right))
		.slice(0, 5);
}

/**
 * Column-name heuristics used to rank likely disambiguating columns. Shared by
 * getSuggestedDisambiguatingColumns and getLikelyDisambiguatingColumns.
 */
const PREFERRED_DISAMBIGUATING_COLUMNS = [
	'Year',
	'Date',
	'Month',
	'Country',
	'Country Name',
	'Source',
	'Category',
	'Name',
];

/**
 * Ranks a column name; lower is more preferred. An exact case-insensitive match scores its
 * index in the preferred list, a name that merely contains a preferred name scores that
 * index plus 0.5, and anything else scores past the end of the list.
 */
function preferenceScore(column: string) {
	const exactIndex = PREFERRED_DISAMBIGUATING_COLUMNS.findIndex(
		(candidate) => candidate.toLowerCase() === column.toLowerCase(),
	);
	if (exactIndex !== -1) return exactIndex;
	const partialIndex = PREFERRED_DISAMBIGUATING_COLUMNS.findIndex((candidate) =>
		column.toLowerCase().includes(candidate.toLowerCase()),
	);
	return partialIndex === -1 ? PREFERRED_DISAMBIGUATING_COLUMNS.length + 1 : partialIndex + 0.5;
}

type CsvColumnType = 'empty' | 'integer' | 'number' | 'boolean' | 'date' | 'string';

// Bounded streaming accumulator for csv_profile; avoids loading full CSV columns into memory.
/**
 * Creates a streaming accumulator that profiles a single CSV column.
 *
 * @param distinctLimit - Maximum number of distinct non-empty values to remember; once the
 *   cap is reached, any further unseen value sets `distinctCountTruncated`.
 */
export function createCsvColumnProfileState(distinctLimit: number) {
	const distinctValues = new Set<string>();
	const sampleValues: string[] = [];
	let distinctCountTruncated = false;
	let emptyCount = 0;
	let nonEmptyCount = 0;
	let allInteger = true;
	let allNumber = true;
	let allBoolean = true;
	let allDate = true;
	return {
		/** Folds one cell into the profile; empty strings only increase `emptyCount`. */
		add(value: string) {
			if (value === '') {
				emptyCount++;
				return;
			}
			nonEmptyCount++;
			if (distinctValues.size < distinctLimit) {
				distinctValues.add(value);
			} else if (!distinctValues.has(value)) {
				distinctCountTruncated = true;
			}
			if (!sampleValues.includes(value) && sampleValues.length < CSV_SAMPLE_VALUE_LIMIT) {
				sampleValues.push(value);
			}
			allInteger &&= /^-?\d+$/.test(value);
			// `Number(' ')` is 0, so whitespace-only cells must be excluded explicitly; this
			// matches the aggregate path, which skips them as non-numeric.
			allNumber &&= value.trim() !== '' && Number.isFinite(Number(value));
			allBoolean &&= /^(true|false|yes|no|0|1)$/i.test(value);
			allDate &&= isLikelyDate(value);
		},
		/** Returns the profile gathered so far, labelled with the column name. */
		toOutput(name: string) {
			return {
				name,
				inferredType: inferCsvColumnType({
					nonEmptyCount,
					allInteger,
					allNumber,
					allBoolean,
					allDate,
				}),
				emptyCount,
				distinctCount: distinctValues.size,
				distinctCountTruncated,
				sampleValues,
			};
		},
	};
}

export type CsvColumnProfileState = ReturnType<typeof createCsvColumnProfileState>;

/**
 * Maps the accumulated flags to a column type. Precedence is boolean, integer, number,
 * date, then string, so a column containing only `0`/`1` is reported as boolean. A column
 * with no non-empty values is `empty`.
 */
function inferCsvColumnType({
	nonEmptyCount,
	allInteger,
	allNumber,
	allBoolean,
	allDate,
}: {
	nonEmptyCount: number;
	allInteger: boolean;
	allNumber: boolean;
	allBoolean: boolean;
	allDate: boolean;
}): CsvColumnType {
	if (nonEmptyCount === 0) return 'empty';
	if (allBoolean) return 'boolean';
	if (allInteger) return 'integer';
	if (allNumber) return 'number';
	if (allDate) return 'date';
	return 'string';
}

/** Accepts `YYYY-M[-D]` or `YYYY/M[/D]` strings that `Date.parse` can also interpret. */
function isLikelyDate(value: string) {
	if (!/^\d{4}[-/]\d{1,2}([-/]\d{1,2})?$/.test(value)) return false;
	return Number.isFinite(Date.parse(value));
}

/**
 * Picks up to five profiled columns that could narrow down a result: more than one distinct
 * value, fewer distinct values than rows, and a distinct count that was not truncated.
 * Ordered by `preferenceScore` of the column name.
 */
export function getLikelyDisambiguatingColumns(
	columnProfiles: Array<{
		name: string;
		distinctCount?: number;
		distinctCountTruncated?: boolean;
	}>,
	rowCount: number,
) {
	return columnProfiles
		.filter((column) => {
			const distinctCount = column.distinctCount ?? 0;
			return distinctCount > 1 && distinctCount < rowCount && !column.distinctCountTruncated;
		})
		.sort((left, right) => preferenceScore(left.name) - preferenceScore(right.name))
		.slice(0, 5)
		.map((column) => column.name);
}

/** Creates an empty aggregate bucket for one group, with one numeric state per metric. */
export function createCsvAggregateGroup(groupValues: Record<string, string>, metrics: string[]) {
	return {
		groupValues,
		count: 0,
		metrics: Object.fromEntries(metrics.map((metric) => [metric, createNumericAggregateState()])),
	};
}

export type CsvAggregateGroup = ReturnType<typeof createCsvAggregateGroup>;

/**
 * Running count/sum/min/max for one metric. Cells that are blank after trimming or do not
 * parse to a finite number are counted in `skipped` and otherwise ignored.
 */
function createNumericAggregateState() {
	return {
		count: 0,
		sum: 0,
		min: undefined as number | undefined,
		max: undefined as number | undefined,
		skipped: 0,
		add(value: string) {
			const trimmedValue = value.trim();
			const numericValue = Number(trimmedValue);
			if (trimmedValue === '' || !Number.isFinite(numericValue)) {
				this.skipped++;
				return;
			}
			this.count++;
			this.sum += numericValue;
			this.min = this.min === undefined ? numericValue : Math.min(this.min, numericValue);
			this.max = this.max === undefined ? numericValue : Math.max(this.max, numericValue);
		},
	};
}

/**
 * Flattens an aggregate group into one result row: the group-by values, `count` when
 * requested, and `<fn>_<metric>` for every other requested function. Metric functions
 * yield `null` when the group had no numeric values for that metric.
 */
export function formatCsvAggregateGroup(
	group: CsvAggregateGroup,
	functions: Array<'count' | 'min' | 'max' | 'sum' | 'avg'>,
	metrics: string[],
) {
	const output: Record<string, string | number | null> = { ...group.groupValues };
	if (functions.includes('count')) output.count = group.count;
	for (const metric of metrics) {
		const state = group.metrics[metric];
		for (const fn of functions) {
			if (fn === 'min') output[`min_${metric}`] = state.min ?? null;
			if (fn === 'max') output[`max_${metric}`] = state.max ?? null;
			if (fn === 'sum') output[`sum_${metric}`] = state.count > 0 ? state.sum : null;
			if (fn === 'avg') output[`avg_${metric}`] = state.count > 0 ? state.sum / state.count : null;
		}
	}
	return output;
}

/** Sorts aggregate rows in place by `orderBy.column`; does nothing when `orderBy` is unset. */
export function sortCsvAggregateResults(
	results: Array<Record<string, string | number | null>>,
	orderBy: CsvAggregateInput['orderBy'],
) {
	if (!orderBy) return;
	const direction = orderBy.direction === 'desc' ? -1 : 1;
	results.sort(
		(left, right) =>
			compareCsvAggregateValues(left[orderBy.column], right[orderBy.column]) * direction,
	);
}

/**
 * Comparator for aggregate cells: nullish values compare greater than everything else,
 * two numbers compare numerically, and any other pair compares as strings.
 */
function compareCsvAggregateValues(
	left: string | number | null | undefined,
	right: string | number | null | undefined,
) {
	if (left === right) return 0;
	if (left === null || left === undefined) return 1;
	if (right === null || right === undefined) return -1;
	if (typeof left === 'number' && typeof right === 'number') return left - right;
	return String(left).localeCompare(String(right));
}
diff --git a/packages/cli/src/modules/agents/tools/knowledge/csv.operation.ts b/packages/cli/src/modules/agents/tools/knowledge/csv.operation.ts
new file mode 100644
index 00000000000..008507f4f99
--- /dev/null
+++ b/packages/cli/src/modules/agents/tools/knowledge/csv.operation.ts
@@ -0,0 +1,264 @@
import type {
	CsvAggregateInput,
	CsvDistinctInput,
	CsvProfileInput,
	CsvQueryInput,
} from './schemas';
import type { WorkspaceFiles } from './file-references';
import {
	CSV_DISTINCT_TRACK_LIMIT,
	CSV_MAX_AGGREGATE_GROUPS,
	CSV_PROFILE_DISTINCT_LIMIT,
	CSV_SAMPLE_VALUE_LIMIT,
	buildCsvAmbiguity,
	createCsvAggregateGroup,
	createCsvColumnProfileState,
	createCsvDistinctTracker,
	formatCsvAggregateGroup,
	getLikelyDisambiguatingColumns,
	getSuggestedDisambiguatingColumns,
	matchesFilters,
	normaliseCsvValue,
	resolveCsvFile,
	sortCsvAggregateResults,
	streamCsvRecords,
	toCsvRecordValues,
	validateCsvColumns,
	type CsvAggregateGroup,
	type CsvColumnProfileState,
	type CsvDistinctTracker,
} from './csv-helpers';

/**
 * Streams a CSV file and returns either the single row at `input.rowNumber` or the rows
 * matching `input.where`, projected onto `input.select` (all columns when omitted).
 *
 * At most `input.limit` rows (default 20) are returned; `rowCount` still counts every
 * match. When no `rowNumber` is given and more than one row matched, `ambiguity` carries
 * suggested columns to refine the filters with.
 *
 * @throws If the file cannot be resolved as CSV, `select` is missing without `rowNumber`,
 *   or a selected/filtered column does not exist.
 */
export async function queryCsv(workspaceRoot: string, files: WorkspaceFiles, input: CsvQueryInput) {
	const file = resolveCsvFile(files, input.file);
	const headers: string[] = [];
	const limit = input.limit ?? 20;
	const select = input.select;
	const rows: string[][] = [];
	const rowNumbers: number[] = [];
	const records: Array<{
		rowNumber: number;
		fileLineNumber: number;
		values: Record<string, string>;
	}> = [];
	let ambiguityTracker: CsvDistinctTracker | undefined;
	let matched = 0;
	if (input.rowNumber === undefined && select === undefined) {
		throw new Error('csv_query requires select unless rowNumber is provided.');
	}

	await streamCsvRecords(workspaceRoot, file, {
		onHeaders: (parsedHeaders) => {
			headers.push(...parsedHeaders);
			validateCsvColumns(headers, file.fileName, [
				...(select ?? []),
				...(input.where ?? []).map((filter) => filter.column),
			]);
			ambiguityTracker = createCsvDistinctTracker(
				getSuggestedDisambiguatingColumns(headers, input.where ?? [], select ?? []),
				CSV_SAMPLE_VALUE_LIMIT,
			);
		},
		onRecord: ({ record, fileLineNumber }) => {
			// A row-number lookup ignores `where`; otherwise every filter must match.
			if (input.rowNumber !== undefined && fileLineNumber !== input.rowNumber) return;
			if (input.rowNumber === undefined && !matchesFilters(record, input.where ?? [])) return;

			matched++;
			ambiguityTracker?.add(record);
			const columns = select ?? headers;
			if (rows.length < limit) {
				const values = toCsvRecordValues(record, columns);
				rows.push(columns.map((column) => values[column]));
				rowNumbers.push(fileLineNumber);
				records.push({ rowNumber: fileLineNumber, fileLineNumber, values });
			}
		},
	});
	// No header row was parsed (e.g. an empty file): still reject selected columns.
	if (headers.length === 0) validateCsvColumns(headers, file.fileName, select ?? []);

	const columns = select ?? headers;
	const truncated = matched > rows.length;

	return {
		fileName: file.fileName,
		relativePath: file.relativePath,
		columns,
		rowNumbers,
		rows,
		records,
		rowCount: matched,
		truncated,
		rowNumberBase: 'rowNumber is the CSV file line number; line 1 is the header row.',
		ambiguity:
			input.rowNumber === undefined && (matched > 1 || truncated)
				? buildCsvAmbiguity(matched, limit, ambiguityTracker)
				: undefined,
	};
}

/**
 * Streams a CSV file once and returns its columns, row count, the first
 * `input.sampleSize` rows (default 5), a profile per column, and the columns that look
 * like keys or useful disambiguators.
 *
 * @throws If the file cannot be resolved as CSV.
 */
export async function profileCsv(
	workspaceRoot: string,
	files: WorkspaceFiles,
	input: CsvProfileInput,
) {
	const file = resolveCsvFile(files, input.file);
	const headers: string[] = [];
	const sampleRows: Array<Record<string, string>> = [];
	const columnStates = new Map<string, CsvColumnProfileState>();
	let rowCount = 0;
	const distinctLimit = input.distinctLimit ?? CSV_PROFILE_DISTINCT_LIMIT;
	const sampleSize = input.sampleSize ?? 5;

	await streamCsvRecords(workspaceRoot, file, {
		onHeaders: (parsedHeaders) => {
			headers.push(...parsedHeaders);
			for (const header of headers) {
				columnStates.set(header, createCsvColumnProfileState(distinctLimit));
			}
		},
		onRecord: ({ record }) => {
			rowCount++;
			if (sampleRows.length < sampleSize) {
				sampleRows.push(toCsvRecordValues(record, headers));
			}
			for (const header of headers) {
				columnStates.get(header)?.add(normaliseCsvValue(record[header]));
			}
		},
	});

	const columnProfiles = headers.map((header) => {
		const profile = columnStates.get(header) ?? createCsvColumnProfileState(distinctLimit);
		return profile.toOutput(header);
	});

	return {
		fileName: file.fileName,
		relativePath: file.relativePath,
		columns: headers,
		rowCount,
		sampleRows,
		columnProfiles,
		// A column whose distinct count equals the row count has a unique value per row.
		likelyKeyColumns: columnProfiles
			.filter((column) => column.distinctCount === rowCount && rowCount > 0)
			.map((column) => column.name),
		likelyDisambiguatingColumns: getLikelyDisambiguatingColumns(columnProfiles, rowCount),
	};
}

/**
 * Streams a CSV file and returns the distinct values of `input.column` among rows matching
 * `input.where`, in first-seen order.
 *
 * At most `input.limit` values (default 50) are returned and at most
 * `CSV_DISTINCT_TRACK_LIMIT` are tracked; `truncated` is set when either cap hid values.
 *
 * @throws If the file cannot be resolved as CSV or a referenced column does not exist.
 */
export async function distinctCsv(
	workspaceRoot: string,
	files: WorkspaceFiles,
	input: CsvDistinctInput,
) {
	const file = resolveCsvFile(files, input.file);
	const values = new Set<string>();
	let distinctTruncated = false;
	const outputValues: string[] = [];
	const limit = input.limit ?? 50;

	await streamCsvRecords(workspaceRoot, file, {
		onHeaders: (headers) => {
			validateCsvColumns(headers, file.fileName, [
				input.column,
				...(input.where ?? []).map((filter) => filter.column),
			]);
		},
		onRecord: ({ record }) => {
			if (!matchesFilters(record, input.where ?? [])) return;
			const value = normaliseCsvValue(record[input.column]);
			if (values.has(value)) return;
			if (values.size >= CSV_DISTINCT_TRACK_LIMIT) {
				// Past the tracking cap we can no longer tell new values from repeats, so
				// stop emitting instead of risking duplicates in the output.
				distinctTruncated = true;
				return;
			}
			values.add(value);
			if (outputValues.length < limit) outputValues.push(value);
		},
	});

	return {
		fileName: file.fileName,
		relativePath: file.relativePath,
		column: input.column,
		values: outputValues,
		distinctCount: values.size,
		truncated: distinctTruncated || values.size > outputValues.length,
	};
}

/**
 * Streams a CSV file and computes `input.functions` (default `['count']`) over rows
 * matching `input.where`, optionally grouped by `input.groupBy`.
 *
 * Results are sorted by `input.orderBy` and cut to `input.limit` rows (default 50).
 * `skippedNonNumeric` reports, per metric, how many cells were ignored as non-numeric.
 *
 * @throws If the file cannot be resolved as CSV, a metric function is requested without a
 *   metric, or a referenced column does not exist.
 */
export async function aggregateCsv(
	workspaceRoot: string,
	files: WorkspaceFiles,
	input: CsvAggregateInput,
) {
	const file = resolveCsvFile(files, input.file);
	const functions = input.functions ?? ['count'];
	const metrics = Array.from(
		new Set([...(input.metric ? [input.metric] : []), ...(input.metrics ?? [])]),
	);
	const needsMetric = functions.some((fn) => fn !== 'count');
	if (needsMetric && metrics.length === 0) {
		throw new Error('csv_aggregate requires metric or metrics for min, max, sum, or avg.');
	}
	const groups = new Map<string, CsvAggregateGroup>();
	let rowCount = 0;
	let groupLimitReached = false;

	await streamCsvRecords(workspaceRoot, file, {
		onHeaders: (headers) => {
			validateCsvColumns(headers, file.fileName, [
				...metrics,
				...(input.groupBy ?? []),
				...(input.where ?? []).map((filter) => filter.column),
			]);
		},
		onRecord: ({ record }) => {
			if (!matchesFilters(record, input.where ?? [])) return;
			rowCount++;
			const groupValues = toCsvRecordValues(record, input.groupBy ?? []);
			const key = JSON.stringify(groupValues);
			let group = groups.get(key);
			if (!group) {
				// Bound memory: stop opening new groups past the cap, but keep
				// aggregating rows for groups we already track.
				if (groups.size >= CSV_MAX_AGGREGATE_GROUPS) {
					groupLimitReached = true;
					return;
				}
				group = createCsvAggregateGroup(groupValues, metrics);
				groups.set(key, group);
			}
			group.count++;
			for (const metric of metrics) {
				group.metrics[metric].add(normaliseCsvValue(record[metric]));
			}
		},
	});
	// An ungrouped aggregate over zero matching rows still reports one (empty) result row.
	if (groups.size === 0 && input.groupBy === undefined) {
		groups.set(JSON.stringify({}), createCsvAggregateGroup({}, metrics));
	}

	const results = Array.from(groups.values()).map((group) =>
		formatCsvAggregateGroup(group, functions, metrics),
	);
	sortCsvAggregateResults(results, input.orderBy);
	const limit = input.limit ?? 50;
	const skippedNonNumeric: Record<string, number> = {};
	for (const group of groups.values()) {
		for (const metric of metrics) {
			skippedNonNumeric[metric] = (skippedNonNumeric[metric] ?? 0) + group.metrics[metric].skipped;
		}
	}

	return {
		fileName: file.fileName,
		relativePath: file.relativePath,
		rowCount,
		functions,
		metrics,
		groupBy: input.groupBy,
		results: results.slice(0, limit),
		truncated: results.length > limit || groupLimitReached,
		skippedNonNumeric,
	};
}
diff --git a/packages/cli/src/modules/agents/tools/knowledge/file-references.ts b/packages/cli/src/modules/agents/tools/knowledge/file-references.ts
new file mode 100644
index 00000000000..0e6c701b6db
--- /dev/null
+++ b/packages/cli/src/modules/agents/tools/knowledge/file-references.ts
@@ -0,0 +1,48 @@
import type { AgentKnowledgeService } from '../../agent-knowledge.service';

import type { ParsedSearchKnowledgeInput } from './schemas';

// NOTE(review): presumably the resolved return type of
// AgentKnowledgeService.materializeWorkspace — confirm against the service.
export type WorkspaceFiles = Awaited<ReturnType<AgentKnowledgeService['materializeWorkspace']>>;

export type FileReferenceResolution =
	| { status: 'found'; file: WorkspaceFiles[number] }
	| { status: 'missing'; error: string }
	| { status: 'ambiguous'; error: string };

export function resolveFileReference(
	files: WorkspaceFiles,
	reference: string,
): FileReferenceResolution {
	const matches = files.filter(
		(file) =>
			file.id === reference ||
file.relativePath === reference || file.fileName === reference, + ); + if (matches.length === 1) return { status: 'found', file: matches[0] }; + if (matches.length === 0) return { status: 'missing', error: `File "${reference}" not found` }; + + return { + status: 'ambiguous', + error: `File "${reference}" matches multiple uploaded files. Use the file id or relative path instead.`, + }; +} + +export function getRequiredFileReferences(input: ParsedSearchKnowledgeInput) { + if (input.operation === 'search') return input.files; + if ( + input.operation === 'read' || + input.operation === 'csv_query' || + input.operation === 'csv_profile' || + input.operation === 'csv_distinct' || + input.operation === 'csv_aggregate' + ) { + return [input.file]; + } + return undefined; +} + +export function mapFileReferences(files: WorkspaceFiles, requestedFiles?: string[]) { + return requestedFiles?.map((file) => { + const resolvedFile = resolveFileReference(files, file); + return resolvedFile.status === 'found' ? 
resolvedFile.file.relativePath : file; + }); +} diff --git a/packages/cli/src/modules/agents/tools/knowledge/read.operation.ts b/packages/cli/src/modules/agents/tools/knowledge/read.operation.ts new file mode 100644 index 00000000000..3ae7bb176e2 --- /dev/null +++ b/packages/cli/src/modules/agents/tools/knowledge/read.operation.ts @@ -0,0 +1,50 @@ +import type { AgentKnowledgeCommandService } from '../../agent-knowledge-command.service'; + +import { resolveFileReference, type WorkspaceFiles } from './file-references'; +import { runInternalCommand } from './search.operation'; +import type { + InternalKnowledgeCommandRequest, + ParsedSearchKnowledgeInput, + SearchKnowledgeOutput, +} from './schemas'; + +type ReadInput = Extract; + +export async function runReadOperation( + input: ReadInput, + workspaceRoot: string, + files: WorkspaceFiles, + commandService: AgentKnowledgeCommandService, +): Promise { + const resolvedFile = resolveFileReference(files, input.file); + if (resolvedFile.status !== 'found') { + return { + operation: 'read', + files, + error: resolvedFile.error, + }; + } + const file = resolvedFile.file; + const request: InternalKnowledgeCommandRequest = input.lineRange + ? { + command: 'sed', + file: file.relativePath, + startLine: input.lineRange.start, + endLine: input.lineRange.end, + } + : { command: 'cat', file: file.relativePath }; + const result = await runInternalCommand(commandService, workspaceRoot, request); + return { + operation: 'read', + files, + result: { + ...result, + citation: { + fileName: file.fileName, + lineRange: input.lineRange, + instruction: + 'Cite this source using only fileName and lineRange. 
Do not cite file ids, relative paths, binary ids, or storage ids.', + }, + }, + }; +} diff --git a/packages/cli/src/modules/agents/tools/knowledge/schemas.ts b/packages/cli/src/modules/agents/tools/knowledge/schemas.ts new file mode 100644 index 00000000000..ac95d6a2b2b --- /dev/null +++ b/packages/cli/src/modules/agents/tools/knowledge/schemas.ts @@ -0,0 +1,501 @@ +import type { JSONSchema7 } from 'json-schema'; +import { z } from 'zod'; + +import type { + AgentKnowledgeCommandRequest, + AgentKnowledgeCommandResult, +} from '../../agent-knowledge-command.service'; + +export const DEFAULT_SEARCH_HEAD_LIMIT = 250; + +export const KNOWLEDGE_OPERATIONS = [ + 'list', + 'search', + 'read', + 'csv_query', + 'csv_profile', + 'csv_distinct', + 'csv_aggregate', +] as const; + +const lineRangeSchema = z.object({ + start: z.number().int().min(1), + end: z.number().int().min(1), +}); + +const searchOutputModeSchema = z.enum(['files_with_matches', 'content', 'count']); +const searchMatchModeSchema = z.enum(['any', 'all_on_same_line', 'all_within_lines']); +const csvAggregateFunctionSchema = z.enum(['count', 'min', 'max', 'sum', 'avg']); + +export const csvFilterSchema = z.discriminatedUnion('op', [ + z.object({ + column: z.string().min(1), + op: z.literal('eq'), + value: z.string(), + }), + z.object({ + column: z.string().min(1), + op: z.literal('in'), + value: z.array(z.string()).min(1).max(50), + }), + z.object({ + column: z.string().min(1), + op: z.literal('contains'), + value: z.string(), + }), +]); + +const listInputSchema = z.object({ operation: z.literal('list') }).strict(); +const searchInputSchema = z + .object({ + operation: z.literal('search'), + query: z.string().min(1).optional(), + queries: z.array(z.string().min(1)).min(1).max(5).optional(), + match_mode: searchMatchModeSchema.default('any'), + output_mode: searchOutputModeSchema.default('files_with_matches'), + caseInsensitive: z.boolean().optional(), + fixedStrings: z.boolean().optional(), + context: 
z.number().int().min(0).max(5).optional(), + file: z.string().min(1).optional(), + files: z.array(z.string()).max(10).optional(), + offset: z.number().int().min(0).default(0), + head_limit: z.number().int().min(0).default(DEFAULT_SEARCH_HEAD_LIMIT), + }) + .strict(); +const readInputSchema = z + .object({ + operation: z.literal('read'), + file: z.string().min(1), + lineRange: lineRangeSchema.optional(), + }) + .strict(); +export const csvQueryInputSchema = z + .object({ + operation: z.literal('csv_query'), + file: z.string().min(1), + select: z.array(z.string().min(1)).min(1).max(50).optional(), + where: z.array(csvFilterSchema).max(10).optional(), + rowNumber: z.number().int().min(2).optional(), + limit: z.number().int().min(1).max(100).default(20), + }) + .strict(); +const csvProfileInputSchema = z + .object({ + operation: z.literal('csv_profile'), + file: z.string().min(1), + sampleSize: z.number().int().min(1).max(20).default(5), + distinctLimit: z.number().int().min(10).max(500).default(100), + }) + .strict(); +const csvDistinctInputSchema = z + .object({ + operation: z.literal('csv_distinct'), + file: z.string().min(1), + column: z.string().min(1), + where: z.array(csvFilterSchema).max(10).optional(), + limit: z.number().int().min(1).max(200).default(50), + }) + .strict(); +const csvAggregateInputSchema = z + .object({ + operation: z.literal('csv_aggregate'), + file: z.string().min(1), + metric: z.string().min(1).optional(), + metrics: z.array(z.string().min(1)).min(1).max(10).optional(), + functions: z.array(csvAggregateFunctionSchema).min(1).max(5).default(['count']), + where: z.array(csvFilterSchema).max(10).optional(), + groupBy: z.array(z.string().min(1)).min(1).max(5).optional(), + orderBy: z + .object({ + column: z.string().min(1), + direction: z.enum(['asc', 'desc']).default('asc'), + }) + .strict() + .optional(), + limit: z.number().int().min(1).max(200).default(50), + }) + .strict(); + +export const searchKnowledgeParsingSchema = 
z.discriminatedUnion('operation', [ + listInputSchema, + searchInputSchema, + readInputSchema, + csvQueryInputSchema, + csvProfileInputSchema, + csvDistinctInputSchema, + csvAggregateInputSchema, +]); + +export const searchKnowledgeInputSchema: JSONSchema7 = { + type: 'object', + description: + 'Use exactly one operation shape. Do not include fields from other operations. ' + + 'Use csv_profile for unfamiliar CSVs, csv_query for rows, csv_distinct for values, and csv_aggregate for computed CSV answers.', + additionalProperties: false, + required: ['operation'], + properties: { + operation: { + type: 'string', + description: + 'Operation to perform. Allowed values: list, search, read, csv_query, csv_profile, csv_distinct, csv_aggregate.', + }, + query: { + type: 'string', + minLength: 1, + description: + 'For operation=search only: search pattern. For conceptual multi-term lookup, prefer queries with match_mode instead of writing regex by hand.', + }, + queries: { + type: 'array', + minItems: 1, + maxItems: 5, + items: { type: 'string', minLength: 1 }, + description: + 'For operation=search only: multiple literal search terms for conceptual lookup without hand-written regex.', + }, + match_mode: { + type: 'string', + default: 'any', + description: + 'For operation=search with queries only: any, all_on_same_line, or all_within_lines. Use all_within_lines to find concepts near each other without regex.', + }, + output_mode: { + type: 'string', + description: + 'For operation=search only: content shows matching lines, files_with_matches shows only matching files (default), count shows match counts. Use content only after narrowing to a file or exact phrase.', + default: 'files_with_matches', + }, + caseInsensitive: { + type: 'boolean', + description: 'For operation=search only: run case-insensitive search.', + }, + fixedStrings: { + type: 'boolean', + description: + 'For operation=search only: treat query as a fixed string instead of a regex. 
Defaults to true.', + }, + context: { + type: 'integer', + minimum: 0, + maximum: 5, + description: + 'For operation=search only: number of surrounding context lines. Requires output_mode=content.', + }, + files: { + type: 'array', + maxItems: 10, + items: { type: 'string' }, + description: + 'For operation=search only: optional file ids, relative paths, or exact file names to search. These are tool handles only; do not cite them to users.', + }, + offset: { + type: 'integer', + minimum: 0, + default: 0, + description: 'For operation=search only: number of files, counts, or matches to skip.', + }, + head_limit: { + type: 'integer', + minimum: 0, + default: DEFAULT_SEARCH_HEAD_LIMIT, + description: + 'For operation=search only: limit output to first N files/counts/lines. Defaults to 250. Pass 0 for unlimited.', + }, + file: { + type: 'string', + minLength: 1, + description: + 'For operation=read or CSV operations: file id, relative path, or exact file name. For operation=search: alias for a single files entry. This is a tool handle only; cite the returned fileName and lineRange instead.', + }, + lineRange: { + type: 'object', + additionalProperties: false, + description: 'For operation=read only: optional line range to read.', + properties: { + start: { type: 'integer', minimum: 1 }, + end: { type: 'integer', minimum: 1 }, + }, + }, + where: { + type: 'array', + maxItems: 10, + description: + 'For CSV operations only: row filters ANDed together. Each filter has column, op, and value. Allowed op values: eq, in, contains. For op=in, value must be an array of strings.', + items: { + type: 'object', + additionalProperties: true, + required: ['column', 'op', 'value'], + properties: { + column: { type: 'string', minLength: 1 }, + op: { + type: 'string', + description: 'Allowed values: eq, in, contains.', + }, + value: { + description: + 'String value for eq/contains, or array of strings for in. 
Local validation enforces the exact shape.', + }, + }, + }, + }, + select: { + type: 'array', + minItems: 1, + maxItems: 50, + items: { type: 'string', minLength: 1 }, + description: + 'For operation=csv_query only: columns to return. Omit with rowNumber to return all columns for that row.', + }, + rowNumber: { + type: 'integer', + minimum: 2, + description: + 'For operation=csv_query only: exact CSV file line number to fetch. Header is line 1, so data rows usually start at line 2.', + }, + column: { + type: 'string', + minLength: 1, + description: 'For operation=csv_distinct only: column whose values should be returned.', + }, + metric: { + type: 'string', + minLength: 1, + description: + 'For operation=csv_aggregate only: numeric metric column for min, max, sum, or avg.', + }, + metrics: { + type: 'array', + minItems: 1, + maxItems: 10, + items: { type: 'string', minLength: 1 }, + description: + 'For operation=csv_aggregate only: numeric metric columns for min, max, sum, or avg.', + }, + functions: { + type: 'array', + minItems: 1, + maxItems: 5, + items: { type: 'string', enum: ['count', 'min', 'max', 'sum', 'avg'] }, + default: ['count'], + description: + 'For operation=csv_aggregate only: aggregate functions to compute. 
count does not require a metric.', + }, + groupBy: { + type: 'array', + minItems: 1, + maxItems: 5, + items: { type: 'string', minLength: 1 }, + description: 'For operation=csv_aggregate only: columns to group aggregate results by.', + }, + orderBy: { + type: 'object', + additionalProperties: false, + description: + 'For operation=csv_aggregate only: sort grouped output by a group column or aggregate output column.', + properties: { + column: { type: 'string', minLength: 1 }, + direction: { type: 'string', enum: ['asc', 'desc'], default: 'asc' }, + }, + }, + sampleSize: { + type: 'integer', + minimum: 1, + maximum: 20, + default: 5, + description: 'For operation=csv_profile only: number of sample rows to return.', + }, + distinctLimit: { + type: 'integer', + minimum: 10, + maximum: 500, + default: 100, + description: + 'For operation=csv_profile only: maximum distinct values tracked per column before marking that column as truncated.', + }, + limit: { + type: 'integer', + minimum: 1, + maximum: 200, + default: 20, + description: + 'For CSV operations only: maximum rows, groups, or distinct values to return. 
Defaults to 20 for csv_query, 50 for csv_distinct, and 50 for csv_aggregate.', + }, + }, +}; + +const knowledgeFileOutputSchema = z.object({ + id: z.string(), + fileName: z.string(), + mimeType: z.string(), + fileSizeBytes: z.number(), + relativePath: z.string(), +}); + +const commandResultOutputSchema = z.object({ + command: z.enum(['git_grep', 'cat', 'sed']), + exitCode: z.number().nullable(), + stdout: z.string(), + stderr: z.string(), + truncated: z.boolean(), + citation: z + .object({ + fileName: z.string(), + lineRange: lineRangeSchema.optional(), + instruction: z.string(), + }) + .optional(), +}); + +const searchMatchOutputSchema = z.object({ + fileId: z.string(), + fileName: z.string(), + relativePath: z.string(), + lineNumber: z.number(), + text: z.string(), + readRange: lineRangeSchema, + truncated: z.boolean().optional(), +}); + +const searchFileOutputSchema = z.object({ + id: z.string(), + fileName: z.string(), + relativePath: z.string(), + matchCount: z.number(), +}); + +const searchResultOutputSchema = z.object({ + mode: searchOutputModeSchema, + query: z.string(), + queries: z.array(z.string()).optional(), + matchMode: searchMatchModeSchema.optional(), + totalMatchingFiles: z.number(), + totalMatchingLines: z.number(), + files: z.array(searchFileOutputSchema), + matches: z.array(searchMatchOutputSchema), + truncated: z.boolean(), + appliedLimit: z.number().optional(), + appliedOffset: z.number().optional(), + nextOffset: z.number().optional(), + hint: z.string().optional(), +}); + +const csvQueryResultOutputSchema = z.object({ + fileName: z.string(), + relativePath: z.string(), + columns: z.array(z.string()), + rowNumbers: z.array(z.number()), + rows: z.array(z.array(z.string())), + records: z + .array( + z.object({ + rowNumber: z.number(), + fileLineNumber: z.number(), + values: z.record(z.string(), z.string()), + }), + ) + .optional(), + rowCount: z.number(), + truncated: z.boolean(), + rowNumberBase: z.string().optional(), + ambiguity: z + 
.object({ + matchedRows: z.number(), + message: z.string(), + suggestedColumns: z.array(z.string()), + sampleDistinctValues: z.record(z.string(), z.array(z.string())).optional(), + }) + .optional(), +}); +const csvColumnProfileOutputSchema = z.object({ + name: z.string(), + inferredType: z.enum(['empty', 'integer', 'number', 'boolean', 'date', 'string']), + emptyCount: z.number(), + distinctCount: z.number().optional(), + distinctCountTruncated: z.boolean().optional(), + sampleValues: z.array(z.string()).optional(), +}); +const csvProfileOutputSchema = z.object({ + fileName: z.string(), + relativePath: z.string(), + columns: z.array(z.string()), + rowCount: z.number(), + sampleRows: z.array(z.record(z.string(), z.string())), + columnProfiles: z.array(csvColumnProfileOutputSchema), + likelyKeyColumns: z.array(z.string()), + likelyDisambiguatingColumns: z.array(z.string()), +}); +const csvDistinctOutputSchema = z.object({ + fileName: z.string(), + relativePath: z.string(), + column: z.string(), + values: z.array(z.string()), + distinctCount: z.number(), + truncated: z.boolean(), +}); +const csvAggregateOutputSchema = z.object({ + fileName: z.string(), + relativePath: z.string(), + rowCount: z.number(), + functions: z.array(csvAggregateFunctionSchema), + metrics: z.array(z.string()), + groupBy: z.array(z.string()).optional(), + results: z.array(z.record(z.string(), z.union([z.string(), z.number(), z.null()]))), + truncated: z.boolean(), + skippedNonNumeric: z.record(z.string(), z.number()).optional(), +}); + +export const searchKnowledgeOutputSchema = z.object({ + operation: z.enum(KNOWLEDGE_OPERATIONS), + files: z.array(knowledgeFileOutputSchema), + result: commandResultOutputSchema.optional(), + search: searchResultOutputSchema.optional(), + csv: csvQueryResultOutputSchema.optional(), + csvProfile: csvProfileOutputSchema.optional(), + csvDistinct: csvDistinctOutputSchema.optional(), + csvAggregate: csvAggregateOutputSchema.optional(), + error: z.string().optional(), 
+}); + +export type ParsedSearchKnowledgeInput = z.infer; +export type SearchKnowledgeOutput = z.infer; +export type CsvQueryInput = z.infer; +export type CsvProfileInput = z.infer; +export type CsvDistinctInput = z.infer; +export type CsvAggregateInput = z.infer; +export type CsvFilter = z.infer; +export type SearchOutputMode = z.infer; +export type SearchMatchMode = z.infer; +export type SearchMatchOutput = z.infer; +export type SearchResultOutput = z.infer; +export type InternalKnowledgeCommandRequest = Extract< + AgentKnowledgeCommandRequest, + { command: 'git_grep' | 'cat' | 'sed' } +>; +export type InternalKnowledgeCommandResult = Omit & { + command: InternalKnowledgeCommandRequest['command']; +}; + +export function parseSearchKnowledgeInput(input: unknown): ParsedSearchKnowledgeInput { + const parsed = searchKnowledgeParsingSchema.parse(input); + if (parsed.operation !== 'search' || parsed.file === undefined) return parsed; + + const { file, ...searchInput } = parsed; + const files = Array.from(new Set([file, ...(parsed.files ?? [])])); + if (files.length > 10) { + throw new Error('Search can target at most 10 files.'); + } + + return { + ...searchInput, + files, + }; +} + +export function getSearchKnowledgeOperation(input: unknown): SearchKnowledgeOutput['operation'] { + const parsed = z + .object({ + operation: z.enum(KNOWLEDGE_OPERATIONS), + }) + .safeParse(input); + return parsed.success ? 
parsed.data.operation : 'list'; +} diff --git a/packages/cli/src/modules/agents/tools/knowledge/search.operation.ts b/packages/cli/src/modules/agents/tools/knowledge/search.operation.ts new file mode 100644 index 00000000000..3b24acc0326 --- /dev/null +++ b/packages/cli/src/modules/agents/tools/knowledge/search.operation.ts @@ -0,0 +1,456 @@ +import type { AgentKnowledgeCommandService } from '../../agent-knowledge-command.service'; +import type { + InternalKnowledgeCommandRequest, + InternalKnowledgeCommandResult, + ParsedSearchKnowledgeInput, + SearchKnowledgeOutput, + SearchMatchMode, + SearchMatchOutput, + SearchOutputMode, + SearchResultOutput, +} from './schemas'; +import { mapFileReferences, type WorkspaceFiles } from './file-references'; + +type SearchInput = Extract; +type InternalSearchMatch = SearchMatchOutput & { fullText: string }; + +const DEFAULT_READ_RANGE_CONTEXT = 6; +const MAX_SEARCH_MATCH_TEXT_LENGTH = 500; +const MULTI_QUERY_WINDOW_LINES = 3; + +export async function runInternalCommand( + commandService: AgentKnowledgeCommandService, + workspaceRoot: string, + request: InternalKnowledgeCommandRequest, +): Promise { + const result = await commandService.run(workspaceRoot, request); + return { ...result, command: request.command }; +} + +export async function runSearchOperation( + input: SearchInput, + workspaceRoot: string, + files: WorkspaceFiles, + commandService: AgentKnowledgeCommandService, +): Promise { + if (input.query === undefined && input.queries === undefined) { + return { + operation: 'search', + files, + error: 'Either query or queries must be provided for search.', + }; + } + const requestedFiles = mapFileReferences(files, input.files); + const primaryPattern = getPrimarySearchPattern(input); + const commandPattern = getSearchCommandPattern(input); + const commandFixedStrings = getSearchCommandFixedStrings(input); + let contentResult: InternalKnowledgeCommandResult | undefined; + const countResult = await 
runInternalCommand(commandService, workspaceRoot, { + command: 'git_grep', + pattern: commandPattern, + outputMode: 'count', + caseInsensitive: input.caseInsensitive, + fixedStrings: commandFixedStrings, + files: requestedFiles, + }); + let counts = parseCountOutput(countResult.stdout, files); + let multiQueryMatches: InternalSearchMatch[] | undefined; + if (input.queries) { + contentResult = await runInternalCommand(commandService, workspaceRoot, { + command: 'git_grep', + pattern: commandPattern, + caseInsensitive: input.caseInsensitive, + fixedStrings: commandFixedStrings, + context: input.context, + files: requestedFiles, + }); + multiQueryMatches = filterMultiQueryMatches( + parseSearchMatches(contentResult.stdout, files), + input.queries, + input.match_mode, + input.caseInsensitive, + ); + counts = buildCountsFromMatches(multiQueryMatches, files); + } + + if (input.output_mode === 'files_with_matches') { + const slicedCounts = sliceResults(counts, input.offset, input.head_limit); + return { + operation: 'search', + files, + result: toDisplayResult( + countResult, + formatSearchFiles(counts, input.offset, input.head_limit), + slicedCounts.truncated, + ), + search: buildSearchResult({ + mode: input.output_mode, + query: primaryPattern, + queries: input.queries, + matchMode: input.queries ? input.match_mode : undefined, + counts, + matches: [], + offset: input.offset, + headLimit: input.head_limit, + hint: buildSearchHint('files_with_matches', slicedCounts, input.head_limit), + }), + }; + } + + if (input.output_mode === 'count') { + const slicedCounts = sliceResults(counts, input.offset, input.head_limit); + return { + operation: 'search', + files, + result: toDisplayResult( + countResult, + formatSearchCounts(counts, input.offset, input.head_limit), + slicedCounts.truncated, + ), + search: buildSearchResult({ + mode: input.output_mode, + query: primaryPattern, + queries: input.queries, + matchMode: input.queries ? 
input.match_mode : undefined, + counts, + matches: [], + offset: input.offset, + headLimit: input.head_limit, + hint: buildSearchHint('count', slicedCounts, input.head_limit), + }), + }; + } + + contentResult ??= await runInternalCommand(commandService, workspaceRoot, { + command: 'git_grep', + pattern: commandPattern, + caseInsensitive: input.caseInsensitive, + fixedStrings: commandFixedStrings, + context: input.context, + files: requestedFiles, + }); + const parsedMatches = parseSearchMatches(contentResult.stdout, files); + const matches = multiQueryMatches ?? parsedMatches; + const slicedMatches = sliceResults(matches, input.offset, input.head_limit); + const displayMatches = slicedMatches.items.map(toSearchMatchOutput); + const search = buildSearchResult({ + mode: input.output_mode, + query: primaryPattern, + queries: input.queries, + matchMode: input.queries ? input.match_mode : undefined, + counts, + matches: displayMatches, + offset: input.offset, + headLimit: input.head_limit, + nextOffset: slicedMatches.nextOffset, + hint: buildSearchHint('content', slicedMatches, input.head_limit), + }); + return { + operation: 'search', + files, + result: toDisplayResult( + contentResult, + formatSearchMatches(displayMatches, slicedMatches, input.head_limit), + search.truncated || contentResult.truncated, + ), + search, + }; +} + +function toDisplayResult( + result: InternalKnowledgeCommandResult, + stdout: string, + truncated = false, +): InternalKnowledgeCommandResult { + return { + ...result, + stdout, + truncated: result.truncated || truncated, + }; +} + +function parseCountOutput(stdout: string, files: WorkspaceFiles) { + const byRelativePath = new Map(files.map((file) => [file.relativePath, file])); + const counts = stdout + .split('\n') + .flatMap((line) => { + if (line.trim() === '') return []; + const separatorIndex = line.lastIndexOf(':'); + if (separatorIndex === -1) return []; + const relativePath = normaliseGrepPath(line.slice(0, separatorIndex)); + const 
matchCount = Number(line.slice(separatorIndex + 1)); + const file = byRelativePath.get(relativePath); + if (!file || !Number.isFinite(matchCount) || matchCount <= 0) return []; + return [ + { + id: file.id, + fileName: file.fileName, + relativePath: file.relativePath, + matchCount, + }, + ]; + }) + .sort((left, right) => right.matchCount - left.matchCount); + return counts; +} + +function parseSearchMatches(stdout: string, files: WorkspaceFiles): InternalSearchMatch[] { + const byRelativePath = new Map(files.map((file) => [file.relativePath, file])); + return stdout.split('\n').flatMap((line) => { + const parsed = parseGrepLine(line); + if (!parsed?.isMatch) return []; + const file = byRelativePath.get(normaliseGrepPath(parsed.filePath)); + if (!file || parsed.lineNumber === undefined) return []; + const fullText = line.slice(parsed.contentStartIndex); + const { text, truncated } = truncateMatchText(fullText); + return [ + { + fileId: file.id, + fileName: file.fileName, + relativePath: file.relativePath, + lineNumber: parsed.lineNumber, + fullText, + text, + readRange: toReadRange(parsed.lineNumber), + truncated, + }, + ]; + }); +} + +function toSearchMatchOutput({ + fullText: _fullText, + ...match +}: InternalSearchMatch): SearchMatchOutput { + return match; +} + +function truncateMatchText(text: string) { + if (text.length <= MAX_SEARCH_MATCH_TEXT_LENGTH) return { text }; + return { + text: `${text.slice(0, MAX_SEARCH_MATCH_TEXT_LENGTH)}... 
[line truncated; use read for full text]`, + truncated: true, + }; +} + +function filterMultiQueryMatches( + matches: InternalSearchMatch[], + queries: string[], + matchMode: SearchMatchMode, + caseInsensitive?: boolean, +) { + const normalizedQueries = queries.map((query) => normalizeSearchText(query, caseInsensitive)); + if (matchMode === 'any') { + return matches.filter((match) => + normalizedQueries.some((query) => + normalizeSearchText(match.fullText, caseInsensitive).includes(query), + ), + ); + } + if (matchMode === 'all_on_same_line') { + return matches.filter((match) => { + const text = normalizeSearchText(match.fullText, caseInsensitive); + return normalizedQueries.every((query) => text.includes(query)); + }); + } + return matches.filter((match) => + hasAllQueriesInNearbyWindow( + matches, + match.relativePath, + match.lineNumber, + normalizedQueries, + caseInsensitive, + ), + ); +} + +function buildCountsFromMatches(matches: SearchMatchOutput[], files: WorkspaceFiles) { + const countByRelativePath = new Map(); + for (const match of matches) { + countByRelativePath.set( + match.relativePath, + (countByRelativePath.get(match.relativePath) ?? 0) + 1, + ); + } + return files + .flatMap((file) => { + const matchCount = countByRelativePath.get(file.relativePath) ?? 
0; + if (matchCount === 0) return []; + return [ + { + id: file.id, + fileName: file.fileName, + relativePath: file.relativePath, + matchCount, + }, + ]; + }) + .sort((left, right) => right.matchCount - left.matchCount); +} + +function hasAllQueriesInNearbyWindow( + matches: InternalSearchMatch[], + relativePath: string, + lineNumber: number, + queries: string[], + caseInsensitive?: boolean, +) { + const sameFileMatches = matches.filter((match) => match.relativePath === relativePath); + return sameFileMatches.some((windowStart) => { + const start = windowStart.lineNumber; + const end = start + MULTI_QUERY_WINDOW_LINES - 1; + if (lineNumber < start || lineNumber > end) return false; + const windowText = sameFileMatches + .filter((match) => match.lineNumber >= start && match.lineNumber <= end) + .map((match) => normalizeSearchText(match.fullText, caseInsensitive)) + .join('\n'); + return queries.every((query) => windowText.includes(query)); + }); +} + +function normalizeSearchText(text: string, caseInsensitive?: boolean) { + return caseInsensitive ? text.toLowerCase() : text; +} + +function toReadRange(lineNumber: number) { + return { + start: Math.max(1, lineNumber - DEFAULT_READ_RANGE_CONTEXT), + end: lineNumber + DEFAULT_READ_RANGE_CONTEXT, + }; +} + +function getPrimarySearchPattern(input: SearchInput) { + return input.query ?? input.queries?.[0] ?? ''; +} + +function getSearchCommandPattern(input: SearchInput) { + if (!input.queries) return input.query ?? ''; + return input.queries.map(escapeExtendedRegex).join('|'); +} + +function getSearchCommandFixedStrings(input: SearchInput) { + return input.queries ? false : (input.fixedStrings ?? 
true); +} + +function escapeExtendedRegex(pattern: string) { + return pattern.replace(/[\\^$.*+?()[\]{}|]/g, '\\$&'); +} + +function buildSearchResult({ + mode, + query, + queries, + matchMode, + counts, + matches, + offset, + headLimit, + nextOffset, + hint, +}: { + mode: SearchOutputMode; + query: string; + queries?: string[]; + matchMode?: SearchMatchMode; + counts: ReturnType; + matches: SearchMatchOutput[]; + offset: number; + headLimit: number; + nextOffset?: number; + hint?: string; +}): SearchResultOutput { + const slicedCounts = sliceResults(counts, offset, headLimit); + const totalMatchingLines = counts.reduce((total, count) => total + count.matchCount, 0); + const effectiveNextOffset = mode === 'content' ? nextOffset : slicedCounts.nextOffset; + return { + mode, + query, + queries, + matchMode, + totalMatchingFiles: counts.length, + totalMatchingLines, + files: slicedCounts.items, + matches, + truncated: slicedCounts.truncated || effectiveNextOffset !== undefined, + appliedLimit: + (mode === 'content' && effectiveNextOffset !== undefined) || slicedCounts.truncated + ? headLimit + : undefined, + appliedOffset: offset > 0 ? offset : undefined, + nextOffset: effectiveNextOffset, + hint, + }; +} + +function sliceResults(items: T[], offset: number, headLimit: number) { + const sliced = headLimit === 0 ? items.slice(offset) : items.slice(offset, offset + headLimit); + return { + items: sliced, + truncated: offset + sliced.length < items.length, + nextOffset: offset + sliced.length < items.length ? offset + sliced.length : undefined, + }; +} + +function buildSearchHint( + mode: SearchOutputMode, + sliced: { nextOffset?: number; truncated: boolean }, + headLimit: number, +) { + if (sliced.nextOffset !== undefined) { + return `Additional ${mode === 'files_with_matches' ? 'files' : mode === 'count' ? 'counts' : 'matches'} omitted. Continue with offset=${sliced.nextOffset} and head_limit=${headLimit}, or ${mode === 'content' ? 
'read one of the returned ranges' : 'switch to output_mode=content after choosing a file'}.`; + } + if (mode === 'content') return 'Use read with the suggested line ranges for grounded citations.'; + if (mode === 'count') return 'Use output_mode=content after choosing a file or exact phrase.'; + return 'Use read on a matching file or switch to output_mode=content for line anchors.'; +} + +function formatSearchFiles( + counts: ReturnType, + offset: number, + headLimit: number, +) { + const sliced = sliceResults(counts, offset, headLimit); + const lines = sliced.items.map((file) => file.fileName); + if (sliced.truncated) lines.push(buildSearchHint('files_with_matches', sliced, headLimit)); + return lines.length > 0 ? `${lines.join('\n')}\n` : ''; +} + +function formatSearchCounts( + counts: ReturnType, + offset: number, + headLimit: number, +) { + const sliced = sliceResults(counts, offset, headLimit); + const lines = sliced.items.map((file) => `${file.fileName}: ${file.matchCount}`); + if (sliced.truncated) lines.push(buildSearchHint('count', sliced, headLimit)); + return lines.length > 0 ? `${lines.join('\n')}\n` : ''; +} + +function formatSearchMatches( + matches: SearchMatchOutput[], + sliced: { nextOffset?: number; truncated: boolean }, + headLimit: number, +) { + const lines = matches.map( + (match) => + `${match.fileName}:${match.lineNumber}:${match.text} (read ${match.readRange.start}-${match.readRange.end})`, + ); + if (sliced.truncated) lines.push(buildSearchHint('content', sliced, headLimit)); + return lines.length > 0 ? 
`${lines.join('\n')}\n` : ''; +} + +function parseGrepLine(line: string) { + const match = + /^(?.*)(?[:-])(?\d+)(?[:-])/.exec(line); + if (!match?.groups) return undefined; + return { + filePath: normaliseGrepPath(match.groups.filePath), + isMatch: match.groups.separator === ':' && match.groups.contentSeparator === ':', + lineNumber: Number(match.groups.lineNumber), + contentStartIndex: match[0].length, + }; +} + +function normaliseGrepPath(filePath: string) { + return filePath.startsWith('./') ? filePath.slice(2) : filePath; +} diff --git a/packages/cli/src/modules/agents/tools/knowledge/tool.ts b/packages/cli/src/modules/agents/tools/knowledge/tool.ts new file mode 100644 index 00000000000..70d316bf208 --- /dev/null +++ b/packages/cli/src/modules/agents/tools/knowledge/tool.ts @@ -0,0 +1,165 @@ +import { Tool } from '@n8n/agents/tool'; +import { createHash } from 'node:crypto'; + +import type { AgentKnowledgeCommandService } from '../../agent-knowledge-command.service'; +import type { AgentKnowledgeService } from '../../agent-knowledge.service'; + +import { aggregateCsv, distinctCsv, profileCsv, queryCsv } from './csv.operation'; +import { getRequiredFileReferences, type WorkspaceFiles } from './file-references'; +import { runReadOperation } from './read.operation'; +import { runSearchOperation } from './search.operation'; +import { + getSearchKnowledgeOperation, + parseSearchKnowledgeInput, + searchKnowledgeInputSchema, + searchKnowledgeOutputSchema, + type ParsedSearchKnowledgeInput, + type SearchKnowledgeOutput, +} from './schemas'; + +export function createSearchKnowledgeTool({ + agentId, + projectId, + knowledgeService, + commandService, +}: { + agentId: string; + projectId: string; + knowledgeService: AgentKnowledgeService; + commandService: AgentKnowledgeCommandService; +}) { + return new Tool('search_knowledge') + .description( + 'List, read, search, and query files uploaded to this agent knowledge base. 
' + + 'Use this when the user asks about uploaded documents or facts likely contained in them.', + ) + .systemInstruction( + 'Use search_knowledge to inspect uploaded knowledge files. Do not claim a file says something ' + + 'unless you found it via list, search, read, or a CSV operation. Search defaults to output_mode=files_with_matches. ' + + 'Use output_mode=count for counts and output_mode=content only after narrowing to a file or exact phrase. ' + + 'For conceptual multi-term lookup, use queries with match_mode instead of writing regex by hand. ' + + 'Use read for grounded citations. Cite only file names and line ranges from read results. ' + + 'Never mention uploaded file ids, relative paths, binary ids, or storage ids to users. ' + + 'For unfamiliar CSVs, call csv_profile first. Use csv_query for rows, csv_distinct for possible values, and csv_aggregate for counts or numeric calculations. ' + + 'Do not answer from the first CSV row when rowCount is high or truncated; refine filters using ambiguity hints.', + ) + .input(searchKnowledgeInputSchema) + .output(searchKnowledgeOutputSchema) + .handler(async (input: unknown): Promise => { + let parsedInput: ParsedSearchKnowledgeInput; + try { + parsedInput = parseSearchKnowledgeInput(input); + } catch (error) { + return { + operation: getSearchKnowledgeOperation(input), + files: [], + error: toToolErrorMessage(error), + }; + } + + if (parsedInput.operation === 'list') { + try { + return { + operation: 'list', + files: await knowledgeService.listWorkspaceFiles(agentId, projectId), + }; + } catch (error) { + return { + operation: 'list', + files: [], + error: toToolErrorMessage(error), + }; + } + } + + let files: WorkspaceFiles = []; + try { + const fileReferences = getRequiredFileReferences(parsedInput); + files = await knowledgeService.resolveWorkspaceFiles(agentId, projectId, fileReferences); + const cacheKey = buildWorkspaceCacheKey(projectId, agentId, files); + return await commandService.withCachedWorkspace( + 
cacheKey, + async (workspaceRoot) => { + await knowledgeService.materializeWorkspace(agentId, projectId, workspaceRoot, { + fileReferences, + }); + }, + async (workspaceRoot) => + await handleKnowledgeOperation(parsedInput, workspaceRoot, files, commandService), + ); + } catch (error) { + return { + operation: parsedInput.operation, + files, + error: toToolErrorMessage(error), + }; + } + }) + .build(); +} + +/** + * Stable cache key for a materialized workspace. Encodes the agent plus the + * exact set of files and their sizes, so a different file selection or an + * add/delete invalidates the cache and forces re-materialization. + */ +function buildWorkspaceCacheKey(projectId: string, agentId: string, files: WorkspaceFiles): string { + const signature = files + .map((file) => `${file.relativePath}:${file.fileSizeBytes}`) + .sort() + .join('|'); + return `${projectId}:${agentId}:${createHash('sha1').update(signature).digest('hex')}`; +} + +/** + * Build the user-facing error string returned to the model. Strips absolute + * filesystem paths so internal temp/storage locations never leak to the model + * (and onward to end users). + */ +function toToolErrorMessage(error: unknown): string { + const message = error instanceof Error ? 
error.message : String(error); + return message.replace(/(^|[\s'"(])\/(?:[^\s'"()]+\/)*[^\s'"()]+/g, '$1[path]'); +} + +async function handleKnowledgeOperation( + input: ParsedSearchKnowledgeInput, + workspaceRoot: string, + files: WorkspaceFiles, + commandService: AgentKnowledgeCommandService, +): Promise { + switch (input.operation) { + case 'list': + return { + operation: 'list', + files, + }; + case 'search': + return await runSearchOperation(input, workspaceRoot, files, commandService); + case 'read': + return await runReadOperation(input, workspaceRoot, files, commandService); + case 'csv_query': + return { + operation: 'csv_query', + files, + csv: await queryCsv(workspaceRoot, files, input), + }; + case 'csv_profile': + return { + operation: 'csv_profile', + files, + csvProfile: await profileCsv(workspaceRoot, files, input), + }; + case 'csv_distinct': + return { + operation: 'csv_distinct', + files, + csvDistinct: await distinctCsv(workspaceRoot, files, input), + }; + case 'csv_aggregate': + return { + operation: 'csv_aggregate', + files, + csvAggregate: await aggregateCsv(workspaceRoot, files, input), + }; + } +} diff --git a/packages/cli/src/services/__tests__/project.service.ee.test.ts b/packages/cli/src/services/__tests__/project.service.ee.test.ts index f4dd8251cfb..1dfdcfed10c 100644 --- a/packages/cli/src/services/__tests__/project.service.ee.test.ts +++ b/packages/cli/src/services/__tests__/project.service.ee.test.ts @@ -4,6 +4,7 @@ import { type Project, type ProjectRepository, type SharedCredentialsRepository, + type SharedWorkflowRepository, type ProjectRelationRepository, type SharedCredentials, PROJECT_ADMIN_ROLE, @@ -12,18 +13,24 @@ import { PROJECT_OWNER_ROLE_SLUG } from '@n8n/permissions'; import type { EntityManager } from '@n8n/typeorm'; import { mock } from 'jest-mock-extended'; +import type { AgentKnowledgeService } from '@/modules/agents/agent-knowledge.service'; +import type { AgentRepository } from 
'@/modules/agents/repositories/agent.repository'; + import { ProjectService } from '../project.service.ee'; import type { RoleService } from '../role.service'; describe('ProjectService', () => { const manager = mock(); - const projectRepository = mock(); + const sharedWorkflowRepository = mock(); + const projectRepository = mock({ manager }); const projectRelationRepository = mock({ manager }); const roleService = mock(); const sharedCredentialsRepository = mock(); const moduleRegistry = mock({ entities: [] }); + const agentRepository = mock(); + const agentKnowledgeService = mock(); const projectService = new ProjectService( - mock(), + sharedWorkflowRepository, projectRepository, projectRelationRepository, roleService, @@ -32,13 +39,13 @@ describe('ProjectService', () => { moduleRegistry, ); + beforeEach(() => { + jest.clearAllMocks(); + }); + describe('getAccessibleProjectsAndCount', () => { const options = { skip: 0, take: 10, search: 'test' }; - beforeEach(() => { - jest.clearAllMocks(); - }); - it('should call findAllProjectsAndCount for admin users', async () => { const adminUser = { id: 'admin-user', @@ -113,7 +120,6 @@ describe('ProjectService', () => { ]; beforeEach(() => { - jest.clearAllMocks(); manager.transaction.mockImplementation(async (arg1: unknown, arg2?: unknown) => { const runInTransaction = (arg2 ?? arg1) as ( entityManager: EntityManager, @@ -199,7 +205,6 @@ describe('ProjectService', () => { ]; beforeEach(() => { - jest.clearAllMocks(); manager.transaction.mockImplementation(async (arg1: unknown, arg2?: unknown) => { const runInTransaction = (arg2 ?? 
arg1) as ( entityManager: EntityManager, @@ -271,4 +276,69 @@ describe('ProjectService', () => { }); }); }); + + describe('deleteProject', () => { + it('cleans agent knowledge files before project deletion cascades agent files', async () => { + const user = { id: 'user-1', role: { scopes: [{ slug: 'project:delete' }] } } as any; + const project = mock({ id: 'project-1', type: 'team' }); + Object.defineProperty(projectService, 'workflowService', { + configurable: true, + get: async () => ({ delete: jest.fn() }), + }); + Object.defineProperty(projectService, 'credentialsService', { + configurable: true, + get: async () => ({ delete: jest.fn() }), + }); + Object.defineProperty(projectService, 'agentRepository', { + configurable: true, + get: async () => agentRepository, + }); + Object.defineProperty(projectService, 'agentKnowledgeService', { + configurable: true, + get: async () => agentKnowledgeService, + }); + manager.findOne.mockResolvedValueOnce(project); + projectRepository.remove.mockResolvedValueOnce(project); + sharedWorkflowRepository.find.mockResolvedValueOnce([]); + sharedCredentialsRepository.find.mockResolvedValueOnce([]); + moduleRegistry.isActive.mockImplementation((moduleName) => moduleName === 'agents'); + agentRepository.findByProjectId.mockResolvedValueOnce([ + { id: 'agent-1' }, + { id: 'agent-2' }, + ] as never); + + await projectService.deleteProject(user, project.id); + + expect(agentRepository.findByProjectId).toHaveBeenCalledWith(project.id); + expect(agentKnowledgeService.deleteAllFilesForAgent).toHaveBeenCalledWith('agent-1'); + expect(agentKnowledgeService.deleteAllFilesForAgent).toHaveBeenCalledWith('agent-2'); + expect(agentKnowledgeService.deleteAllFilesForAgent.mock.invocationCallOrder[1]).toBeLessThan( + projectRepository.remove.mock.invocationCallOrder[0], + ); + }); + + it('skips agent knowledge cleanup when the agents module is inactive', async () => { + const user = { id: 'user-1', role: { scopes: [{ slug: 'project:delete' }] } } 
as any; + const project = mock({ id: 'project-1', type: 'team' }); + Object.defineProperty(projectService, 'workflowService', { + configurable: true, + get: async () => ({ delete: jest.fn() }), + }); + Object.defineProperty(projectService, 'credentialsService', { + configurable: true, + get: async () => ({ delete: jest.fn() }), + }); + manager.findOne.mockResolvedValueOnce(project); + projectRepository.remove.mockResolvedValueOnce(project); + sharedWorkflowRepository.find.mockResolvedValueOnce([]); + sharedCredentialsRepository.find.mockResolvedValueOnce([]); + moduleRegistry.isActive.mockReturnValue(false); + + await projectService.deleteProject(user, project.id); + + expect(agentRepository.findByProjectId).not.toHaveBeenCalled(); + expect(agentKnowledgeService.deleteAllFilesForAgent).not.toHaveBeenCalled(); + expect(projectRepository.remove).toHaveBeenCalledWith(project); + }); + }); }); diff --git a/packages/cli/src/services/project.service.ee.ts b/packages/cli/src/services/project.service.ee.ts index 5272ee590e8..41937eea758 100644 --- a/packages/cli/src/services/project.service.ee.ts +++ b/packages/cli/src/services/project.service.ee.ts @@ -105,6 +105,18 @@ export class ProjectService { ); } + private get agentRepository() { + return import('@/modules/agents/repositories/agent.repository').then(({ AgentRepository }) => + Container.get(AgentRepository), + ); + } + + private get agentKnowledgeService() { + return import('@/modules/agents/agent-knowledge.service').then(({ AgentKnowledgeService }) => + Container.get(AgentKnowledgeService), + ); + } + async deleteProject( user: User, projectId: string, @@ -206,10 +218,22 @@ export class ProjectService { await secretsProvidersConnectionsService.cleanupConnectionsForProjectDeletion(project.id); } - // 8. delete project + // 8. delete agent knowledge files before project removal cascades delete agent_files rows. 
+ if (this.moduleRegistry.isActive('agents')) { + const [agentRepository, agentKnowledgeService] = await Promise.all([ + this.agentRepository, + this.agentKnowledgeService, + ]); + const agents = await agentRepository.findByProjectId(project.id); + for (const agent of agents) { + await agentKnowledgeService.deleteAllFilesForAgent(agent.id); + } + } + + // 9. delete project await this.projectRepository.remove(project); - // 9. delete project relations + // 10. delete project relations // Cascading deletes take care of this. } diff --git a/packages/frontend/@n8n/i18n/src/locales/en.json b/packages/frontend/@n8n/i18n/src/locales/en.json index 8b82814fb41..dbaf63352d7 100644 --- a/packages/frontend/@n8n/i18n/src/locales/en.json +++ b/packages/frontend/@n8n/i18n/src/locales/en.json @@ -6073,6 +6073,7 @@ "agents.chat.misconfigured.dismiss": "Dismiss", "agents.chat.askCredential.skip": "Skip", "agents.chat.toolNames.webSearch": "Web search", + "agents.chat.toolNames.searchKnowledge": "Search knowledge", "agents.chat.askQuestion.otherLabel": "Other", "agents.chat.askQuestion.otherPlaceholder": "Type another answer", "agents.chat.askQuestion.submit": "Submit", @@ -6243,11 +6244,35 @@ "agents.builder.memory.episodicMemory.label": "Episodic Memory", "agents.builder.memory.episodicMemory.hint": "Stores source-backed memories from previous conversations. Requires OpenAI credential.", "agents.builder.memory.episodicMemory.changeCredential": "Change credential", + "agents.builder.files.title": "Knowledge base", + "agents.builder.files.description": "Add CSV, PDF, Markdown, or TXT files this agent can search and read. 
Upload up to {maxFiles} files at a time, {maxSizeMb} MB each.", + "agents.builder.files.count": "{count} file uploaded | {count} files uploaded", + "agents.builder.files.upload": "Upload file", + "agents.builder.files.uploadFileTooLarge.title": "File too large", + "agents.builder.files.uploadFileTooLarge.message": "{name} is larger than {size} MB.", + "agents.builder.files.empty": "No files uploaded yet.", + "agents.builder.files.loading": "Loading files...", + "agents.builder.files.uploaded": "File uploaded", + "agents.builder.files.deleted": "File deleted", + "agents.builder.files.delete": "Delete file", + "agents.builder.files.type.csv": "CSV", + "agents.builder.files.type.pdf": "PDF", + "agents.builder.files.type.markdown": "Markdown", + "agents.builder.files.type.txt": "TXT", + "agents.builder.files.type.file": "File", + "agents.builder.files.deleteModal.title": "Delete {name}?", + "agents.builder.files.deleteModal.description": "This removes {name} from the agent's knowledge files.", + "agents.builder.files.deleteModal.button.delete": "Delete file", + "agents.builder.files.loadError": "Could not load files", + "agents.builder.files.uploadError": "Could not upload file", + "agents.builder.files.deleteError": "Could not delete file", + "agents.builder.files.size.bytes": "{bytes} B", + "agents.builder.files.size.kilobytes": "{kilobytes} KB", + "agents.builder.files.size.megabytes": "{megabytes} MB", "agents.builder.memory.recallModel.label": "Memory model", "agents.builder.memory.recallModel.hint": "Choose the model that creates, reviews, and retrieves memories. 
Uses the agent model by default.", "agents.builder.episodicMemoryCredentialModal.title": "Episodic Memory", "agents.builder.episodicMemoryCredentialModal.description": "An OpenAI credential is used to create embeddings for Episodic Memory.", - "agents.builder.memory.semanticRecall.topK": "Top K", "agents.builder.memory.semanticRecall.rangeBefore": "Range before", "agents.builder.memory.semanticRecall.rangeAfter": "Range after", diff --git a/packages/frontend/editor-ui/src/app/stores/settings.store.ts b/packages/frontend/editor-ui/src/app/stores/settings.store.ts index e5627018a14..61fed174e62 100644 --- a/packages/frontend/editor-ui/src/app/stores/settings.store.ts +++ b/packages/frontend/editor-ui/src/app/stores/settings.store.ts @@ -180,6 +180,10 @@ export const useSettingsStore = defineStore(STORES.SETTINGS, () => { isAgentModuleActive('node-tools-searcher'), ); + // Opt-in flag: the `knowledge-base` token must be listed in the backend + // `N8N_AGENTS_MODULES` env var for this to evaluate true. + const isAgentsKnowledgeBaseFeatureEnabled = computed(() => isAgentModuleActive('knowledge-base')); + const isPublicChatTriggerDisabled = computed( () => settings.value.chatTrigger?.disablePublicChat ?? 
false, ); @@ -475,6 +479,7 @@ export const useSettingsStore = defineStore(STORES.SETTINGS, () => { isChatFeatureEnabled, isOtelEnabled, isAgentsNodeToolsFeatureEnabled, + isAgentsKnowledgeBaseFeatureEnabled, isPublicChatTriggerDisabled, }; }); diff --git a/packages/frontend/editor-ui/src/features/agents/__tests__/AgentBuilder.readonly.test.ts b/packages/frontend/editor-ui/src/features/agents/__tests__/AgentBuilder.readonly.test.ts index 6220889b172..1edd71b53d0 100644 --- a/packages/frontend/editor-ui/src/features/agents/__tests__/AgentBuilder.readonly.test.ts +++ b/packages/frontend/editor-ui/src/features/agents/__tests__/AgentBuilder.readonly.test.ts @@ -25,6 +25,10 @@ describe('AgentBuilderEditorColumn — childrenDisabled composes streaming and c agent: null, projectId: 'p1', agentId: 'a1', + agentFiles: [], + agentFilesLoading: false, + agentFilesUploading: false, + knowledgeBaseEnabled: true, appliedSkills: [], connectedTriggers: [], isBuildChatStreaming: false, @@ -52,6 +56,11 @@ describe('AgentBuilderEditorColumn — childrenDisabled composes streaming and c template: '
', props: ['config', 'disabled', 'embedded'], }, + AgentFilesPanel: { + name: 'AgentFilesPanel', + template: '
', + props: ['files', 'disabled', 'loading', 'uploading'], + }, AgentAdvancedPanel: { name: 'AgentAdvancedPanel', template: '
', @@ -105,6 +114,10 @@ describe('AgentBuilderEditorColumn — childrenDisabled composes streaming and c agent: null, projectId: 'p1', agentId: 'a1', + agentFiles: [], + agentFilesLoading: false, + agentFilesUploading: false, + knowledgeBaseEnabled: true, appliedSkills: [], connectedTriggers: [], isBuildChatStreaming: false, diff --git a/packages/frontend/editor-ui/src/features/agents/__tests__/AgentBuilderEditorColumn.spec.ts b/packages/frontend/editor-ui/src/features/agents/__tests__/AgentBuilderEditorColumn.spec.ts index 61829a22fb3..f1f5ae8ab56 100644 --- a/packages/frontend/editor-ui/src/features/agents/__tests__/AgentBuilderEditorColumn.spec.ts +++ b/packages/frontend/editor-ui/src/features/agents/__tests__/AgentBuilderEditorColumn.spec.ts @@ -17,10 +17,14 @@ vi.mock('@n8n/i18n', () => ({ })); vi.mock('@n8n/design-system', () => ({ + N8nActionBox: { template: '
', props: ['icon', 'description'] }, N8nCard: { template: '
', props: ['variant'] }, N8nHeading: { template: '

', props: ['size'] }, + N8nIcon: { template: '', props: ['icon', 'size'] }, N8nIconButton: { template: '' }, + N8nLoading: { template: '
', props: ['rows', 'variant'] }, N8nRadioButtons: { template: '
', props: ['modelValue', 'options'] }, + N8nScrollArea: { template: '
', props: ['maxHeight', 'type'] }, N8nSwitch: { template: '' }, N8nText: { template: '', props: ['tag', 'bold', 'size', 'color'] }, N8nTooltip: { template: '
' }, @@ -43,6 +47,10 @@ async function mountColumn() { agent: null, projectId: 'project-1', agentId: 'agent-1', + agentFiles: [], + agentFilesLoading: false, + agentFilesUploading: false, + knowledgeBaseEnabled: true, appliedSkills: [], connectedTriggers: [], isBuildChatStreaming: false, diff --git a/packages/frontend/editor-ui/src/features/agents/__tests__/useAgentChatStream.test.ts b/packages/frontend/editor-ui/src/features/agents/__tests__/useAgentChatStream.test.ts index fbc0e26989c..d0121a6bcbd 100644 --- a/packages/frontend/editor-ui/src/features/agents/__tests__/useAgentChatStream.test.ts +++ b/packages/frontend/editor-ui/src/features/agents/__tests__/useAgentChatStream.test.ts @@ -44,13 +44,19 @@ function buildHook() { describe('useAgentChatStream — SDK-aligned event handling', () => { let originalFetch: typeof fetch; + let originalLocalStorage: typeof globalThis.localStorage | undefined; beforeEach(() => { originalFetch = globalThis.fetch; + originalLocalStorage = globalThis.localStorage; + vi.stubGlobal('localStorage', { + getItem: vi.fn(() => ''), + }); }); afterEach(() => { globalThis.fetch = originalFetch; + vi.stubGlobal('localStorage', originalLocalStorage); vi.restoreAllMocks(); }); diff --git a/packages/frontend/editor-ui/src/features/agents/components/AgentBuilderEditorColumn.vue b/packages/frontend/editor-ui/src/features/agents/components/AgentBuilderEditorColumn.vue index 1dfd1191c8a..75556a78a85 100644 --- a/packages/frontend/editor-ui/src/features/agents/components/AgentBuilderEditorColumn.vue +++ b/packages/frontend/editor-ui/src/features/agents/components/AgentBuilderEditorColumn.vue @@ -2,6 +2,7 @@ import { computed } from 'vue'; import { N8nCard, N8nRadioButtons } from '@n8n/design-system'; import { useI18n } from '@n8n/i18n'; +import type { AgentFileDto } from '@n8n/api-types'; import type { AgentBuilderMainTab } from '../composables/useAgentBuilderMainTabs'; import type { AgentJsonConfig, AgentResource, AgentSkill } from '../types'; @@ 
-12,6 +13,7 @@ import AgentCapabilitiesSection from './AgentCapabilitiesSection.vue'; import AgentIdentityHeader from './AgentIdentityHeader.vue'; import AgentInfoPanel from './AgentInfoPanel.vue'; import AgentJsonEditor from './AgentJsonEditor.vue'; +import AgentFilesPanel from './AgentFilesPanel.vue'; import AgentMemoryPanel from './AgentMemoryPanel.vue'; import AgentPanelHeader from './AgentPanelHeader.vue'; @@ -22,6 +24,11 @@ const props = defineProps<{ agent: AgentResource | null; projectId: string; agentId: string; + agentFiles: AgentFileDto[]; + agentFilesLoading: boolean; + agentFilesUploading: boolean; + knowledgeBaseEnabled: boolean; + deletingAgentFileId?: string | null; appliedSkills: Array<{ id: string; skill: AgentSkill }>; connectedTriggers: string[]; isBuildChatStreaming: boolean; @@ -42,6 +49,8 @@ const emit = defineEmits<{ 'add-trigger': []; 'remove-tool': [index: number]; 'remove-skill': [id: string]; + 'upload-files': [files: File[]]; + 'delete-file': [file: AgentFileDto]; 'update:connected-triggers': [triggers: string[]]; 'trigger-added': [payload: { triggerType: string; triggers: string[] }]; }>(); @@ -127,6 +136,19 @@ const i18n = useI18n(); /> + + + + +import { computed, useTemplateRef } from 'vue'; +import { + N8nActionBox, + N8nCard, + N8nIcon, + N8nIconButton, + N8nLoading, + N8nScrollArea, + N8nText, + N8nTooltip, +} from '@n8n/design-system'; +import { useI18n } from '@n8n/i18n'; +import { + ALLOWED_AGENT_FILE_EXTENSIONS, + MAX_AGENT_FILE_SIZE_MB, + MAX_AGENT_FILES_PER_UPLOAD, + type AgentFileDto, +} from '@n8n/api-types'; + +const props = withDefaults( + defineProps<{ + files: AgentFileDto[]; + disabled?: boolean; + loading?: boolean; + uploading?: boolean; + deletingFileId?: string | null; + }>(), + { + disabled: false, + loading: false, + uploading: false, + deletingFileId: null, + }, +); + +const emit = defineEmits<{ + 'upload-files': [files: File[]]; + 'delete-file': [file: AgentFileDto]; +}>(); + +const i18n = useI18n(); +const 
fileInput = useTemplateRef('fileInput'); +const totalCount = computed(() => props.files.length); +const isMutating = computed(() => props.uploading || props.deletingFileId !== null); +const isUploadDisabled = computed(() => props.disabled || props.loading || isMutating.value); + +const acceptAttr = ALLOWED_AGENT_FILE_EXTENSIONS.join(','); +const description = computed(() => + i18n.baseText('agents.builder.files.description', { + interpolate: { maxFiles: MAX_AGENT_FILES_PER_UPLOAD, maxSizeMb: MAX_AGENT_FILE_SIZE_MB }, + }), +); + +function getFileIcon(file: AgentFileDto) { + const extension = file.fileName.split('.').pop()?.toLowerCase(); + if (extension === 'csv' || file.mimeType === 'text/csv') return 'file-code'; + if (extension === 'pdf') return 'file'; + if (extension === 'md' || extension === 'markdown' || file.mimeType === 'text/markdown') { + return 'scroll-text'; + } + if (extension === 'txt' || file.mimeType === 'text/plain') return 'file-text'; + return 'file'; +} + +function getFileType(fileName: string) { + const extension = fileName.split('.').pop()?.toLowerCase(); + if (extension === 'csv') return i18n.baseText('agents.builder.files.type.csv'); + if (extension === 'pdf') return i18n.baseText('agents.builder.files.type.pdf'); + if (extension === 'md' || extension === 'markdown') { + return i18n.baseText('agents.builder.files.type.markdown'); + } + if (extension === 'txt') return i18n.baseText('agents.builder.files.type.txt'); + return i18n.baseText('agents.builder.files.type.file'); +} + +function formatFileSize(bytes: number) { + if (bytes < 1024) + return i18n.baseText('agents.builder.files.size.bytes', { interpolate: { bytes } }); + const kilobytes = bytes / 1024; + if (kilobytes < 1024) { + return i18n.baseText('agents.builder.files.size.kilobytes', { + interpolate: { kilobytes: kilobytes.toFixed(1) }, + }); + } + const megabytes = kilobytes / 1024; + return i18n.baseText('agents.builder.files.size.megabytes', { + interpolate: { megabytes: 
megabytes.toFixed(1) }, + }); +} + +function openFilePicker() { + if (isUploadDisabled.value) return; + fileInput.value?.click(); +} + +function onFilesSelected(event: Event) { + const input = event.target; + if (!(input instanceof HTMLInputElement)) return; + const selectedFiles = Array.from(input.files ?? []); + input.value = ''; + if (selectedFiles.length === 0) return; + + emit('upload-files', selectedFiles); +} + + + + + diff --git a/packages/frontend/editor-ui/src/features/agents/components/AgentPanelHeader.vue b/packages/frontend/editor-ui/src/features/agents/components/AgentPanelHeader.vue index 0eb32f431d9..902a0ed2d0c 100644 --- a/packages/frontend/editor-ui/src/features/agents/components/AgentPanelHeader.vue +++ b/packages/frontend/editor-ui/src/features/agents/components/AgentPanelHeader.vue @@ -9,8 +9,15 @@ defineProps<{ @@ -22,4 +29,24 @@ defineProps<{ flex: 1; min-width: 0; } + +.row { + display: flex; + align-items: flex-start; + gap: var(--spacing--xs); +} + +.copy { + display: flex; + flex-direction: column; + gap: var(--spacing--4xs); + flex: 1; + min-width: 0; +} + +.actions { + display: flex; + align-items: center; + flex-shrink: 0; +} diff --git a/packages/frontend/editor-ui/src/features/agents/composables/agentChatMessages.ts b/packages/frontend/editor-ui/src/features/agents/composables/agentChatMessages.ts index 783ef252e55..6743bb6077c 100644 --- a/packages/frontend/editor-ui/src/features/agents/composables/agentChatMessages.ts +++ b/packages/frontend/editor-ui/src/features/agents/composables/agentChatMessages.ts @@ -21,6 +21,7 @@ import { import { CHAT_MESSAGE_STATUS, TOOL_CALL_STATE } from '../constants'; import type { ChatMessageStatus, ToolCallState } from '../constants'; +import { summariseToolCall } from '../utils/interactive-summary'; export { type ChatMessageStatus, type ToolCallState }; // --------------------------------------------------------------------------- @@ -301,6 +302,7 @@ export function convertDbMessages(dbMessages: 
AgentPersistedMessageDto[]): ChatM input: part.input, ...(output !== undefined && { output }), state, + displaySummary: summariseToolCall(part.toolName, output, part.input), }); } } diff --git a/packages/frontend/editor-ui/src/features/agents/composables/useAgentApi.ts b/packages/frontend/editor-ui/src/features/agents/composables/useAgentApi.ts index 34ef7a41726..1a49a03b102 100644 --- a/packages/frontend/editor-ui/src/features/agents/composables/useAgentApi.ts +++ b/packages/frontend/editor-ui/src/features/agents/composables/useAgentApi.ts @@ -1,5 +1,6 @@ import type { AgentBuilderMessagesResponse, + AgentFileDto, AgentIntegrationStatusResponse, AgentPersistedMessageDto, AgentSkill, @@ -73,6 +74,50 @@ export const deleteAgent = async ( await makeRestApiRequest(context, 'DELETE', `/projects/${projectId}/agents/v2/${agentId}`); }; +export const listAgentFiles = async ( + context: IRestApiContext, + projectId: string, + agentId: string, +): Promise => { + return await makeRestApiRequest( + context, + 'GET', + `/projects/${projectId}/agents/v2/${agentId}/files`, + ); +}; + +export const uploadAgentFiles = async ( + context: IRestApiContext, + projectId: string, + agentId: string, + files: File[], +): Promise => { + const formData = new FormData(); + for (const file of files) { + formData.append('files', file); + } + + return await makeRestApiRequest( + context, + 'POST', + `/projects/${projectId}/agents/v2/${agentId}/files`, + formData, + ); +}; + +export const deleteAgentFile = async ( + context: IRestApiContext, + projectId: string, + agentId: string, + fileId: string, +): Promise => { + await makeRestApiRequest( + context, + 'DELETE', + `/projects/${projectId}/agents/v2/${agentId}/files/${fileId}`, + ); +}; + export const connectIntegration = async ( context: IRestApiContext, projectId: string, diff --git a/packages/frontend/editor-ui/src/features/agents/composables/useAgentChatStream.ts 
b/packages/frontend/editor-ui/src/features/agents/composables/useAgentChatStream.ts index 97d9ee81a1a..e80b87302b9 100644 --- a/packages/frontend/editor-ui/src/features/agents/composables/useAgentChatStream.ts +++ b/packages/frontend/editor-ui/src/features/agents/composables/useAgentChatStream.ts @@ -23,7 +23,7 @@ import { type ToolCall, } from './agentChatMessages'; import { CHAT_MESSAGE_STATUS, TOOL_CALL_STATE } from '../constants'; -import { summariseInteractiveOutput } from '../utils/interactive-summary'; +import { summariseToolCall } from '../utils/interactive-summary'; export interface FatalAgentError { message: string; @@ -252,9 +252,15 @@ export function useAgentChatStream(params: UseAgentChatStreamParams) { toolCallId: event.toolCallId, input: event.input, state: TOOL_CALL_STATE.PENDING, + displaySummary: summariseToolCall(event.toolName, undefined, event.input), }); } else { existing.input = event.input; + existing.displaySummary = summariseToolCall( + existing.tool, + existing.output, + existing.input, + ); if ( existing.state !== TOOL_CALL_STATE.RUNNING && existing.state !== TOOL_CALL_STATE.DONE @@ -280,11 +286,7 @@ export function useAgentChatStream(params: UseAgentChatStreamParams) { if (found) { found.tc.output = event.output; found.tc.state = event.isError ? TOOL_CALL_STATE.ERROR : TOOL_CALL_STATE.DONE; - found.tc.displaySummary = summariseInteractiveOutput( - found.tc.tool, - event.output, - found.tc.input, - ); + found.tc.displaySummary = summariseToolCall(found.tc.tool, event.output, found.tc.input); // If this was an interactive tool call, the result IS the user's // resume payload — refresh the card so it flips to its resolved // (disabled) state immediately. No separate "resumed" event needed. 
@@ -523,7 +525,7 @@ export function useAgentChatStream(params: UseAgentChatStreamParams) { if (found) { found.tc.state = TOOL_CALL_STATE.DONE; found.tc.output = payload.resumeData; - found.tc.displaySummary = summariseInteractiveOutput( + found.tc.displaySummary = summariseToolCall( found.tc.tool, payload.resumeData, found.tc.input, diff --git a/packages/frontend/editor-ui/src/features/agents/utils/interactive-summary.ts b/packages/frontend/editor-ui/src/features/agents/utils/interactive-summary.ts index 83b283ec694..f571ef8c728 100644 --- a/packages/frontend/editor-ui/src/features/agents/utils/interactive-summary.ts +++ b/packages/frontend/editor-ui/src/features/agents/utils/interactive-summary.ts @@ -55,3 +55,11 @@ export function summariseInteractiveOutput( return undefined; } + +export function summariseToolCall( + toolName: string, + output?: unknown, + input?: unknown, +): string | undefined { + return summariseInteractiveOutput(toolName, output, input); +} diff --git a/packages/frontend/editor-ui/src/features/agents/utils/toolDisplayName.ts b/packages/frontend/editor-ui/src/features/agents/utils/toolDisplayName.ts index bea4f8fcdb7..8feb9f3d167 100644 --- a/packages/frontend/editor-ui/src/features/agents/utils/toolDisplayName.ts +++ b/packages/frontend/editor-ui/src/features/agents/utils/toolDisplayName.ts @@ -1,13 +1,17 @@ import type { BaseTextKey } from '@n8n/i18n'; export const WEB_SEARCH_TOOL_NAME_KEY: BaseTextKey = 'agents.chat.toolNames.webSearch'; +export const SEARCH_KNOWLEDGE_TOOL_NAME_KEY: BaseTextKey = 'agents.chat.toolNames.searchKnowledge'; const WEB_SEARCH_TOOL_NAME_PATTERN = /^(?:web_search|(?:anthropic|openai)\.web_search(?:_\d{8})?)$/; +const SEARCH_KNOWLEDGE_TOOL_NAME = 'search_knowledge'; export function getToolNameTranslationKey(toolName: string | undefined): BaseTextKey | undefined { const trimmed = toolName?.trim(); if (!trimmed) return undefined; + if (trimmed === SEARCH_KNOWLEDGE_TOOL_NAME) return SEARCH_KNOWLEDGE_TOOL_NAME_KEY; + 
return WEB_SEARCH_TOOL_NAME_PATTERN.test(trimmed) ? WEB_SEARCH_TOOL_NAME_KEY : undefined; } diff --git a/packages/frontend/editor-ui/src/features/agents/views/AgentBuilderView.vue b/packages/frontend/editor-ui/src/features/agents/views/AgentBuilderView.vue index 07827f31639..dd47fca0274 100644 --- a/packages/frontend/editor-ui/src/features/agents/views/AgentBuilderView.vue +++ b/packages/frontend/editor-ui/src/features/agents/views/AgentBuilderView.vue @@ -3,7 +3,12 @@ import { ref, computed, watch, nextTick, onBeforeUnmount, useTemplateRef } from import { useRoute, useRouter } from 'vue-router'; import { N8nResizeWrapper, type DropdownMenuItemProps } from '@n8n/design-system'; import { useI18n } from '@n8n/i18n'; -import { AGENT_SCHEDULE_TRIGGER_TYPE } from '@n8n/api-types'; +import { + AGENT_SCHEDULE_TRIGGER_TYPE, + MAX_AGENT_FILE_SIZE_BYTES, + MAX_AGENT_FILE_SIZE_MB, +} from '@n8n/api-types'; +import type { AgentFileDto } from '@n8n/api-types'; import { useRootStore } from '@n8n/stores/useRootStore'; import { useProjectsStore } from '@/features/collaboration/projects/projects.store'; import { useTelemetry } from '@/app/composables/useTelemetry'; @@ -11,6 +16,7 @@ import { useToast } from '@/app/composables/useToast'; import { useUIStore } from '@/app/stores/ui.store'; import { useNodeTypesStore } from '@/app/stores/nodeTypes.store'; import { useCredentialsStore } from '@/features/credentials/credentials.store'; +import { useSettingsStore } from '@/app/stores/settings.store'; import { useDocumentTitle } from '@/app/composables/useDocumentTitle'; import { LOCAL_STORAGE_AGENT_BUILDER_CHAT_PANEL_WIDTH, MODAL_CONFIRM } from '@/app/constants'; import { AI_MCP_TOOL_NODE_TYPE } from '@/app/constants/nodeTypes'; @@ -19,6 +25,9 @@ import { deepCopy } from 'n8n-workflow'; import { getAgent, deleteAgent, + listAgentFiles, + uploadAgentFiles, + deleteAgentFile, updateAgentSkill, createAgentSkill, } from '../composables/useAgentApi'; @@ -72,6 +81,11 @@ const telemetry = 
useTelemetry(); const sessionsStore = useAgentSessionsStore(); const uiStore = useUIStore(); const credentialsStore = useCredentialsStore(); +const settingsStore = useSettingsStore(); + +// Gates the entire knowledge base feature (files panel + fetching) behind the +// `knowledge-base` token in the backend N8N_AGENTS_MODULES env var. +const isKnowledgeBaseEnabled = computed(() => settingsStore.isAgentsKnowledgeBaseFeatureEnabled); const documentTitle = useDocumentTitle(); const { showError, showMessage } = useToast(); const { isBuilderConfigured, fetchStatus: fetchBuilderStatus } = useAgentBuilderStatus(); @@ -104,6 +118,10 @@ function onBuildChatStreamingChange(streaming: boolean) { const initialized = ref(false); const agentName = ref(''); const agent = ref(null); +const agentFiles = ref([]); +const agentFilesLoading = ref(false); +const agentFilesUploading = ref(false); +const deletingAgentFileId = ref(null); watch(agentName, (name) => { documentTitle.set(name || locale.baseText('agents.heading')); @@ -203,24 +221,130 @@ const projectName = computed(() => { return match?.name ?? null; }); +// A fetch/mutation captures its target agent + project at call time. By the +// time an awaited call resolves the user may have switched to a different agent +// or project, and applying the result would clobber the new selection's state. +// Callers use this guard to drop such stale results. +function isStaleAgentTarget(targetProjectId: string, targetAgentId: string): boolean { + return projectId.value !== targetProjectId || agentId.value !== targetAgentId; +} + async function fetchAgent( targetProjectId: string = projectId.value, targetAgentId: string = agentId.value, ) { - // Capture the target at call-time so a fetch that resolves after the - // user has switched to a different agent is dropped instead of clobbering - // the new agent's resource state. 
const data = await getAgent(rootStore.restApiContext, targetProjectId, targetAgentId); - if (agentId.value !== targetAgentId || projectId.value !== targetProjectId) return; + if (isStaleAgentTarget(targetProjectId, targetAgentId)) return; agent.value = data; agentName.value = data.name; } +async function fetchAgentFiles( + targetProjectId: string = projectId.value, + targetAgentId: string = agentId.value, +) { + if (!isKnowledgeBaseEnabled.value) return; + agentFilesLoading.value = true; + try { + const files = await listAgentFiles(rootStore.restApiContext, targetProjectId, targetAgentId); + if (isStaleAgentTarget(targetProjectId, targetAgentId)) return; + agentFiles.value = files; + } catch (error) { + showError(error, locale.baseText('agents.builder.files.loadError')); + } finally { + if (!isStaleAgentTarget(targetProjectId, targetAgentId)) { + agentFilesLoading.value = false; + } + } +} + +async function onUploadAgentFiles(files: File[]) { + if (files.length === 0) return; + const oversizedFiles = files.filter((file) => file.size > MAX_AGENT_FILE_SIZE_BYTES); + if (oversizedFiles.length > 0) { + showError( + new Error( + locale.baseText('agents.builder.files.uploadFileTooLarge.message', { + interpolate: { name: oversizedFiles[0].name, size: String(MAX_AGENT_FILE_SIZE_MB) }, + }), + ), + locale.baseText('agents.builder.files.uploadFileTooLarge.title'), + ); + } + const filesWithinLimit = files.filter((file) => file.size <= MAX_AGENT_FILE_SIZE_BYTES); + if (filesWithinLimit.length === 0) return; + + const targetProjectId = projectId.value; + const targetAgentId = agentId.value; + agentFilesUploading.value = true; + try { + const uploadedFiles = await uploadAgentFiles( + rootStore.restApiContext, + targetProjectId, + targetAgentId, + filesWithinLimit, + ); + if (isStaleAgentTarget(targetProjectId, targetAgentId)) return; + const existingById = new Map(agentFiles.value.map((file) => [file.id, file])); + for (const file of uploadedFiles) { + existingById.set(file.id, 
file); + } + agentFiles.value = Array.from(existingById.values()).sort( + (a, b) => new Date(b.createdAt).getTime() - new Date(a.createdAt).getTime(), + ); + showMessage({ + title: locale.baseText('agents.builder.files.uploaded'), + type: 'success', + }); + } catch (error) { + showError(error, locale.baseText('agents.builder.files.uploadError')); + } finally { + if (!isStaleAgentTarget(targetProjectId, targetAgentId)) { + agentFilesUploading.value = false; + } + } +} + +async function onDeleteAgentFile(file: AgentFileDto) { + if (deletingAgentFileId.value !== null) return; + + const confirmed = await openAgentConfirmationModal({ + title: locale.baseText('agents.builder.files.deleteModal.title', { + interpolate: { name: file.fileName }, + }), + description: locale.baseText('agents.builder.files.deleteModal.description', { + interpolate: { name: file.fileName }, + }), + confirmButtonText: locale.baseText('agents.builder.files.deleteModal.button.delete'), + cancelButtonText: locale.baseText('generic.cancel'), + }); + if (confirmed !== MODAL_CONFIRM) return; + + const targetProjectId = projectId.value; + const targetAgentId = agentId.value; + deletingAgentFileId.value = file.id; + try { + await deleteAgentFile(rootStore.restApiContext, targetProjectId, targetAgentId, file.id); + if (isStaleAgentTarget(targetProjectId, targetAgentId)) return; + agentFiles.value = agentFiles.value.filter((agentFile) => agentFile.id !== file.id); + showMessage({ + title: locale.baseText('agents.builder.files.deleted'), + type: 'success', + }); + } catch (error) { + showError(error, locale.baseText('agents.builder.files.deleteError')); + } finally { + if (deletingAgentFileId.value === file.id) { + deletingAgentFileId.value = null; + } + } +} + async function refreshAgentAfterIntegrationChange( targetProjectId: string = projectId.value, targetAgentId: string = agentId.value, ) { - if (projectId.value !== targetProjectId || agentId.value !== targetAgentId) return; + if 
(isStaleAgentTarget(targetProjectId, targetAgentId)) return; await Promise.all([ fetchAgent(targetProjectId, targetAgentId), fetchConfig(targetProjectId, targetAgentId), @@ -582,6 +706,10 @@ async function initialize() { activeChatSessionId.value = null; localConfig.value = null; connectedTriggers.value = []; + agentFiles.value = []; + agentFilesLoading.value = false; + agentFilesUploading.value = false; + deletingAgentFileId.value = null; // Refresh builder readiness so the empty-state CTA reflects the latest // admin configuration. Never blocks the rest of the load. @@ -589,8 +717,7 @@ async function initialize() { showError(error, locale.baseText('settings.agentBuilder.loadError')); }); - await fetchAgent(); - await fetchConfig(projectId.value, agentId.value); + await Promise.all([fetchAgent(), fetchConfig(projectId.value, agentId.value), fetchAgentFiles()]); builderTelemetry.captureToolsBaseline(); builderTelemetry.captureSkillsBaseline(); // Keep agent credential pickers aligned with the workflow editor: load only @@ -1040,6 +1167,11 @@ function onSwitchAgent(nextAgentId: string) { :agent="agent" :project-id="projectId" :agent-id="agentId" + :agent-files="agentFiles" + :agent-files-loading="agentFilesLoading" + :agent-files-uploading="agentFilesUploading" + :knowledge-base-enabled="isKnowledgeBaseEnabled" + :deleting-agent-file-id="deletingAgentFileId" :applied-skills="appliedSkills" :connected-triggers="connectedTriggers" :is-build-chat-streaming="isBuildChatStreaming" @@ -1053,6 +1185,8 @@ function onSwitchAgent(nextAgentId: string) { @add-tool="onOpenAddToolModal" @add-skill="onOpenAddSkillModal" @add-trigger="onOpenAddTriggerModal" + @upload-files="onUploadAgentFiles" + @delete-file="onDeleteAgentFile" @remove-tool="onRemoveTool" @remove-skill="onRemoveSkill" @update:connected-triggers="onConnectedTriggersUpdate" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e68a1b8ef08..50a6569b1f0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -489,7 +489,7 @@ 
overrides: date-fns: 2.30.0 date-fns-tz: 2.0.0 form-data: 4.0.4 - pdf-parse: ^2.4.5 + pdf-parse: 2.4.5 tmp: 0.2.4 nodemailer: 7.0.11 validator: 13.15.26 @@ -812,7 +812,7 @@ importers: specifier: 'catalog:' version: 1.2.30(@langchain/core@1.1.41(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10)))(@opentelemetry/api@1.9.0)(@opentelemetry/exporter-trace-otlp-proto@0.217.0(@opentelemetry/api@1.9.0))(@opentelemetry/sdk-trace-base@2.7.1(@opentelemetry/api@1.9.0))(openai@6.34.0(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod@3.25.67))(react-dom@18.2.0(react@18.2.0))(react@18.2.0)(vue@3.5.26(typescript@6.0.2))(ws@8.20.1(bufferutil@4.0.9)(utf-8-validate@5.0.10))(zod-to-json-schema@3.23.3(zod@3.25.67)) pdf-parse: - specifier: ^2.4.5 + specifier: 2.4.5 version: 2.4.5 proxy-from-env: specifier: ^1.1.0 @@ -1885,7 +1885,7 @@ importers: specifier: ^3.1.0 version: 3.1.0 pdf-parse: - specifier: ^2.4.5 + specifier: 2.4.5 version: 2.4.5 psl: specifier: 1.9.0 @@ -2953,6 +2953,9 @@ importers: fast-json-patch: specifier: 'catalog:' version: 3.1.1 + fastest-levenshtein: + specifier: 'catalog:' + version: 1.0.16 flat: specifier: 5.0.2 version: 5.0.2 @@ -3046,6 +3049,9 @@ importers: p-limit: specifier: ^3.1.0 version: 3.1.0 + pdf-parse: + specifier: 2.4.5 + version: 2.4.5 pg: specifier: 'catalog:' version: 8.17.0 @@ -7604,7 +7610,7 @@ packages: notion-to-md: ^3.1.0 officeparser: ^6.0.4 openai: '*' - pdf-parse: ^2.4.5 + pdf-parse: 2.4.5 pg: ^8.11.0 pg-copy-streams: ^7.0.0 pickleparser: ^0.2.1 diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index d0a8e090e88..1f95935ac2a 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -115,6 +115,7 @@ catalog: openai: 6.19.0 oxlint: ^1.61.0 oxlint-tsgolint: ^0.21.1 + pdf-parse: 2.4.5 pg: 
8.17.0 picocolors: 1.0.1 playwright-core: 1.60.0