// --------------------------------------------------------------------------- // Computer-use scenario runner. // // External-daemon mode: the daemon is expected to be already running. Per // scenario: surgical pre-clean of paths the scenario will seed or grade, // snapshot n8n resources, optionally seed a fixture workflow, run chat, // grade. We never restart or kill the daemon, and we don't post-clean files // on disk — the user inspects them and wipes the sandbox dir manually when // they want a clean slate. // // The n8n side (workflows / credentials / data tables) IS still cleaned via // snapshot+diff so the local n8n instance stays in the state it started in. // --------------------------------------------------------------------------- import { jsonParse } from 'n8n-workflow'; import { copyFile, mkdir, readFile, rm } from 'node:fs/promises'; import { dirname, resolve } from 'node:path'; import { runChat } from './chat'; import { cleanupDelta, snapshotResources } from './cleanup'; import type { DaemonInfo } from './daemon'; import { applyGrader } from './graders'; import { findFiles } from './graders/fs'; import { isContained } from './path-utils'; import type { GraderResult, Scenario, ScenarioResult, ScenarioTrace } from './types'; import type { N8nClient } from '../clients/n8n-client'; import type { EvalLogger } from '../harness/logger'; const DEFAULT_TIMEOUT_MS = 600_000; export interface RunScenarioOptions { client: N8nClient; scenario: Scenario; daemon: DaemonInfo; fixturesDir: string; logger: EvalLogger; timeoutMs?: number; /** When true, skip post-run cleanup of n8n state and chat threads (default: false). */ keepData?: boolean; } export async function runScenario(options: RunScenarioOptions): Promise { const { client, scenario, daemon, logger } = options; const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS; const sandboxDir = daemon.directory; logger.info(`[${scenario.id}] start (${scenario.category})`); await preClean(sandboxDir, scenario, logger); await seedFiles(sandboxDir, scenario, options.fixturesDir, logger); const before = await snapshotResources(client); let trace: ScenarioTrace | undefined; let runError: string | undefined; try { await maybeSeedWorkflow(client, scenario, options.fixturesDir, logger); trace = await runChat({ client, prompt: scenario.prompt, timeoutMs, logger }); } catch (error) { runError = error instanceof Error ? error.message : String(error); logger.error(`[${scenario.id}] run failed: ${runError}`); } const graderResults = trace ? await runGraders(scenario, trace, sandboxDir) : []; const pass = !runError && graderResults.every((r) => r.pass); for (const r of graderResults) { const tag = r.pass ? 'PASS' : 'FAIL'; const message = `[${scenario.id}] ${tag} ${r.grader.type}: ${r.reason}`; if (r.pass) { logger.verbose(message); } else { logger.info(message); } } if (!options.keepData) { await cleanupDelta(client, before, logger); if (trace?.threadId) { try { await client.deleteThread(trace.threadId); logger.verbose(`[${scenario.id}] deleted chat thread ${trace.threadId}`); } catch (error) { logger.verbose( `[${scenario.id}] failed to delete chat thread ${trace.threadId}: ${error instanceof Error ? error.message : String(error)}`, ); } } } else if (trace?.threadId) { logger.info(`[${scenario.id}] keeping chat thread ${trace.threadId} (--keep-data)`); } return { scenario, pass, graderResults, durationMs: trace?.durationMs ?? 0, toolCallCount: trace?.toolCalls.length ?? 0, toolCalls: (trace?.toolCalls ?? []).map((tc, i) => ({ name: tc.toolName, args: tc.args, argTokensEst: trace?.tokens.perCall[i]?.argTokensEst ?? 0, resultTokensEst: trace?.tokens.perCall[i]?.resultTokensEst ?? 0, })), tokens: trace?.tokens ?? { perCall: [], totalArgsEst: 0, totalResultsEst: 0, largestResultEst: 0, estimated: true, }, finalText: (trace?.finalText ?? '').slice(0, 4000), confirmations: trace?.confirmations ?? [], sandboxDir, error: runError, }; } // --------------------------------------------------------------------------- // Surgical pre-clean // // Deletes ONLY the paths this scenario is about to seed or grade. Anything // else in the daemon's working dir is left alone — important when the user // has unrelated files in the sandbox they care about. // --------------------------------------------------------------------------- async function preClean(sandboxDir: string, scenario: Scenario, logger: EvalLogger): Promise { const paths = new Set(); for (const seed of scenario.setup?.seedFiles ?? []) { paths.add(seed.to); } for (const grader of scenario.graders) { if (grader.type === 'fs.fileExists' || grader.type === 'fs.fileMatches') { const matches = await findFiles(sandboxDir, grader.glob); for (const m of matches) paths.add(m); } } for (const p of paths) { const full = resolveInside(sandboxDir, p, 'sandbox path'); await rm(full, { recursive: true, force: true }); } if (paths.size > 0) { logger.verbose(`[${scenario.id}] pre-cleaned ${String(paths.size)} path(s) under sandbox`); } } async function seedFiles( sandboxDir: string, scenario: Scenario, fixturesDir: string, logger: EvalLogger, ): Promise { const seeds = scenario.setup?.seedFiles ?? []; for (const seed of seeds) { const src = resolveInside(fixturesDir, seed.from, 'fixture path'); const dest = resolveInside(sandboxDir, seed.to, 'sandbox path'); await mkdir(dirname(dest), { recursive: true }); await copyFile(src, dest); } if (seeds.length > 0) { logger.verbose(`[${scenario.id}] seeded ${String(seeds.length)} file(s)`); } } function resolveFixture(fixturesDir: string, fixturePath: string): string { return resolveInside(fixturesDir, fixturePath, 'fixture path'); } /** * Join `candidate` onto `root` and assert the result stays within `root`. * Throws if the resolved path escapes (e.g. via `..`). Used to keep scenario * authors honest when declaring fixture paths and sandbox destinations. * * Exported for unit testing — keep the import surface narrow. */ export function resolveInside(root: string, candidate: string, label: string): string { const rootResolved = resolve(root); const fullResolved = resolve(rootResolved, candidate); // Allow the root itself (e.g. empty candidate) as a no-op destination; // otherwise require strict containment. if (fullResolved !== rootResolved && !isContained(rootResolved, fullResolved)) { throw new Error(`${label} "${candidate}" escapes ${root}`); } return fullResolved; } // --------------------------------------------------------------------------- // Optional pre-seeded workflow (for scenarios that say "look at my workflow X") // --------------------------------------------------------------------------- async function maybeSeedWorkflow( client: N8nClient, scenario: Scenario, fixturesDir: string, logger: EvalLogger, ): Promise { const path = scenario.setup?.seedWorkflow; if (!path) return; const fixturePath = resolveFixture(fixturesDir, path); const raw = await readFile(fixturePath, 'utf-8'); const parsed = jsonParse>(raw, { errorMessage: `Invalid workflow JSON: ${path}`, }); const { id } = await client.createWorkflow(parsed); logger.verbose(`[${scenario.id}] seeded workflow ${id}`); if (scenario.setup?.activateSeededWorkflow) { await client.activateWorkflow(id); logger.verbose(`[${scenario.id}] activated workflow ${id}`); } } // --------------------------------------------------------------------------- // Grading // --------------------------------------------------------------------------- async function runGraders( scenario: Scenario, trace: ScenarioTrace, sandboxDir: string, ): Promise { const results: GraderResult[] = []; for (const grader of scenario.graders) { try { results.push(await applyGrader(grader, { sandboxDir, trace })); } catch (error) { results.push({ grader, pass: false, reason: `grader threw: ${error instanceof Error ? error.message : String(error)}`, }); } } return results; }