mirror of
https://github.com/n8n-io/n8n.git
synced 2026-05-12 16:10:30 +02:00
feat(ai-builder): Add LangSmith integration for workflow eval tracking (no-changelog) (#28835)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
parent 639e1dab1c
commit 16e5f9572f
14  .github/workflows/test-evals-instance-ai.yml  vendored
@@ -91,11 +91,17 @@ jobs:
        run: >-
          pnpm eval:instance-ai
          --base-url http://localhost:5678
          --concurrency 4
          --verbose
          --runs 5
          --iterations 3
          ${{ inputs.filter && format('--filter "{0}"', inputs.filter) || '' }}
        env:
          N8N_INSTANCE_AI_MODEL_API_KEY: ${{ secrets.EVALS_ANTHROPIC_KEY }}
          LANGSMITH_TRACING: 'true'
          LANGSMITH_ENDPOINT: ${{ secrets.EVALS_LANGSMITH_ENDPOINT }}
          LANGSMITH_API_KEY: ${{ secrets.EVALS_LANGSMITH_API_KEY }}
          LANGSMITH_REVISION_ID: ${{ github.sha }}
          LANGSMITH_BRANCH: ${{ github.head_ref || github.ref_name }}

      - name: Stop n8n container
        if: ${{ always() }}
@@ -115,7 +121,7 @@ jobs:
          # Build the full comment body with jq
          jq -r '
            "### Instance AI Workflow Eval Results\n\n" +
            "**\(.summary.built)/\(.summary.testCases) built | \(.totalRuns) run(s) | pass@\(.totalRuns): \(.summary.passAtK * 100 | floor)% | pass^\(.totalRuns): \(.summary.passHatK * 100 | floor)%**\n\n" +
            "**\(.summary.built)/\(.summary.testCases) built | \(.totalRuns) run(s) | pass@\(.totalRuns): \(.summary.passAtK * 100 | floor)% | pass^\(.totalRuns): \(.summary.passHatK * 100 | floor)% | iterations: \(.summary.passRatePerIter)**\n\n" +
            "| Workflow | Build | pass@\(.totalRuns) | pass^\(.totalRuns) |\n|---|---|---|---|\n" +
            ([.testCases[] as $tc | "| \($tc.name) | \($tc.buildSuccessCount)/\($tc.totalRuns) | \(([$tc.scenarios[] | .passAtK] | add) / ($tc.scenarios | length) * 100 | floor)% | \(([$tc.scenarios[] | .passHatK] | add) / ($tc.scenarios | length) * 100 | floor)% |"] | join("\n")) +
            "\n\n<details><summary>Failure details</summary>\n\n" +
@@ -138,5 +144,7 @@ jobs:
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
        with:
          name: instance-ai-workflow-eval-results
          path: packages/@n8n/instance-ai/eval-results.json
          path: |
            packages/@n8n/instance-ai/eval-results.json
            packages/@n8n/instance-ai/.data/workflow-eval-report.html
          retention-days: 14
@@ -19,7 +19,32 @@ dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --filter contact-for
dotenvx run -f ../../../.env.local -- pnpm eval:instance-ai --filter contact-form --keep-workflows --verbose
```

Results are printed to the console and written to `eval-results.json`.

### Outputs

Every run produces three artifacts:

- **Console** — live progress, per-scenario pass/fail with a `[failure_category]` tag, and a grouped summary at the end.
- **`eval-results.json`** — structured results in the current working directory. Consumed by the CI PR comment.
- **`.data/workflow-eval-report.html`** — rich debugging view with per-node execution traces, intercepted requests, mock responses, Phase 1 hints, and verifier reasoning. Self-contained HTML you can open in a browser.

If `LANGSMITH_API_KEY` is set, results are also sent to LangSmith as an experiment for historical comparison.
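
For orientation, the top-level shape of `eval-results.json` looks roughly like this (a sketch reconstructed from `writeEvalResults()` further down in this commit; all values are illustrative):

```ts
// Illustrative only; field names come from writeEvalResults() in cli/index.ts.
const report = {
  timestamp: '2026-05-12T14:10:30.000Z',
  totalRuns: 3,
  summary: {
    testCases: 10,
    built: 10,                           // test cases with at least one successful build
    scenariosTotal: 24,
    passAtK: 0.62,                       // mean pass@k across scenarios, k = totalRuns
    passHatK: 0.41,                      // mean pass^k across scenarios
    passRatePerIter: '54% / 50% / 58%',  // pass rate of each iteration
  },
  testCases: [
    /* per test case: name, buildSuccessCount, totalRuns, and scenarios
       with passCount, passAtK, passHatK, and per-run details */
  ],
};
```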
### CLI flags

| Flag | Default | Description |
|------|---------|-------------|
| `--verbose` | `false` | Log build/execute/verify timing and SSE events |
| `--filter` | — | Filter test cases by filename substring (e.g. `contact-form`) |
| `--keep-workflows` | `false` | Don't delete built workflows after the run |
| `--base-url` | `http://localhost:5678` | n8n instance URL |
| `--email` | E2E test owner | Override login email (also via `N8N_EVAL_EMAIL`) |
| `--password` | E2E test owner | Override login password (also via `N8N_EVAL_PASSWORD`) |
| `--timeout-ms` | `600000` | Per-test-case timeout |
| `--output-dir` | cwd | Where to write `eval-results.json` |
| `--dataset` | `instance-ai-workflow-evals` | LangSmith dataset name |
| `--concurrency` | `16` | Max concurrent scenarios (builds are separately capped at 4) |
| `--experiment-name` | auto | LangSmith experiment prefix (defaults to `{branch}-{sha}` in CI or `local-{branch}-{sha}[-dirty]` locally) |
| `--iterations` | `1` | Run each test case N times with fresh builds — powers the pass@k / pass^k metrics (see below) |
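
With `n` iterations of a scenario, of which `c` passed, the summary reports both metrics at `k = n`. pass^k follows directly from the observed pass rate; pass@k is the standard at-least-one-of-k estimator that the comments in `cli/aggregator.ts` describe (shown here for orientation, not copied from the implementation):

$$\mathrm{pass@}k \;=\; 1 - \frac{\binom{n-c}{k}}{\binom{n}{k}}, \qquad \mathrm{pass}^{\wedge}k \;=\; \left(\frac{c}{n}\right)^{k}$$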

### Docker (without pnpm dev:ai)

@@ -36,6 +61,11 @@ docker run -d --name n8n-eval \
  -p 5678:5678 \
  n8nio/n8n:local

# Seed the test user
curl -sf -X POST http://localhost:5678/rest/e2e/reset \
  -H "Content-Type: application/json" \
  -d '{"owner":{"email":"nathan@n8n.io","password":"PlaywrightTest123","firstName":"Eval","lastName":"Owner"},"admin":{"email":"admin@n8n.io","password":"PlaywrightTest123","firstName":"Admin","lastName":"User"},"members":[],"chat":{"email":"chat@n8n.io","password":"PlaywrightTest123","firstName":"Chat","lastName":"User"}}'

# Run evals against it
pnpm eval:instance-ai --base-url http://localhost:5678 --verbose
```
@@ -44,17 +74,19 @@ pnpm eval:instance-ai --base-url http://localhost:5678 --verbose

Evals run automatically on PRs that change Instance AI code (path-filtered). The CI workflow starts a single Docker container and runs the CLI against it. See `.github/workflows/test-evals-instance-ai.yml`.

The eval job is **non-blocking**. Results are posted as a PR comment and uploaded as artifacts.
The eval job is **non-blocking**. Results are posted as a PR comment and uploaded as artifacts. When `LANGSMITH_API_KEY` is set (via the `EVALS_LANGSMITH_API_KEY` secret), the run also lands as an experiment in LangSmith, tagged with the commit SHA and branch.
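
Concretely, the metadata attached to such an experiment ends up looking roughly like this (a sketch assembled from `buildCIMetadata()` and `updateExperimentAggregates()` in this commit; values are illustrative):

```ts
// Illustrative: keys from evaluate()'s metadata plus the post-run aggregates.
const experimentMetadata = {
  filter: 'all',
  concurrency: 4,
  maxBuilds: 4,
  iterations: 3,
  source: 'ci',            // from buildCIMetadata()
  trigger: 'pull_request',
  runId: '9876543210',
  // written after the run by updateExperimentAggregates():
  duration_s: 812.4,
  avg_build_s: 61.3,
  avg_exec_s: 24.7,
  unique_builds: 30,
  pass_rate_per_iter: '54% / 50% / 58%',
};
```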

### Environment variables

Set these in `.env.local`:

| Variable | Required | Description |
|----------|----------|-------------|
| `N8N_INSTANCE_AI_MODEL_API_KEY` | Yes | Anthropic API key for the Instance AI agent, mock generation, and verification |
| `N8N_EVAL_EMAIL` | No | n8n login email (defaults to the E2E test owner) |
| `N8N_EVAL_PASSWORD` | No | n8n login password (defaults to the E2E test owner) |
| `LANGSMITH_API_KEY` | No | Enables LangSmith experiment tracking and tracing. Without it, the CLI still runs and writes JSON/HTML. |
| `LANGSMITH_ENDPOINT` | No | LangSmith region endpoint (`https://api.smith.langchain.com` for US, `https://eu.api.smith.langchain.com` for EU) |
| `LANGSMITH_REVISION_ID` | No | Commit SHA to tag the experiment with (set automatically in CI) |
| `LANGSMITH_BRANCH` | No | Branch name to tag the experiment with (set automatically in CI) |
| `CONTEXT7_API_KEY` | No | Context7 API key for higher rate limits on API doc lookups. The free tier is 1,000 requests/month |
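
A minimal `.env.local` for a local run with LangSmith enabled might look like this (placeholder values; only the first variable is required):

```
N8N_INSTANCE_AI_MODEL_API_KEY=sk-ant-...
LANGSMITH_API_KEY=lsv2_...
LANGSMITH_ENDPOINT=https://eu.api.smith.langchain.com
```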

## How it works

@@ -68,11 +100,15 @@ Each test run:

### What gets mocked

- **Mocked nodes** — any node that makes HTTP requests (Gmail, Slack, Google Sheets, HTTP Request, etc.). The request is intercepted before it leaves the process. An LLM generates the response.
- **Pinned nodes** — trigger/start nodes get LLM-generated input data injected as pin data
- **Real nodes** — logic nodes (Code, Set, Merge, Filter, Sort, IF, Switch) execute their actual code on the mocked/pinned data
- **Mocked nodes** — any node that makes HTTP requests (Gmail, Slack, Google Sheets, HTTP Request, Notion, etc.). The request is intercepted before it leaves the process. An LLM generates the response.
- **Pinned nodes** — nodes that don't go through the HTTP layer: trigger/webhook nodes, LangChain/AI nodes (they use SDKs directly), database nodes. These receive LLM-generated data as pin data.
- **Real nodes** — logic nodes (Code, Set, Merge, Filter, IF, Switch) execute their actual code on the mocked/pinned data.

No real credentials or API connections are needed.
No real credentials or API connections are needed. ~95% of node types are covered; the main gaps are binary-data nodes (file attachments, image generation) and streaming nodes.

## LangSmith integration

When `LANGSMITH_API_KEY` is set, each run is recorded as a LangSmith experiment against the `instance-ai-workflow-evals` dataset (synced from the JSON files before each run). Experiments against the same dataset can be compared side by side to spot regressions.

## Adding test cases

@@ -128,13 +164,15 @@ When a scenario fails, the verifier categorizes the root cause:

```
evaluations/
├── index.ts          # Public API
├── cli/              # CLI entry point and args parsing
├── cli/              # CLI entry point, arg parsing, CI metadata
├── clients/          # n8n REST + SSE clients
├── checklist/        # LLM verification with retry
├── credentials/      # Test credential seeding
├── data/workflows/   # Test case JSON files
├── harness/          # Runner: buildWorkflow, executeScenario, cleanupBuild
├── langsmith/        # Dataset sync + experiment setup
├── outcome/          # SSE event parsing, workflow discovery
├── report/           # HTML report generator
└── system-prompts/   # LLM prompts for verification

packages/cli/src/modules/instance-ai/eval/
@@ -149,6 +187,8 @@ packages/cli/src/modules/instance-ai/eval/

## Known limitations

- **LangChain/AI nodes** — use their own SDKs, so they are not intercepted by the HTTP mock layer and fail with credential errors. Use pin data for these.
- **Binary / file nodes** — media attachments, image generation, file downloads. Mock metadata works, but realistic binary content is out of scope.
- **Streaming nodes** — our mock returns complete responses, not streams.
- **GraphQL APIs** — the response shape depends on the query, not just the endpoint, so mock quality depends on the LLM knowing the API schema.
- **Context7 quota** — the free tier is 1,000 requests/month, 60/hour. A full suite run uses ~100 requests. When the quota is exceeded, the LLM falls back to its training data.
- **Non-determinism** — the agent builds different workflows each run. Pass rates vary between 40% and 65%.

@@ -26,6 +26,7 @@ const checklistResultSchema = z.object({
// ---------------------------------------------------------------------------

const MAX_VERIFY_ATTEMPTS = 2;
const VERIFY_ATTEMPT_TIMEOUT_MS = 120_000;

export async function verifyChecklist(
  checklist: ChecklistItem[],
@@ -47,13 +48,28 @@ Verify each checklist item against the artifact above.`;

  const validIds = new Set(llmItems.map((i) => i.id));

  for (let attempt = 0; attempt < MAX_VERIFY_ATTEMPTS; attempt++) {
  for (let attempt = 1; attempt <= MAX_VERIFY_ATTEMPTS; attempt++) {
    const agent = createEvalAgent('eval-checklist-verifier', {
      instructions: MOCK_EXECUTION_VERIFY_PROMPT,
      cache: true,
    }).structuredOutput(checklistResultSchema);

    const result = await agent.generate(userMessage);
    const abortController = new AbortController();
    const timer = setTimeout(
      () =>
        abortController.abort(new Error(`verifier timed out after ${VERIFY_ATTEMPT_TIMEOUT_MS}ms`)),
      VERIFY_ATTEMPT_TIMEOUT_MS,
    );
    let result;
    try {
      result = await agent.generate(userMessage, { abortSignal: abortController.signal });
    } catch (error: unknown) {
      const msg = error instanceof Error ? error.message : String(error);
      console.warn(`[verifier] attempt ${attempt}/${MAX_VERIFY_ATTEMPTS} failed: ${msg}`);
      continue;
    } finally {
      clearTimeout(timer);
    }

    const parsed = result.structuredOutput as z.infer<typeof checklistResultSchema> | undefined;
    const results: ChecklistResult[] = [];
@@ -82,7 +98,12 @@ Verify each checklist item against the artifact above.`;
      results.sort((a, b) => a.id - b.id);
      return results;
    }

    console.warn(
      `[verifier] attempt ${attempt}/${MAX_VERIFY_ATTEMPTS} produced no parseable results`,
    );
  }

  console.warn(`[verifier] exhausted ${MAX_VERIFY_ATTEMPTS} attempts, returning empty result`);
  return [];
}
@@ -23,7 +23,7 @@ function combinations(n: number, k: number): number {
 * Probability that at least 1 of k randomly chosen samples passes,
 * given n total samples of which c passed.
 */
function passAtK(n: number, c: number, k: number): number {
export function passAtK(n: number, c: number, k: number): number {
  if (k > n) return 0;
  const denominator = combinations(n, k);
  if (denominator === 0) return 0;
@@ -35,7 +35,7 @@ function passAtK(n: number, c: number, k: number): number {
 * Probability that all k independent attempts pass,
 * given observed success rate p = c/n.
 */
function passHatK(n: number, c: number, k: number): number {
export function passHatK(n: number, c: number, k: number): number {
  if (n === 0) return 0;
  return Math.pow(c / n, k);
}
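
A quick editorial illustration of how these newly exported helpers behave, assuming the standard at-least-one-of-k estimator the `passAtK` comment describes (the function body is truncated in this hunk):

```ts
import { passAtK, passHatK } from './aggregator';

// n = 5 iterations of a scenario, c = 3 passed, k = n = 5:
passHatK(5, 3, 5); // (3/5)^5 ≈ 0.078, probability that all 5 independent attempts pass
passAtK(5, 3, 5);  // 1, since with k = n "at least one of k passes" is certain once c > 0
```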

@@ -12,7 +12,7 @@ import { z } from 'zod';
// ---------------------------------------------------------------------------

export interface CliArgs {
  /** TimeoutMs is defined per run, not as the total timeout for all the runs */
  /** TimeoutMs is defined per iteration, not as the total timeout for all iterations */
  timeoutMs: number;
  baseUrl: string;
  email?: string;
@@ -24,8 +24,15 @@ export interface CliArgs {
  keepWorkflows: boolean;
  /** Directory to write eval-results.json (defaults to cwd) */
  outputDir?: string;
  /** Number of times to run each test case (default: 1) */
  runs: number;
  /** LangSmith dataset name (synced from JSON test cases before each run) */
  dataset: string;
  /** Max concurrent scenarios in evaluate(). Builds are separately limited to 4 by a semaphore. */
  concurrency: number;
  /** LangSmith experiment name prefix (auto-generated if not set) */
  experimentName?: string;
  /** Number of iterations to run each test case (default: 1). Each iteration
   * gets a fresh build so pass@k / pass^k capture real builder variance. */
  iterations: number;
}

// ---------------------------------------------------------------------------
@@ -41,7 +48,10 @@ const cliArgsSchema = z.object({
  filter: z.string().optional(),
  keepWorkflows: z.boolean().default(false),
  outputDir: z.string().optional(),
  runs: z.number().int().positive().default(1),
  dataset: z.string().default('instance-ai-workflow-evals'),
  concurrency: z.number().int().positive().default(16),
  experimentName: z.string().optional(),
  iterations: z.number().int().positive().default(1),
});

// ---------------------------------------------------------------------------
@@ -61,7 +71,10 @@ export function parseCliArgs(argv: string[]): CliArgs {
    filter: validated.filter,
    keepWorkflows: validated.keepWorkflows,
    outputDir: validated.outputDir,
    runs: validated.runs,
    dataset: validated.dataset,
    concurrency: validated.concurrency,
    experimentName: validated.experimentName,
    iterations: validated.iterations,
  };
}
@@ -78,7 +91,10 @@ interface RawArgs {
  filter?: string;
  keepWorkflows: boolean;
  outputDir?: string;
  runs: number;
  dataset: string;
  concurrency: number;
  experimentName?: string;
  iterations: number;
}

function parseRawArgs(argv: string[]): RawArgs {
@@ -88,7 +104,10 @@ function parseRawArgs(argv: string[]): RawArgs {
    verbose: false,
    keepWorkflows: false,
    outputDir: undefined,
    runs: 1,
    dataset: 'instance-ai-workflow-evals',
    concurrency: 16,
    experimentName: undefined,
    iterations: 1,
  };

  for (let i = 0; i < argv.length; i++) {
@@ -130,16 +149,39 @@ function parseRawArgs(argv: string[]): RawArgs {

      case '--output-dir':
        result.outputDir = nextArg(argv, i, '--output-dir');
        i++;
        break;

      case '--runs':
        result.runs = parseIntArg(argv, i, '--runs');
      case '--iterations':
        result.iterations = parseIntArg(argv, i, '--iterations');
        i++;
        break;

      case '--dataset':
        result.dataset = nextArg(argv, i, '--dataset');
        i++;
        break;

      case '--concurrency':
        result.concurrency = parseIntArg(argv, i, '--concurrency');
        i++;
        break;

      case '--experiment-name':
        result.experimentName = nextArg(argv, i, '--experiment-name');
        i++;
        break;

      default:
        // Ignore unknown flags
        break;
        // Fail loudly on unknown flags. Strip any =value payload before
        // echoing and drop positional values entirely — raw CLI input
        // may contain secrets (e.g. --password=... or an accidentally
        // pasted token) that would otherwise leak into terminal/CI logs.
        if (arg.startsWith('--')) {
          const flagName = arg.split('=', 1)[0];
          throw new Error(`Unknown flag: ${flagName}`);
        }
        throw new Error('Unexpected positional argument');
    }
  }
@@ -162,7 +204,8 @@ function parseIntArg(argv: string[], currentIndex: number, flagName: string): nu
  const raw = nextArg(argv, currentIndex, flagName);
  const parsed = parseInt(raw, 10);
  if (Number.isNaN(parsed)) {
    throw new Error(`Invalid integer for ${flagName}: ${raw}`);
    // Don't echo raw — a bad shell expansion could leak a secret here.
    throw new Error(`Invalid integer for ${flagName}`);
  }
  return parsed;
}
82  packages/@n8n/instance-ai/evaluations/cli/ci-metadata.ts  Normal file
@@ -0,0 +1,82 @@
/**
 * CI Metadata for LangSmith experiments.
 *
 * Distinguishes CI runs from local development runs and tracks provenance
 * of automated evaluation results.
 *
 * Note: git info (commit SHA, branch) is tracked by LangSmith automatically
 * via LANGSMITH_REVISION_ID and LANGSMITH_BRANCH env vars — set them in the
 * CI workflow and the SDK picks them up.
 */

import { execSync } from 'node:child_process';

export interface CIMetadata {
  source: 'ci' | 'local';
  /** GitHub Actions event that triggered this run (e.g., 'pull_request', 'merge_group', 'workflow_dispatch') */
  trigger?: string;
  /** GitHub Actions run ID for linking back to the workflow run */
  runId?: string;
}

export function buildCIMetadata(): CIMetadata {
  const isCI = process.env.GITHUB_ACTIONS === 'true';

  if (!isCI) {
    return { source: 'local' };
  }

  return {
    source: 'ci',
    trigger: process.env.GITHUB_EVENT_NAME,
    runId: process.env.GITHUB_RUN_ID,
  };
}

/**
 * Compute an informative experiment name prefix from branch and commit info.
 * Falls back to a generic name if no git context is available.
 *
 * - CI: `ci-{branch}-{short-sha}` from GitHub Actions env vars
 * - Local: `local-{branch}-{short-sha}[-dirty]` from git, dirty suffix if there are uncommitted changes
 * - Fallback: `instance-ai-workflow-evals`
 *
 * LangSmith appends its own random suffix, so this doesn't need to be unique.
 */
export function computeExperimentPrefix(): string {
  const ciName = computeCIExperimentName();
  if (ciName) return ciName;

  const localName = computeLocalExperimentName();
  if (localName) return localName;

  return 'instance-ai-workflow-evals';
}

function computeCIExperimentName(): string | undefined {
  if (process.env.GITHUB_ACTIONS !== 'true') return undefined;

  const branch = process.env.GITHUB_HEAD_REF ?? process.env.GITHUB_REF_NAME;
  const sha = process.env.GITHUB_SHA;
  if (!branch || !sha) return undefined;

  return sanitize(`ci-${branch}-${sha.slice(0, 7)}`);
}

function computeLocalExperimentName(): string | undefined {
  try {
    const run = (cmd: string): string =>
      execSync(cmd, { encoding: 'utf8', stdio: ['pipe', 'pipe', 'ignore'] }).trim();

    const branch = run('git rev-parse --abbrev-ref HEAD');
    const sha = run('git rev-parse --short HEAD');
    const dirty = run('git status --porcelain').length > 0 ? '-dirty' : '';
    return sanitize(`local-${branch}-${sha}${dirty}`);
  } catch {
    return undefined;
  }
}

function sanitize(name: string): string {
  return name.replace(/[^a-zA-Z0-9_.-]/g, '_').replace(/_{2,}/g, '_');
}
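
A worked example of the prefix logic, with illustrative values:

```ts
// Local branch "feat/langsmith-evals" at short SHA abc1234 with uncommitted changes:
computeExperimentPrefix();
// → 'local-feat_langsmith-evals-abc1234-dirty'
// sanitize() maps the '/' in the branch name to '_'; in CI the same inputs
// would yield 'ci-feat_langsmith-evals-abc1234' (no dirty check in CI).
```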

@@ -1,34 +1,121 @@
#!/usr/bin/env node
import { mkdirSync, writeFileSync } from 'fs';
import { join } from 'path';
// ---------------------------------------------------------------------------
// Instance AI workflow eval CLI
//
// Runs workflow execution evaluations. When LANGSMITH_API_KEY is set, uses
// LangSmith's evaluate() for experiment tracking and tracing. Otherwise
// falls back to a direct loop with the same eval-results.json output.
// ---------------------------------------------------------------------------

import { aggregateResults } from './aggregator';
import type { InstanceAiEvalExecutionResult } from '@n8n/api-types';
import { mkdirSync, writeFileSync } from 'fs';
import { Client } from 'langsmith';
import { evaluate } from 'langsmith/evaluation';
import type { EvaluationResult } from 'langsmith/evaluation';
import type { Example, Run } from 'langsmith/schemas';
import { traceable } from 'langsmith/traceable';
import pLimit from 'p-limit';
import { join } from 'path';
import { z } from 'zod';

import { aggregateResults, passAtK, passHatK } from './aggregator';
import { parseCliArgs } from './args';
import { buildCIMetadata, computeExperimentPrefix } from './ci-metadata';
import { N8nClient } from '../clients/n8n-client';
import { seedCredentials, cleanupCredentials } from '../credentials/seeder';
import { loadWorkflowTestCases } from '../data/workflows';
import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
import type { WorkflowTestCaseWithFile } from '../data/workflows';
import { createLogger } from '../harness/logger';
import { runWorkflowTestCase, runWithConcurrency } from '../harness/runner';
import type { EvalLogger } from '../harness/logger';
import {
  buildWorkflow,
  executeScenario,
  cleanupBuild,
  runWorkflowTestCase,
  runWithConcurrency,
  type BuildResult,
} from '../harness/runner';
import { syncDataset, type DatasetExampleInputs } from '../langsmith/dataset-sync';
import { snapshotWorkflowIds } from '../outcome/workflow-discovery';
import type { MultiRunEvaluation, WorkflowTestCaseResult } from '../types';
import { writeWorkflowReport } from '../report/workflow-report';
import type {
  MultiRunEvaluation,
  ScenarioResult,
  TestScenario,
  WorkflowTestCaseResult,
} from '../types';

// n8n degrades above ~4 concurrent builds.
const MAX_CONCURRENT_BUILDS = 4;

const targetOutputSchema = z.object({
  buildSuccess: z.boolean().default(false),
  passed: z.boolean().default(false),
  score: z.number().default(0),
  reasoning: z.string().default(''),
  workflowId: z.string().optional(),
  failureCategory: z.string().optional(),
  rootCause: z.string().optional(),
  execErrors: z.array(z.string()).default([]),
  evalResult: z.unknown().optional(),
  /** Only set on the scenario that initiated the build. */
  buildDurationMs: z.number().optional(),
  execDurationMs: z.number().default(0),
  nodeCount: z.number().default(0),
});

type TargetOutput = Omit<z.infer<typeof targetOutputSchema>, 'evalResult'> & {
  evalResult?: InstanceAiEvalExecutionResult;
};

function isPlainObject(v: unknown): v is Record<string, unknown> {
  return typeof v === 'object' && v !== null && !Array.isArray(v);
}

function isEvalResult(v: unknown): v is InstanceAiEvalExecutionResult {
  if (!isPlainObject(v)) return false;
  return (
    typeof v.nodeResults === 'object' &&
    v.nodeResults !== null &&
    Array.isArray(v.errors) &&
    typeof v.hints === 'object' &&
    v.hints !== null
  );
}

/** Safe-parse a run's outputs. Returns `undefined` if the row is malformed
 * or missing, so callers can skip it instead of treating it as a genuine
 * failed evaluation. Every field in the schema has a default, so an empty
 * or nullish raw value would otherwise parse successfully into a "failed"
 * shape (passed:false, score:0) — masking infra errors as builder regressions.
 */
function parseTargetOutput(raw: unknown): TargetOutput | undefined {
  if (!isPlainObject(raw) || Object.keys(raw).length === 0) return undefined;
  const parsed = targetOutputSchema.safeParse(raw);
  if (!parsed.success) return undefined;
  return {
    ...parsed.data,
    evalResult: isEvalResult(parsed.data.evalResult) ? parsed.data.evalResult : undefined,
  };
}

const runInputsSchema = z
  .object({
    prompt: z.string().default(''),
    testCaseFile: z.string().default(''),
    scenarioName: z.string().default(''),
    /** 0-based iteration index; injected during multi-run expansion. */
    _iteration: z.number().int().nonnegative().default(0),
  })
  .passthrough();

/** Target input shape with the iteration index we inject for multi-run. */
type TargetInputs = DatasetExampleInputs & { _iteration?: number };

async function main(): Promise<void> {
  const args = parseCliArgs(process.argv.slice(2));

  const testCases = loadWorkflowTestCases(args.filter);
  if (testCases.length === 0) {
    console.log('No workflow test cases found in evaluations/data/workflows/');
    return;
  }

  const totalScenarios = testCases.reduce((sum, tc) => sum + tc.scenarios.length, 0);
  console.log(
    `Running ${String(testCases.length)} workflow test case(s) with ${String(totalScenarios)} scenario(s) x ${String(args.runs)} runs\n`,
  );

  const logger = createLogger(args.verbose);

  // Setup: authenticate, seed credentials, snapshot workflows
  const client = new N8nClient(args.baseUrl);
  logger.info(`Authenticating with ${args.baseUrl}...`);
  await client.login(args.email, args.password);
@@ -38,73 +125,640 @@ async function main(): Promise<void> {
  const seedResult = await seedCredentials(client, undefined, logger);
  logger.info(`Seeded ${String(seedResult.credentialIds.length)} credential(s)`);

  // Run test cases with bounded concurrency.
  // Each test case builds a workflow (uses n8n's agent) then runs scenarios
  // (uses our Anthropic key for Phase 1 + Phase 2 mock generation).
  const MAX_CONCURRENT_TEST_CASES = 4;
  const preRunWorkflowIds = await snapshotWorkflowIds(client);
  const claimedWorkflowIds = new Set<string>();

  const startTime = Date.now();
  const allRunResults: WorkflowTestCaseResult[][] = [];

  try {
    for (let run = 0; run < args.runs; run++) {
      if (args.runs > 1) {
        console.log(`\n--- Run #${String(run + 1)}/${String(args.runs)} ---\n`);
      }
    const hasLangSmith = Boolean(process.env.LANGSMITH_API_KEY);

    const preRunWorkflowIds = await snapshotWorkflowIds(client);
    const claimedWorkflowIds = new Set<string>();
    let evaluation: MultiRunEvaluation;

      const results = await runWithConcurrency(
        testCases,
        async (testCase) =>
          await runWorkflowTestCase({
            client,
            testCase,
            timeoutMs: args.timeoutMs,
            seededCredentialTypes: seedResult.seededTypes,
            preRunWorkflowIds,
            claimedWorkflowIds,
            logger,
            keepWorkflows: args.keepWorkflows,
          }),
        MAX_CONCURRENT_TEST_CASES,
      );

      allRunResults.push(results);
    if (hasLangSmith) {
      logger.info('LangSmith API key detected, using evaluate() with experiment tracking');
      evaluation = await runWithLangSmith({
        args,
        client,
        preRunWorkflowIds,
        claimedWorkflowIds,
        logger,
        seedResult,
      });
    } else {
      logger.info('No LANGSMITH_API_KEY, running direct loop (results in eval-results.json only)');
      evaluation = await runDirectLoop({
        args,
        client,
        preRunWorkflowIds,
        claimedWorkflowIds,
        logger,
        seedResult,
      });
    }

    const totalDuration = Date.now() - startTime;
    const outputPath = writeEvalResults(evaluation, totalDuration, args.outputDir);
    console.log(`Results: ${outputPath}`);
    const htmlPath = writeWorkflowReport(flattenRunsForReport(evaluation));
    console.log(`Report: ${htmlPath}`);
    printSummary(evaluation);
  } finally {
    await cleanupCredentials(client, seedResult.credentialIds).catch(() => {});
  }

  const totalDuration = Date.now() - startTime;
  const aggregatedResults = aggregateResults(allRunResults, args.runs);

  // Write eval-results.json for CI consumption (PR comments, artifacts)
  const outputPath = writeEvalResults(aggregatedResults, totalDuration, args.outputDir);
  console.log(`Results: ${outputPath}`);

  // Print console summary
  printSummary(aggregatedResults);
}
/** Write structured JSON results for CI (PR comments, artifact upload). */
// ---------------------------------------------------------------------------
// LangSmith mode: evaluate() with dataset sync, tracing, experiments
// ---------------------------------------------------------------------------

interface RunConfig {
  args: ReturnType<typeof parseCliArgs>;
  client: N8nClient;
  preRunWorkflowIds: Set<string>;
  claimedWorkflowIds: Set<string>;
  logger: EvalLogger;
  seedResult: { seededTypes: string[]; credentialIds: string[] };
}

async function runWithLangSmith(config: RunConfig): Promise<MultiRunEvaluation> {
  const { args, client, preRunWorkflowIds, claimedWorkflowIds, logger } = config;

  const lsClient = new Client();
  const datasetName = await syncDataset(lsClient, args.dataset, logger, args.filter);
  const testCasesWithFiles = loadWorkflowTestCasesWithFiles(args.filter);

  const buildLimiter = pLimit(MAX_CONCURRENT_BUILDS);
  // Keyed by `${iteration}:${prompt}` so the same prompt gets a fresh build
  // per iteration — pass@k captures real builder variance.
  const buildCache = new Map<string, Promise<BuildResult>>();
  const buildDurations = new Map<string, number>();

  // Traceable wraps the actual build call *inside* the limiter — otherwise the
  // LangSmith span would include queue-wait time, which accumulates across
  // iterations as later builds queue behind earlier ones.
  const tracedBuildWorkflow = traceable(
    async (prompt: string) =>
      await buildWorkflow({
        client,
        prompt,
        timeoutMs: args.timeoutMs,
        preRunWorkflowIds,
        claimedWorkflowIds,
        logger,
      }),
    { name: 'workflow_build', run_type: 'chain', client: lsClient },
  );

  async function getOrBuild(
    prompt: string,
    iteration: number,
  ): Promise<{ build: BuildResult; buildDurationMs?: number }> {
    const key = `${String(iteration)}:${prompt}`;
    const existing = buildCache.get(key);
    if (existing) return { build: await existing };
    const promise = buildLimiter(async () => {
      const start = Date.now();
      const build = await tracedBuildWorkflow(prompt);
      buildDurations.set(key, Date.now() - start);
      return build;
    });
    buildCache.set(key, promise);
    const build = await promise;
    return { build, buildDurationMs: buildDurations.get(key) };
  }
  const traceableExecute = traceable(
    async (execArgs: {
      workflowId: string;
      scenario: TestScenario;
      workflowJsons: BuildResult['workflowJsons'];
    }) =>
      await executeScenario(
        client,
        execArgs.workflowId,
        execArgs.scenario,
        execArgs.workflowJsons,
        logger,
      ),
    { name: 'scenario_execution', run_type: 'chain', client: lsClient },
  );

  const target = async (inputs: TargetInputs): Promise<TargetOutput> => {
    const iteration = inputs._iteration ?? 0;
    const scenario: TestScenario = {
      name: inputs.scenarioName,
      description: inputs.scenarioDescription,
      dataSetup: inputs.dataSetup,
      successCriteria: inputs.successCriteria,
    };

    const { build, buildDurationMs } = await getOrBuild(inputs.prompt, iteration);

    if (!build.success || !build.workflowId) {
      return {
        buildSuccess: false,
        passed: false,
        score: 0,
        reasoning: `Build failed: ${build.error ?? 'unknown'}`,
        failureCategory: 'build_failure',
        execErrors: build.error ? [build.error] : [],
        buildDurationMs,
        execDurationMs: 0,
        nodeCount: 0,
      };
    }

    const execStart = Date.now();
    const nodeCount = build.workflowJsons[0]?.nodes.length ?? 0;
    let result;
    try {
      result = await traceableExecute({
        workflowId: build.workflowId,
        scenario,
        workflowJsons: build.workflowJsons,
      });
    } catch (error: unknown) {
      // Mirror direct mode's per-scenario guard — without this, n8n API errors
      // or verifier timeouts from executeWithLlmMock / verifyChecklist would
      // escape to LangSmith, come back as a Run with null outputs, and be
      // misclassified as builder regressions by the feedback extractor.
      const errorMessage = error instanceof Error ? error.message : String(error);
      logger.error(`  ERROR [${scenario.name}]: ${errorMessage}`);
      return {
        buildSuccess: true,
        workflowId: build.workflowId,
        passed: false,
        score: 0,
        reasoning: `Scenario execution error: ${errorMessage}`,
        failureCategory: 'framework_issue',
        execErrors: [errorMessage],
        buildDurationMs,
        execDurationMs: Date.now() - execStart,
        nodeCount,
      };
    }
    const execDurationMs = Date.now() - execStart;

    // Strip failure fields on pass: the verifier sometimes returns "."
    // placeholders instead of omitting them.
    const failureCategory = result.success ? undefined : result.failureCategory;
    const rootCause = result.success ? undefined : result.rootCause;

    return {
      buildSuccess: true,
      workflowId: build.workflowId,
      passed: result.success,
      score: result.score,
      reasoning: result.reasoning,
      failureCategory,
      rootCause,
      execErrors: result.evalResult?.errors ?? [],
      evalResult: result.evalResult,
      buildDurationMs,
      execDurationMs,
      nodeCount,
    };
  };
  const feedbackExtractor = ({ run }: { run: Run }): EvaluationResult[] => {
    const output = parseTargetOutput(run.outputs);
    if (!output) return [];
    // 'none' for passed scenarios so the column shows a full categorical
    // breakdown instead of blank cells.
    const failureCategory = output.passed ? 'none' : (output.failureCategory ?? 'unknown');
    const feedback: EvaluationResult[] = [
      {
        key: 'scenario_pass',
        score: output.score,
        comment: output.reasoning || undefined,
      },
      {
        key: 'failure_category',
        value: failureCategory,
      },
      {
        key: 'exec_duration_s',
        score: output.execDurationMs / 1000,
      },
      {
        key: 'node_count',
        score: output.nodeCount,
      },
    ];
    if (output.buildDurationMs !== undefined) {
      feedback.push({ key: 'build_duration_s', score: output.buildDurationMs / 1000 });
    }
    return feedback;
  };

  const experimentPrefix = args.experimentName ?? computeExperimentPrefix();

  logger.info(
    `Starting evaluate() with concurrency=${String(args.concurrency)}, builds limited to ${String(MAX_CONCURRENT_BUILDS)}, iterations=${String(args.iterations)}`,
  );

  const sourceExamples = args.filter
    ? filteredExamplesIterable(lsClient, datasetName, args.filter, logger)
    : lsClient.listExamples({ datasetName });
  const evaluateData =
    args.iterations > 1
      ? expandExamplesForIterations(sourceExamples, args.iterations)
      : sourceExamples;

  try {
    const evaluateStart = Date.now();
    const experimentResults = await evaluate(target, {
      data: evaluateData,
      evaluators: [feedbackExtractor],
      experimentPrefix,
      maxConcurrency: args.concurrency,
      client: lsClient,
      metadata: {
        filter: args.filter ?? 'all',
        concurrency: args.concurrency,
        maxBuilds: MAX_CONCURRENT_BUILDS,
        iterations: args.iterations,
        ...buildCIMetadata(),
      },
    });
    const totalDurationMs = Date.now() - evaluateStart;

    logger.info(`Experiment: ${experimentResults.experimentName}`);
    await lsClient.awaitPendingTraceBatches();

    const allRunResults = reshapeLangSmithRuns(
      experimentResults.results,
      testCasesWithFiles,
      args.iterations,
    );
    const evaluation = aggregateResults(allRunResults, args.iterations);

    await updateExperimentAggregates({
      lsClient,
      experimentName: experimentResults.experimentName,
      runs: experimentResults.results,
      evaluation,
      buildDurations,
      totalDurationMs,
      logger,
    });

    await writePerRunPassMetrics({
      lsClient,
      runs: experimentResults.results,
      logger,
    });

    return evaluation;
  } finally {
    if (!args.keepWorkflows) {
      await Promise.all(
        [...buildCache.values()].map(async (buildPromise) => {
          try {
            const build = await buildPromise;
            await cleanupBuild(client, build, logger);
          } catch {
            // Best-effort
          }
        }),
      );
    }
  }
}
/**
 * Expand a source example stream into N copies, tagging each with `_iteration`
 * so the target function can key its build cache by iteration and we can
 * reshape runs back into per-iteration groups afterwards. All N copies share
 * the source example's id, so LangSmith's UI groups them naturally by
 * `reference_example_id` — useful for pass@k visualization.
 *
 * The source is buffered into memory once before the first yield: we need to
 * emit each example N times, and an AsyncIterable can only be consumed once.
 */
async function* expandExamplesForIterations(
  source: AsyncIterable<Example>,
  iterations: number,
): AsyncIterable<Example> {
  const cached: Example[] = [];
  for await (const ex of source) cached.push(ex);
  for (let i = 0; i < iterations; i++) {
    for (const ex of cached) {
      yield { ...ex, inputs: { ...ex.inputs, _iteration: i } };
    }
  }
}
function filteredExamplesIterable(
  lsClient: Client,
  datasetName: string,
  filter: string,
  logger: EvalLogger,
): AsyncIterable<Example> {
  const slugs = loadWorkflowTestCasesWithFiles(filter).map((tc) => tc.fileSlug);
  if (slugs.length === 0) {
    logger.info(`Filter "${filter}" matched no local test case files`);
    return (async function* () {})();
  }
  logger.info(`Filter "${filter}" matched ${String(slugs.length)} split(s): ${slugs.join(', ')}`);
  return lsClient.listExamples({ datasetName, splits: slugs });
}
async function updateExperimentAggregates(config: {
  lsClient: Client;
  experimentName: string;
  runs: Array<{ run: Run }>;
  evaluation: MultiRunEvaluation;
  buildDurations: Map<string, number>;
  totalDurationMs: number;
  logger: EvalLogger;
}): Promise<void> {
  const { lsClient, experimentName, runs, evaluation, buildDurations, totalDurationMs, logger } =
    config;

  const buildTimes = [...buildDurations.values()];
  const uniqueBuilds = buildTimes.length;
  const avgBuildMs =
    uniqueBuilds > 0 ? buildTimes.reduce((sum, d) => sum + d, 0) / uniqueBuilds : 0;

  const execTimes = runs
    .map(({ run }) => parseTargetOutput(run.outputs)?.execDurationMs)
    .filter((ms): ms is number => typeof ms === 'number');
  const avgExecMs =
    execTimes.length > 0 ? execTimes.reduce((sum, d) => sum + d, 0) / execTimes.length : 0;

  const aggregates = {
    duration_s: Math.round(totalDurationMs / 100) / 10,
    avg_build_s: Math.round(avgBuildMs / 100) / 10,
    avg_exec_s: Math.round(avgExecMs / 100) / 10,
    unique_builds: uniqueBuilds,
    pass_rate_per_iter: computePassRatePerIter(evaluation),
  };

  try {
    const project = await lsClient.readProject({ projectName: experimentName });
    // `updateProject` replaces `extra` wholesale — preserve it so auto-set
    // fields (splits, etc.) survive. Narrow via typeof guards rather than `as`.
    const existingExtra = isPlainObject(project.extra) ? project.extra : {};
    const existingMetadata = isPlainObject(existingExtra.metadata) ? existingExtra.metadata : {};
    await lsClient.updateProject(project.id, {
      projectExtra: existingExtra,
      metadata: { ...existingMetadata, ...aggregates },
    });
    logger.verbose(`Updated experiment metadata: ${JSON.stringify(aggregates)}`);
  } catch (error: unknown) {
    const msg = error instanceof Error ? error.message : String(error);
    logger.verbose(`Could not update experiment metadata: ${msg}`);
  }
}
/**
 * Attach per-example pass metrics (pass_rate, pass_at_k, pass_hat_k) as
 * feedback on every run in the example's group. All N runs of the same example
 * carry the same value — that lets the LangSmith UI sort/filter individual
 * runs by their example's metric, and its per-experiment column aggregation
 * reduces to the mean across unique examples.
 */
async function writePerRunPassMetrics(config: {
  lsClient: Client;
  runs: Array<{ run: Run }>;
  logger: EvalLogger;
}): Promise<void> {
  const { lsClient, runs, logger } = config;

  // Group runs by reference_example_id, counting passes.
  const byExample = new Map<string, { runIds: string[]; passed: number; total: number }>();
  for (const { run } of runs) {
    const exampleId = run.reference_example_id;
    if (!exampleId) continue;
    const output = parseTargetOutput(run.outputs);
    if (!output) continue;
    const entry = byExample.get(exampleId) ?? { runIds: [], passed: 0, total: 0 };
    entry.runIds.push(run.id);
    entry.total++;
    if (output.passed) entry.passed++;
    byExample.set(exampleId, entry);
  }

  // Individual writes are best-effort: a transient API error on one run
  // shouldn't block the rest, so we swallow per-promise and keep going.
  const feedbackWrites: Array<Promise<unknown>> = [];
  for (const { runIds, passed, total } of byExample.values()) {
    const passAtKValue = passAtK(total, passed, total);
    const passHatKValue = passHatK(total, passed, total);
    for (const runId of runIds) {
      feedbackWrites.push(
        lsClient.createFeedback(runId, 'pass_at_k', { score: passAtKValue }).catch(() => {}),
        lsClient.createFeedback(runId, 'pass_hat_k', { score: passHatKValue }).catch(() => {}),
      );
    }
  }

  await Promise.all(feedbackWrites);
  logger.verbose(
    `Wrote pass metrics feedback for ${String(byExample.size)} example(s) across ${String(runs.length)} run(s)`,
  );
}
/**
 * Convert LangSmith's flat `Run[]` into the `WorkflowTestCaseResult[][]` shape
 * the aggregator expects (outer: runs, inner: test cases). Groups by
 * (testCaseFile, scenarioName), then reconstructs per-iteration test case
 * results. Scenarios with no matching run get a build_failure stub.
 */
function reshapeLangSmithRuns(
  rows: Array<{ run: Run }>,
  testCasesWithFiles: WorkflowTestCaseWithFile[],
  numIterations: number,
): WorkflowTestCaseResult[][] {
  // Index runs by (iteration, testCaseFile, scenarioName) using the `_iteration`
  // we injected in expandExamplesForIterations. Falls back to 0 for single-run.
  const byKey = new Map<string, Run>();
  for (const { run } of rows) {
    const inputs = runInputsSchema.safeParse(run.inputs ?? {});
    if (!inputs.success) continue;
    const key = `${String(inputs.data._iteration)}/${inputs.data.testCaseFile}/${inputs.data.scenarioName}`;
    byKey.set(key, run);
  }

  const allRunResults: WorkflowTestCaseResult[][] = [];
  for (let iter = 0; iter < numIterations; iter++) {
    const runResults: WorkflowTestCaseResult[] = [];
    for (const { testCase, fileSlug } of testCasesWithFiles) {
      const scenarioResults: ScenarioResult[] = [];
      let workflowBuildSuccess = false;
      let workflowId: string | undefined;
      let buildError: string | undefined;

      for (const scenario of testCase.scenarios) {
        const run = byKey.get(`${String(iter)}/${fileSlug}/${scenario.name}`);
        const output = run ? parseTargetOutput(run.outputs) : undefined;
        if (!run || !output) {
          scenarioResults.push({
            scenario,
            success: false,
            score: 0,
            reasoning: run ? 'Malformed run output — skipped' : 'No run result for this scenario',
          });
          continue;
        }
        if (output.buildSuccess) workflowBuildSuccess = true;
        if (output.workflowId) workflowId = output.workflowId;
        if (!output.buildSuccess && output.reasoning) buildError = output.reasoning;
        scenarioResults.push({
          scenario,
          success: output.passed,
          evalResult: output.evalResult,
          score: output.score,
          reasoning: output.reasoning,
          failureCategory: output.failureCategory,
          rootCause: output.rootCause,
        });
      }

      runResults.push({
        testCase,
        workflowBuildSuccess,
        workflowId,
        scenarioResults,
        buildError,
      });
    }
    allRunResults.push(runResults);
  }
  return allRunResults;
}
// ---------------------------------------------------------------------------
// Direct mode: simple loop, no LangSmith dependency
// ---------------------------------------------------------------------------

async function runDirectLoop(config: RunConfig): Promise<MultiRunEvaluation> {
  const { args, client, preRunWorkflowIds, claimedWorkflowIds, logger, seedResult } = config;

  const testCasesWithFiles = loadWorkflowTestCasesWithFiles(args.filter);
  if (testCasesWithFiles.length === 0) {
    console.log('No workflow test cases found in evaluations/data/workflows/');
    return { totalRuns: 0, testCases: [] };
  }

  const totalScenarios = testCasesWithFiles.reduce(
    (sum, { testCase }) => sum + testCase.scenarios.length,
    0,
  );
  logger.info(
    `Running ${String(testCasesWithFiles.length)} test case(s) with ${String(totalScenarios)} scenario(s) × ${String(args.iterations)} iteration(s)`,
  );

  const allRunResults: WorkflowTestCaseResult[][] = [];
  for (let iter = 0; iter < args.iterations; iter++) {
    if (args.iterations > 1) {
      logger.info(`--- Iteration #${String(iter + 1)}/${String(args.iterations)} ---`);
    }
    const results = await runWithConcurrency(
      testCasesWithFiles,
      async ({ testCase }) =>
        await runWorkflowTestCase({
          client,
          testCase,
          timeoutMs: args.timeoutMs,
          seededCredentialTypes: seedResult.seededTypes,
          preRunWorkflowIds,
          claimedWorkflowIds,
          logger,
          keepWorkflows: args.keepWorkflows,
        }),
      MAX_CONCURRENT_BUILDS,
    );
    allRunResults.push(results);
  }

  return aggregateResults(allRunResults, args.iterations);
}
// ---------------------------------------------------------------------------
// eval-results.json output (same shape as CI PR comment expects)
// ---------------------------------------------------------------------------

/**
 * Flatten per-iteration runs into a single list of test-case results for the
 * HTML report. Previously we rendered only `tc.runs[0]`, which silently hid
 * iterations 2..N — a flaky scenario that passed once and failed twice would
 * appear clean in the uploaded artifact. For multi-iteration runs we prefix
 * each prompt with its iteration number so the cards are distinguishable at
 * a glance.
 */
function flattenRunsForReport(evaluation: MultiRunEvaluation): WorkflowTestCaseResult[] {
  if (evaluation.totalRuns <= 1) {
    return evaluation.testCases.map((tc) => tc.runs[0]);
  }
  return evaluation.testCases.flatMap((tc) =>
    tc.runs.map((run, iter) => ({
      ...run,
      testCase: {
        ...run.testCase,
        prompt: `[iter ${String(iter + 1)}/${String(evaluation.totalRuns)}] ${run.testCase.prompt}`,
      },
    })),
  );
}
interface AggregateMetrics {
  /** Number of test cases with at least one successful build across iterations. */
  built: number;
  /** Total scenarios across all test cases. */
  scenariosTotal: number;
  /** Mean pass@k across scenarios at k = totalRuns (0..1). */
  passAtK: number;
  /** Mean pass^k across scenarios at k = totalRuns (0..1). */
  passHatK: number;
  /** Index into each scenario's passAtK/passHatK array for k = totalRuns. */
  kIndex: number;
  /** Pass rate of each iteration formatted as e.g. "37% / 37% / 37%". */
  passRatePerIter: string;
}

function computeAggregateMetrics(evaluation: MultiRunEvaluation): AggregateMetrics {
  const { totalRuns, testCases } = evaluation;
  const allScenarios = testCases.flatMap((tc) => tc.scenarios);
  const total = allScenarios.length;
  const kIndex = Math.max(totalRuns - 1, 0);
  const built = testCases.filter((tc) => tc.buildSuccessCount > 0).length;
  const passAtK =
    total > 0 ? allScenarios.reduce((sum, s) => sum + (s.passAtK[kIndex] ?? 0), 0) / total : 0;
  const passHatK =
    total > 0 ? allScenarios.reduce((sum, s) => sum + (s.passHatK[kIndex] ?? 0), 0) / total : 0;
  return {
    built,
    scenariosTotal: total,
    passAtK,
    passHatK,
    kIndex,
    passRatePerIter: computePassRatePerIter(evaluation),
  };
}

/** Pass rate of each iteration formatted as e.g. "37% / 37% / 37%". */
function computePassRatePerIter(evaluation: MultiRunEvaluation): string {
  const { totalRuns, testCases } = evaluation;
  const allScenarios = testCases.flatMap((tc) => tc.scenarios);
  if (allScenarios.length === 0) return '';
  const rates: string[] = [];
  for (let i = 0; i < totalRuns; i++) {
    const passed = allScenarios.filter((s) => s.runs[i]?.success).length;
    rates.push(`${String(Math.round((passed / allScenarios.length) * 100))}%`);
  }
  return rates.join(' / ');
}
function writeEvalResults(
  evaluation: MultiRunEvaluation,
  duration: number,
  outputDir?: string,
): string {
  const { totalRuns, testCases } = evaluation;
  const allScenarios = testCases.flatMap((tc) => tc.scenarios);
  const totalScenariosCount = allScenarios.length;

  const passAtKCount =
    totalScenariosCount > 0
      ? allScenarios.reduce((sum, s) => sum + (s.passAtK[totalRuns - 1] ?? 0), 0)
      : 0;
  const passHatKCount =
    totalScenariosCount > 0
      ? allScenarios.reduce((sum, s) => sum + (s.passHatK[totalRuns - 1] ?? 0), 0)
      : 0;
  const metrics = computeAggregateMetrics(evaluation);

  const report = {
    timestamp: new Date().toISOString(),
@@ -112,10 +766,11 @@ function writeEvalResults(
    totalRuns,
    summary: {
      testCases: testCases.length,
      built: testCases.filter((tc) => tc.buildSuccessCount > 0).length,
      scenariosTotal: totalScenariosCount,
      passAtK: totalScenariosCount > 0 ? passAtKCount / totalScenariosCount : 0,
      passHatK: totalScenariosCount > 0 ? passHatKCount / totalScenariosCount : 0,
      built: metrics.built,
      scenariosTotal: metrics.scenariosTotal,
      passAtK: metrics.passAtK,
      passHatK: metrics.passHatK,
      passRatePerIter: metrics.passRatePerIter,
    },
    testCases: testCases.map((tc) => ({
      name: tc.testCase.prompt.slice(0, 70),
@@ -125,29 +780,36 @@ function writeEvalResults(
        name: sa.scenario.name,
        passCount: sa.passCount,
        totalRuns,
        passAtK: sa.passAtK[totalRuns - 1] ?? 0,
        passHatK: sa.passHatK[totalRuns - 1] ?? 0,
        passAtK: sa.passAtK[metrics.kIndex] ?? 0,
        passHatK: sa.passHatK[metrics.kIndex] ?? 0,
        runs: sa.runs.map((sr) => ({
          passed: sr.success,
          score: sr.score,
          reasoning: sr.reasoning,
          failureCategory: sr.failureCategory,
          rootCause: sr.rootCause,
          execErrors: sr.evalResult?.errors ?? [],
          evalResult: sr.evalResult,
        })),
      })),
    })),
  };

  const dir = outputDir ?? process.cwd();
  mkdirSync(dir, { recursive: true });
  const outputPath = join(dir, 'eval-results.json');
  const targetDir = outputDir ?? process.cwd();
  mkdirSync(targetDir, { recursive: true });
  const outputPath = join(targetDir, 'eval-results.json');
  writeFileSync(outputPath, JSON.stringify(report, null, 2));
  return outputPath;
}

// ---------------------------------------------------------------------------
// Console summary
// ---------------------------------------------------------------------------

function printSummary(evaluation: MultiRunEvaluation): void {
  const { totalRuns, testCases } = evaluation;
  const multiRun = totalRuns > 1;
  const metrics = computeAggregateMetrics(evaluation);

  console.log('\n=== Workflow Eval Results ===\n');
  for (const tc of testCases) {
@ -166,8 +828,8 @@ function printSummary(evaluation: MultiRunEvaluation): void {

		for (const sa of tc.scenarios) {
			if (multiRun) {
				const passAtK = Math.round((sa.passAtK[totalRuns - 1] ?? 0) * 100);
				const passHatK = Math.round((sa.passHatK[totalRuns - 1] ?? 0) * 100);
				const passAtK = Math.round((sa.passAtK[metrics.kIndex] ?? 0) * 100);
				const passHatK = Math.round((sa.passHatK[metrics.kIndex] ?? 0) * 100);
				console.log(
					`  ${sa.scenario.name}: ${String(sa.passCount)}/${String(totalRuns)} passed` +
						` | pass@${String(totalRuns)}: ${String(passAtK)}% | pass^${String(totalRuns)}: ${String(passHatK)}%`,

@@ -175,52 +837,34 @@ function printSummary(evaluation: MultiRunEvaluation): void {

			} else {
				const sr = sa.runs[0];
				const icon = sr.success ? '✓' : '✗';
				const category = sr.failureCategory ? ` [${sr.failureCategory}]` : '';
				console.log(
					`  ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'} (${String(sr.score * 100)}%)`,
					`  ${icon} ${sr.scenario.name}: ${sr.success ? 'PASS' : 'FAIL'}${category} (${String(sr.score * 100)}%)`,
				);
				if (!sr.success) {
					console.log(`    ${sr.reasoning.slice(0, 120)}`);
					const execErrors = sr.evalResult?.errors ?? [];
					if (execErrors.length > 0) {
						console.log(`    Error: ${execErrors.join('; ').slice(0, 200)}`);
					}
					console.log(`    Diagnosis: ${sr.reasoning.slice(0, 200)}`);
				}
			}
		}
		console.log('');
	}

	// Aggregate metrics for multi-run
	if (multiRun) {
		console.log(
			`${String(metrics.built)}/${String(testCases.length)} built | pass@${String(totalRuns)}: ${String(Math.round(metrics.passAtK * 100))}% | pass^${String(totalRuns)}: ${String(Math.round(metrics.passHatK * 100))}% | iterations: ${metrics.passRatePerIter}`,
		);
	} else {
		const allScenarios = testCases.flatMap((tc) => tc.scenarios);
		const total = allScenarios.length;
		const avgPassAtK =
			total > 0
				? Math.round(
						(allScenarios.reduce((sum, s) => sum + (s.passAtK[totalRuns - 1] ?? 0), 0) / total) *
							100,
					)
				: 0;
		const avgPassHatK =
			total > 0
				? Math.round(
						(allScenarios.reduce((sum, s) => sum + (s.passHatK[totalRuns - 1] ?? 0), 0) / total) *
							100,
					)
				: 0;

		console.log('\n=== Aggregate Metrics ===\n');
		console.log(`  pass@${String(totalRuns)}: ${String(avgPassAtK)}%`);
		console.log(`  pass^${String(totalRuns)}: ${String(avgPassHatK)}%`);
		const passed = allScenarios.filter((s) => s.runs[0]?.success).length;
		const total = metrics.scenariosTotal;
		console.log(
			`${String(metrics.built)}/${String(testCases.length)} built | ${String(passed)}/${String(total)} passed (${String(total > 0 ? Math.round((passed / total) * 100) : 0)}%)`,
		);
	}

	// Totals
	const allScenarios = testCases.flatMap((tc) => tc.scenarios);
	const total = allScenarios.length;
	const built = testCases.filter((tc) => tc.buildSuccessCount > 0).length;
	const passedTotal = multiRun
		? allScenarios.reduce((sum, s) => sum + s.passCount, 0)
		: allScenarios.filter((s) => s.runs[0]?.success).length;
	const totalAttempts = multiRun ? total * totalRuns : total;

	console.log(
		`\n${String(built)}/${String(testCases.length)} built | ${String(passedTotal)}/${String(totalAttempts)} passed (${String(totalAttempts > 0 ? Math.round((passedTotal / totalAttempts) * 100) : 0)}%)`,
	);
}

main().catch((error) => {

@@ -75,15 +75,10 @@ interface ThreadStatus {

export class N8nClient {
	private sessionCookie?: string;

	constructor(readonly baseUrl: string) {}
	constructor(private readonly baseUrl: string) {}

	// -- Auth ----------------------------------------------------------------

	/** Set the session cookie directly (for sharing across workers). */
	setSessionCookie(cookie: string): void {
		this.sessionCookie = cookie;
	}

	/**
	 * Authenticate with the n8n instance via POST /rest/login.
	 * Captures the `n8n-auth` cookie for subsequent requests.

@@ -1,8 +1,14 @@

import { readFileSync, readdirSync } from 'fs';
import { join } from 'path';
import { basename, join } from 'path';

import type { WorkflowTestCase } from '../../types';

export interface WorkflowTestCaseWithFile {
	testCase: WorkflowTestCase;
	/** Filename without extension, e.g. "contact-form-automation" */
	fileSlug: string;
}

function parseTestCaseFile(filePath: string): WorkflowTestCase {
	const content = readFileSync(filePath, 'utf-8');
	try {

@@ -14,11 +20,19 @@ function parseTestCaseFile(filePath: string): WorkflowTestCase {

	}
}

export function loadWorkflowTestCases(filter?: string): WorkflowTestCase[] {
function getJsonFiles(filter?: string): string[] {
	const dir = __dirname;
	let files = readdirSync(dir).filter((f) => f.endsWith('.json'));
	if (filter) {
		files = files.filter((f) => f.toLowerCase().includes(filter.toLowerCase()));
	}
	return files.map((f) => parseTestCaseFile(join(dir, f)));
	return files.map((f) => join(dir, f));
}

/** Load test cases with their file slugs (for LangSmith dataset sync derived IDs). */
export function loadWorkflowTestCasesWithFiles(filter?: string): WorkflowTestCaseWithFile[] {
	return getJsonFiles(filter).map((f) => ({
		testCase: parseTestCaseFile(f),
		fileSlug: basename(f, '.json'),
	}));
}
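
// e.g. a file "contact-form-automation.json" yields
// { testCase: <parsed JSON>, fileSlug: 'contact-form-automation' }.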

@@ -10,7 +10,8 @@ export { N8nClient } from './clients/n8n-client';

export type { WorkflowResponse, WorkflowNodeResponse, ExecutionDetail } from './clients/n8n-client';

// -- Test case data --
export { loadWorkflowTestCases } from './data/workflows';
export { loadWorkflowTestCasesWithFiles } from './data/workflows';
export type { WorkflowTestCaseWithFile } from './data/workflows';

// -- Credentials --
export { seedCredentials, cleanupCredentials } from './credentials/seeder';

323
packages/@n8n/instance-ai/evaluations/langsmith/dataset-sync.ts
Normal file

@@ -0,0 +1,323 @@

// ---------------------------------------------------------------------------
// LangSmith dataset sync
//
// Syncs JSON test case files from the repo to a LangSmith dataset.
// Uses derived IDs (fileSlug/scenarioName) so examples are stable across
// runs, enabling experiment comparison over time.
// ---------------------------------------------------------------------------

import { createHash } from 'crypto';
import type { Client } from 'langsmith';
import type { Example, KVMap } from 'langsmith/schemas';
import { z } from 'zod';

import { loadWorkflowTestCasesWithFiles } from '../data/workflows';
import type { EvalLogger } from '../harness/logger';

// Bump this if existing IDs get tombstoned by LangSmith soft-delete and need
// to be regenerated fresh. UUIDs for the same derivedId stay stable within a
// version, so experiment comparison still works.
const UUID_VERSION = 'v2';

/**
 * Generate a deterministic UUID from a string.
 * Same input always produces the same UUID, so example IDs are stable across runs.
 */
function deterministicUuid(input: string): string {
	const hash = createHash('sha256').update(`${UUID_VERSION}:${input}`).digest('hex');
	// Format as UUID v4 shape (8-4-4-4-12)
	return [
		hash.slice(0, 8),
		hash.slice(8, 12),
		'4' + hash.slice(13, 16),
		'8' + hash.slice(17, 20),
		hash.slice(20, 32),
	].join('-');
}
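
// e.g. deterministicUuid('contact-form-automation/happy-path') produces the
// same UUID-shaped string on every call (the scenario name here is a
// hypothetical example); bumping UUID_VERSION changes every generated ID.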

/**
 * Shape of the inputs passed to the target function for each scenario.
 * `testCaseFile` is included so the LangSmith Inputs column shows which
 * workflow a scenario belongs to (metadata is hidden by default).
 */
export const datasetExampleInputsSchema = z.object({
	prompt: z.string(),
	testCaseFile: z.string(),
	scenarioName: z.string(),
	scenarioDescription: z.string(),
	dataSetup: z.string(),
	successCriteria: z.string(),
});
export type DatasetExampleInputs = z.infer<typeof datasetExampleInputsSchema>;

/** Metadata attached to each example for filtering / grouping in the UI. */
export const datasetExampleMetadataSchema = z.object({
	/** Duplicated from inputs so the LangSmith UI can group by it (only metadata keys are groupable). */
	testCaseFile: z.string(),
	complexity: z.enum(['simple', 'medium', 'complex']).optional(),
	tags: z.array(z.string()).optional(),
	triggerType: z.enum(['manual', 'webhook', 'schedule', 'form']).optional(),
});
export type DatasetExampleMetadata = z.infer<typeof datasetExampleMetadataSchema>;

/**
 * Sync JSON test cases to a LangSmith dataset.
 *
 * - Creates the dataset if it doesn't exist
 * - Diffs local scenarios against existing examples
 * - Creates, updates, or deletes examples to match
 * - Orders examples round-robin across test cases for optimal parallelism
 * - Assigns each example to a split (test case file slug) for UI filtering
 *
 * Returns the dataset name for use with evaluate().
 */
export async function syncDataset(
	lsClient: Client,
	datasetName: string,
	logger: EvalLogger,
	filter?: string,
): Promise<string> {
	const testCasesWithFiles = loadWorkflowTestCasesWithFiles(filter);

	// Round-robin ordering ensures evaluate() triggers diverse builds early
	// rather than burning all concurrency slots on one test case.
	const scenarios = buildRoundRobinScenarios(testCasesWithFiles);

	logger.info(
		`Dataset sync: ${String(scenarios.length)} scenarios from ${String(testCasesWithFiles.length)} test cases`,
	);

	// Create or get dataset. `hasDataset` distinguishes "not found" from auth/
	// network errors, so we only create when it genuinely doesn't exist.
	let datasetId: string;
	if (await lsClient.hasDataset({ datasetName })) {
		const dataset = await lsClient.readDataset({ datasetName });
		datasetId = dataset.id;
	} else {
		const dataset = await lsClient.createDataset(datasetName, {
			description: 'Instance AI workflow execution evaluations (synced from repo JSON files)',
		});
		datasetId = dataset.id;
		logger.info(`Created dataset: ${datasetName}`);
	}

	// List existing examples, keyed by derived ID (testCaseFile/scenarioName from inputs).
	const existingByDerivedId = new Map<string, Example>();
	for await (const example of lsClient.listExamples({ datasetId })) {
		const inputs = existingInputsSchema.safeParse(example.inputs);
		if (!inputs.success) continue;
		existingByDerivedId.set(`${inputs.data.testCaseFile}/${inputs.data.scenarioName}`, example);
	}

	// Diff and sync
	const currentIds = new Set<string>();
	const toCreate: Array<{ id: string; inputs: KVMap; metadata: KVMap; split: string }> = [];
	const toUpdate: Array<{ id: string; inputs: KVMap; metadata: KVMap; split: string }> = [];

	for (const scenario of scenarios) {
		const derivedId = `${scenario.testCaseFile}/${scenario.scenarioName}`;
		currentIds.add(derivedId);

		const inputs: DatasetExampleInputs = {
			prompt: scenario.prompt,
			testCaseFile: scenario.testCaseFile,
			scenarioName: scenario.scenarioName,
			scenarioDescription: scenario.scenarioDescription,
			dataSetup: scenario.dataSetup,
			successCriteria: scenario.successCriteria,
		};

		const metadata: DatasetExampleMetadata = {
			testCaseFile: scenario.testCaseFile,
			complexity: scenario.complexity,
			tags: scenario.tags,
			triggerType: scenario.triggerType,
		};

		const existingExample = existingByDerivedId.get(derivedId);
		if (existingExample) {
			if (
				hasInputsChanged(existingExample.inputs, inputs) ||
				hasMetadataChanged(existingExample.metadata, metadata)
			) {
				toUpdate.push({
					id: existingExample.id,
					inputs,
					metadata,
					split: scenario.testCaseFile,
				});
			}
		} else {
			toCreate.push({
				id: deterministicUuid(derivedId),
				inputs,
				metadata,
				split: scenario.testCaseFile,
			});
		}
	}

	// Only delete stale examples on a full sync (no filter). With a filter,
	// we're only syncing a subset and mustn't delete the others.
	// LangSmith also soft-deletes, which tombstones the UUID and prevents
	// recreation with the same ID on a later full run.
	const toDelete: string[] = [];
	if (!filter) {
		for (const [derivedId, example] of existingByDerivedId) {
			if (!currentIds.has(derivedId)) {
				toDelete.push(example.id);
			}
		}
	}

	if (toCreate.length > 0) {
		await lsClient.createExamples(
			toCreate.map((e) => ({
				id: e.id,
				inputs: e.inputs,
				metadata: e.metadata,
				split: e.split,
				dataset_id: datasetId,
			})),
		);
		logger.info(`  Created ${String(toCreate.length)} example(s)`);
	}

	if (toUpdate.length > 0) {
		await lsClient.updateExamples(
			toUpdate.map((e) => ({
				id: e.id,
				inputs: e.inputs,
				metadata: e.metadata,
				split: e.split,
				dataset_id: datasetId,
			})),
		);
		logger.info(`  Updated ${String(toUpdate.length)} example(s)`);
	}

	if (toDelete.length > 0) {
		await lsClient.deleteExamples(toDelete);
		logger.info(`  Deleted ${String(toDelete.length)} stale example(s)`);
	}

	if (toCreate.length === 0 && toUpdate.length === 0 && toDelete.length === 0) {
		logger.info('  Dataset up to date');
	}

	return datasetName;
}
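
// Minimal usage sketch (caller-side names are assumptions, not part of this
// module):
//
//   import { Client } from 'langsmith';
//   const lsClient = new Client({ apiKey: process.env.LANGSMITH_API_KEY });
//   const datasetName = await syncDataset(lsClient, 'workflow-evals', logger);
//   // `datasetName` can then be passed to evaluate() as its dataset.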

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

interface FlatScenario {
	prompt: string;
	testCaseFile: string;
	scenarioName: string;
	scenarioDescription: string;
	dataSetup: string;
	successCriteria: string;
	complexity?: 'simple' | 'medium' | 'complex';
	tags?: string[];
	triggerType?: 'manual' | 'webhook' | 'schedule' | 'form';
}

/**
 * Flatten test cases into scenarios ordered round-robin across test cases.
 *
 * Input: [tc1(s1,s2,s3), tc2(s1,s2), tc3(s1)]
 * Output: [tc1/s1, tc2/s1, tc3/s1, tc1/s2, tc2/s2, tc1/s3]
 */
function buildRoundRobinScenarios(
	testCasesWithFiles: Array<{
		testCase: {
			prompt: string;
			complexity?: 'simple' | 'medium' | 'complex';
			tags?: string[];
			triggerType?: 'manual' | 'webhook' | 'schedule' | 'form';
			scenarios: Array<{
				name: string;
				description: string;
				dataSetup: string;
				successCriteria: string;
			}>;
		};
		fileSlug: string;
	}>,
): FlatScenario[] {
	const result: FlatScenario[] = [];
	const maxScenarios = Math.max(...testCasesWithFiles.map((tc) => tc.testCase.scenarios.length), 0);

	for (let i = 0; i < maxScenarios; i++) {
		for (const { testCase, fileSlug } of testCasesWithFiles) {
			const scenario = testCase.scenarios[i];
			if (scenario) {
				result.push({
					prompt: testCase.prompt,
					testCaseFile: fileSlug,
					scenarioName: scenario.name,
					scenarioDescription: scenario.description,
					dataSetup: scenario.dataSetup,
					successCriteria: scenario.successCriteria,
					complexity: testCase.complexity,
					tags: testCase.tags,
					triggerType: testCase.triggerType,
				});
			}
		}
	}

	return result;
}

// Schemas for reading existing LangSmith example data, which is typed as an
// open KVMap by the SDK. We only parse the fields we care about for diffing.

const existingInputsSchema = z
	.object({
		prompt: z.string().default(''),
		testCaseFile: z.string().default(''),
		scenarioName: z.string().default(''),
		scenarioDescription: z.string().default(''),
		dataSetup: z.string().default(''),
		successCriteria: z.string().default(''),
	})
	.passthrough();

const existingMetadataSchema = z
	.object({
		testCaseFile: z.string().default(''),
		complexity: z.string().default(''),
		triggerType: z.string().default(''),
		tags: z.array(z.string()).default([]),
	})
	.passthrough();

function hasInputsChanged(existing: unknown, incoming: DatasetExampleInputs): boolean {
	// Treat unparseable existing data as changed so we overwrite with fresh
	// values rather than aborting the whole sync on one malformed row.
	const parsed = existingInputsSchema.safeParse(existing ?? {});
	if (!parsed.success) return true;
	const e = parsed.data;
	return (
		e.prompt !== incoming.prompt ||
		e.testCaseFile !== incoming.testCaseFile ||
		e.dataSetup !== incoming.dataSetup ||
		e.successCriteria !== incoming.successCriteria ||
		e.scenarioDescription !== incoming.scenarioDescription
	);
}

function hasMetadataChanged(existing: unknown, incoming: DatasetExampleMetadata): boolean {
	const parsed = existingMetadataSchema.safeParse(existing ?? {});
	if (!parsed.success) return true;
	const e = parsed.data;
	return (
		e.testCaseFile !== incoming.testCaseFile ||
		e.complexity !== (incoming.complexity ?? '') ||
		e.triggerType !== (incoming.triggerType ?? '') ||
		JSON.stringify(e.tags) !== JSON.stringify(incoming.tags ?? [])
	);
}
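
// Note: the tags comparison via JSON.stringify is order-sensitive, so
// ['a', 'b'] vs ['b', 'a'] counts as changed. That is acceptable here since
// tags always come from the same JSON files in the same order.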

501
packages/@n8n/instance-ai/evaluations/report/workflow-report.ts
Normal file

@@ -0,0 +1,501 @@

/**
 * HTML report generator for workflow test case evaluations.
 *
 * Produces a self-contained HTML file optimized for three tasks:
 * 1. Triage — which scenarios failed? (seconds)
 * 2. Diagnose — why did they fail? (minutes)
 * 3. Compare — what changed between runs? (cross-report)
 */

import fs from 'fs';
import path from 'path';

import type { WorkflowTestCaseResult, ScenarioResult } from '../types';

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

function escapeHtml(str: string): string {
	return str
		.replace(/&/g, '&amp;')
		.replace(/</g, '&lt;')
		.replace(/>/g, '&gt;')
		.replace(/"/g, '&quot;')
		.replace(/'/g, '&#39;');
}
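
// e.g. escapeHtml('<img src="x">') returns '&lt;img src=&quot;x&quot;&gt;'.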

// ---------------------------------------------------------------------------
// Scenario rendering
// ---------------------------------------------------------------------------

function renderScenario(sr: ScenarioResult, index: number): string {
	const icon = sr.success ? '✓' : '✗';
	const statusClass = sr.success ? 'pass' : 'fail';

	// Passing scenarios: compact one-liner with collapsible detail
	if (sr.success) {
		const summary = sr.reasoning ? sr.reasoning.slice(0, 150) : 'All checks passed';
		return `<div class="scenario ${statusClass}">
	<div class="scenario-header" onclick="this.parentElement.classList.toggle('expanded')">
		<span class="scenario-icon ${statusClass}">${icon}</span>
		<span class="scenario-name">${escapeHtml(sr.scenario.name)}</span>
		<span class="scenario-summary-inline">${escapeHtml(summary)}${sr.reasoning && sr.reasoning.length > 150 ? '...' : ''}</span>
	</div>
	<div class="scenario-detail" id="scenario-${String(index)}">
		${renderScenarioDetail(sr)}
	</div>
</div>`;
	}

	// Failing scenarios: show error prominently, detail expanded by default
	return `<div class="scenario ${statusClass} expanded">
	<div class="scenario-header" onclick="this.parentElement.classList.toggle('expanded')">
		<span class="scenario-icon ${statusClass}">${icon}</span>
		<span class="scenario-name">${escapeHtml(sr.scenario.name)}</span>
		<span class="scenario-desc">${escapeHtml(sr.scenario.description)}</span>
	</div>
	<div class="scenario-detail" id="scenario-${String(index)}">
		${renderScenarioDetail(sr)}
	</div>
</div>`;
}

function renderScenarioDetail(sr: ScenarioResult): string {
	let html = '';

	if (!sr.evalResult) {
		if (sr.reasoning) {
			html += `<div class="diagnosis">${escapeHtml(sr.reasoning)}</div>`;
		}
		return html;
	}

	// Failure category badge
	if (!sr.success && sr.failureCategory) {
		const catClass =
			sr.failureCategory === 'builder_issue'
				? 'warn'
				: sr.failureCategory === 'mock_issue'
					? 'fail'
					: 'info';
		html += `<div class="category-badge category-${catClass}">${escapeHtml(sr.failureCategory)}${sr.rootCause ? ': ' + escapeHtml(sr.rootCause) : ''}</div>`;
	}

	// 1. Error — what broke
	if (sr.evalResult.errors.length > 0) {
		html += `<div class="error-box">${escapeHtml(sr.evalResult.errors.join('; '))}</div>`;
	}

	// Phase 1 warnings
	const warnings = sr.evalResult.hints?.warnings ?? [];
	if (warnings.length > 0) {
		html += `<div class="warning-box">${escapeHtml(warnings.join('; '))}</div>`;
	}

	// 2. Diagnosis — verifier's reasoning
	if (sr.reasoning) {
		html += '<details class="section" open><summary>Diagnosis</summary>';
		html += `<div class="diagnosis">${escapeHtml(sr.reasoning)}</div>`;
		html += '</details>';
	}

	// 3. Mock data plan — Phase 1 hints
	if (sr.evalResult.hints) {
		html += '<details class="section"><summary>Mock data plan</summary>';
		const { globalContext, triggerContent, nodeHints } = sr.evalResult.hints;

		if (globalContext) {
			html += '<div class="subsection-label">Global context</div>';
			html += `<div class="hint-text">${escapeHtml(globalContext)}</div>`;
		}

		if (Object.keys(triggerContent ?? {}).length > 0) {
			html += '<div class="subsection-label">Trigger content</div>';
			html += `<pre class="json-block"><code>${escapeHtml(JSON.stringify(triggerContent, null, 2))}</code></pre>`;
		} else {
			html +=
				'<div class="warning-inline">No trigger content generated \u2014 start node has no input data</div>';
		}

		if (nodeHints && Object.keys(nodeHints).length > 0) {
			html += '<div class="subsection-label">Per-node hints</div>';
			for (const [nodeName, hint] of Object.entries(nodeHints)) {
				html += `<details class="node-hint"><summary>${escapeHtml(nodeName)}</summary>`;
				html += `<div class="hint-text">${escapeHtml(hint)}</div>`;
				html += '</details>';
			}
		}
		html += '</details>';
	}

	// 4. Execution trace — per-node results
	const nodeEntries = Object.entries(sr.evalResult.nodeResults);
	if (nodeEntries.length > 0) {
		html += '<details class="section"><summary>Execution trace</summary>';
		html +=
			'<div class="trace-legend"><span class="node-mode-mocked">mocked</span> <span class="node-mode-pinned">pinned</span> <span class="node-mode-real">real</span></div>';

		for (const [nodeName, nr] of nodeEntries) {
			const modeClass = `node-mode-${nr.executionMode}`;
			const hasError = nr.configIssues && Object.keys(nr.configIssues).length > 0;
			const configWarning = hasError
				? `<span class="build-issue">Build issue: ${escapeHtml(Object.values(nr.configIssues!).flat().join('; '))}</span>`
				: '';

			html += '<div class="trace-node">';
			html += '<div class="trace-node-header">';
			html += `<span class="${modeClass}">[${nr.executionMode}]</span> <strong>${escapeHtml(nodeName)}</strong>`;
			if (nr.interceptedRequests.length > 0) {
				html += ` <span class="request-count">${String(nr.interceptedRequests.length)} request(s)</span>`;
			}
			html += '</div>';
			if (configWarning) html += configWarning;

			// Intercepted requests
			for (const req of nr.interceptedRequests) {
				html += '<div class="request-pair">';
				html += '<div class="request-header">Request sent</div>';
				html += `<div class="request-method">${escapeHtml(req.method)} ${escapeHtml(req.url)}</div>`;
				if (req.requestBody) {
					html += `<pre class="json-block json-sm"><code>${escapeHtml(JSON.stringify(req.requestBody, null, 2))}</code></pre>`;
				}
				html += '<div class="response-header">Mock returned</div>';
				if (req.mockResponse) {
					html += `<pre class="json-block json-sm"><code>${escapeHtml(JSON.stringify(req.mockResponse, null, 2))}</code></pre>`;
				} else {
					html += '<div class="muted">no mock response</div>';
				}
				html += '</div>';
			}

			// Node output
			if (nr.output !== null && nr.output !== undefined) {
				html += '<details class="node-output-toggle"><summary>Node output</summary>';
				html += `<pre class="json-block"><code>${escapeHtml(JSON.stringify(nr.output, null, 2))}</code></pre>`;
				html += '</details>';
			} else {
				html += '<div class="muted">no output</div>';
			}

			html += '</div>';
		}
		html += '</details>';
	}

	return html;
}

// ---------------------------------------------------------------------------
// Workflow summary
// ---------------------------------------------------------------------------

function renderWorkflowSummary(result: WorkflowTestCaseResult): string {
	const firstEval = result.scenarioResults[0]?.evalResult;

	let nodesHtml = '';
	if (firstEval) {
		const nodes = Object.entries(firstEval.nodeResults);
		if (nodes.length > 0) {
			const nodeList = nodes
				.map(([name, nr]) => {
					const mode = nr.executionMode;
					const requests = nr.interceptedRequests.length;
					const issues = nr.configIssues ? Object.values(nr.configIssues).flat().join('; ') : '';
					let line = `<span class="node-mode-${mode}">[${mode}]</span> ${escapeHtml(name)}`;
					if (requests > 0) line += ` <span class="muted">(${String(requests)} req)</span>`;
					if (issues)
						line += ` <span class="build-issue">Build issue: ${escapeHtml(issues)}</span>`;
					return `<li>${line}</li>`;
				})
				.join('');
			nodesHtml = `<details class="section"><summary>Built workflow (${String(nodes.length)} nodes)</summary><ul class="node-list">${nodeList}</ul></details>`;
		}
	}

	let jsonHtml = '';
	if (result.workflowJson) {
		const raw = JSON.stringify(result.workflowJson, null, 2);
		jsonHtml = `<details class="section"><summary>Agent output (raw JSON)</summary><pre class="json-block"><code>${escapeHtml(raw)}</code></pre></details>`;
	}

	return nodesHtml + jsonHtml;
}

// ---------------------------------------------------------------------------
// Test case rendering
// ---------------------------------------------------------------------------

function renderTestCase(result: WorkflowTestCaseResult, tcIndex: number): string {
	const passCount = result.scenarioResults.filter((sr) => sr.success).length;
	const totalCount = result.scenarioResults.length;
	const allPass = passCount === totalCount && totalCount > 0;
	const statusClass = result.workflowBuildSuccess ? (allPass ? 'pass' : 'mixed') : 'fail';

	const buildBadge = result.workflowBuildSuccess
		? '<span class="badge badge-pass">BUILT</span>'
		: '<span class="badge badge-fail">BUILD FAILED</span>';

	const scoreBadge =
		totalCount > 0
			? `<span class="badge badge-${allPass ? 'pass' : 'fail'}">${String(passCount)}/${String(totalCount)}</span>`
			: '';

	const prompt = result.testCase.prompt;
	const truncatedPrompt = prompt.length > 100 ? prompt.slice(0, 100) + '...' : prompt;

	// Inline scenario indicators for quick triage without expanding
	const scenarioIndicators = result.scenarioResults
		.map(
			(sr) =>
				`<span class="scenario-indicator ${sr.success ? 'pass' : 'fail'}" title="${escapeHtml(sr.scenario.name)}">${sr.success ? '✓' : '✗'} ${escapeHtml(sr.scenario.name)}</span>`,
		)
		.join(' ');

	let scenariosHtml = '';
	if (result.scenarioResults.length > 0) {
		scenariosHtml = result.scenarioResults
			.map((sr, i) => renderScenario(sr, tcIndex * 100 + i))
			.join('');
	} else if (!result.workflowBuildSuccess) {
		const errorDetail = result.buildError
			? `<div class="error-box">${escapeHtml(result.buildError)}</div>`
			: '';
		scenariosHtml = `<div class="muted">Workflow failed to build — no scenarios executed</div>${errorDetail}`;
	}

	return `<div class="test-case ${statusClass}">
	<div class="test-case-header" onclick="this.parentElement.classList.toggle('expanded')">
		<div class="test-case-title">
			${buildBadge} ${scoreBadge}
			<span class="test-case-prompt">${escapeHtml(truncatedPrompt)}</span>
		</div>
		<div class="test-case-meta">
			<span class="badge badge-tag">${escapeHtml(result.testCase.complexity)}</span>
			${result.workflowId ? `<span class="workflow-id">${escapeHtml(result.workflowId)}</span>` : ''}
		</div>
		<div class="scenario-indicators">${scenarioIndicators}</div>
	</div>
	<div class="test-case-detail">
		<details class="section"><summary>Prompt</summary><div class="prompt-text">${escapeHtml(prompt)}</div></details>
		${renderWorkflowSummary(result)}
		${scenariosHtml}
	</div>
</div>`;
}

// ---------------------------------------------------------------------------
// Full report
// ---------------------------------------------------------------------------

export function generateWorkflowReport(results: WorkflowTestCaseResult[]): string {
	const totalTestCases = results.length;
	const builtCount = results.filter((r) => r.workflowBuildSuccess).length;
	const allScenarios = results.flatMap((r) => r.scenarioResults);
	const passCount = allScenarios.filter((sr) => sr.success).length;
	const failCount = allScenarios.length - passCount;
	const totalScenarios = allScenarios.length;
	const passRate = totalScenarios > 0 ? Math.round((passCount / totalScenarios) * 100) : 0;

	return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Workflow evaluation report</title>
<style>
:root {
	--bg-primary: #0d1117;
	--bg-secondary: #161b22;
	--bg-tertiary: #1c2129;
	--border: #30363d;
	--border-light: #21262d;
	--text-primary: #f0f6fc;
	--text-secondary: #c9d1d9;
	--text-muted: #8b949e;
	--color-pass: #3fb950;
	--color-fail: #f85149;
	--color-warn: #d29922;
	--color-info: #58a6ff;
	--color-purple: #bc8cff;
	--color-pass-bg: #23863622;
	--color-fail-bg: #da363322;
	--color-warn-bg: #d2992222;
}

* { margin: 0; padding: 0; box-sizing: border-box; }
body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: var(--bg-primary); color: var(--text-secondary); padding: 24px; max-width: 1400px; margin: 0 auto; font-size: 14px; line-height: 1.5; }

/* Header */
h1 { color: var(--text-primary); font-size: 20px; margin-bottom: 2px; }
.subtitle { color: var(--text-muted); font-size: 13px; margin-bottom: 20px; }

/* Dashboard */
.dashboard { display: flex; gap: 12px; margin-bottom: 24px; flex-wrap: wrap; align-items: stretch; }
.stat-card { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 8px; padding: 14px 20px; min-width: 120px; }
.stat-card .label { color: var(--text-muted); font-size: 12px; }
.stat-card .value { color: var(--text-primary); font-size: 26px; font-weight: 700; margin-top: 2px; }
.stat-card .value.pass { color: var(--color-pass); }
.stat-card .value.fail { color: var(--color-fail); }
.stat-card .value.mixed { color: var(--color-warn); }

/* Toolbar */
.toolbar { display: flex; gap: 8px; margin-bottom: 16px; }
.toolbar button { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 6px; color: var(--text-secondary); padding: 6px 12px; font-size: 12px; cursor: pointer; }
.toolbar button:hover { background: var(--bg-tertiary); color: var(--text-primary); }
.toolbar button.active { border-color: var(--color-info); color: var(--color-info); }

/* Badges */
.badge { display: inline-block; padding: 2px 8px; border-radius: 12px; font-size: 11px; font-weight: 600; margin-right: 4px; }
.badge-pass { background: var(--color-pass-bg); color: var(--color-pass); }
.badge-fail { background: var(--color-fail-bg); color: var(--color-fail); }
.badge-tag { background: var(--border); color: var(--text-muted); }

/* Test case cards */
.test-case { background: var(--bg-secondary); border: 1px solid var(--border); border-radius: 8px; margin-bottom: 10px; overflow: hidden; }
.test-case.pass { border-left: 3px solid var(--color-pass); }
.test-case.fail { border-left: 3px solid var(--color-fail); }
.test-case.mixed { border-left: 3px solid var(--color-warn); }
.test-case-header { padding: 12px 16px; cursor: pointer; }
.test-case-header:hover { background: var(--bg-tertiary); }
.test-case-title { display: flex; align-items: center; gap: 8px; margin-bottom: 4px; }
.test-case-prompt { color: var(--text-primary); font-weight: 500; font-size: 13px; }
.test-case-meta { display: flex; align-items: center; gap: 6px; margin-bottom: 6px; }
.workflow-id { color: var(--text-muted); font-size: 11px; font-family: monospace; }
.scenario-indicators { display: flex; gap: 8px; flex-wrap: wrap; }
.scenario-indicator { font-size: 11px; font-family: monospace; }
.scenario-indicator.pass { color: var(--color-pass); }
.scenario-indicator.fail { color: var(--color-fail); }
.test-case-detail { display: none; padding: 0 16px 16px; }
.test-case.expanded .test-case-detail { display: block; }

/* Sections (collapsible) */
.section { margin: 8px 0; }
.section > summary { cursor: pointer; color: var(--color-info); font-size: 12px; font-weight: 600; padding: 4px 0; }
.section > summary:hover { text-decoration: underline; }

/* Scenarios */
.scenario { border: 1px solid var(--border-light); border-radius: 6px; margin-bottom: 6px; overflow: hidden; }
.scenario-header { padding: 8px 12px; cursor: pointer; display: flex; align-items: center; gap: 8px; font-size: 13px; }
.scenario-header:hover { background: var(--bg-tertiary); }
.scenario-icon { font-weight: bold; font-size: 14px; min-width: 16px; }
.scenario-icon.pass { color: var(--color-pass); }
.scenario-icon.fail { color: var(--color-fail); }
.scenario-name { color: var(--text-primary); font-weight: 600; }
.scenario-desc { color: var(--text-muted); font-size: 12px; }
.scenario-summary-inline { color: var(--text-muted); font-size: 12px; flex: 1; }
.scenario-detail { display: none; padding: 10px 12px; border-top: 1px solid var(--border-light); background: var(--bg-primary); }
.scenario.expanded .scenario-detail { display: block; }

/* Error and warning boxes */
.error-box { color: var(--color-fail); font-size: 12px; padding: 6px 10px; background: var(--color-fail-bg); border-radius: 4px; margin-bottom: 8px; border-left: 3px solid var(--color-fail); }
.warning-box { color: var(--color-warn); font-size: 12px; padding: 6px 10px; background: var(--color-warn-bg); border-radius: 4px; margin-bottom: 8px; border-left: 3px solid var(--color-warn); }
.warning-inline { color: var(--color-warn); font-size: 11px; margin: 4px 0; }
.build-issue { color: var(--color-warn); font-size: 11px; display: block; margin-top: 2px; }

/* Diagnosis */
.diagnosis { color: var(--text-secondary); font-size: 12px; line-height: 1.6; padding: 6px 0; }

/* Prompt */
.prompt-text { color: var(--text-secondary); font-size: 13px; line-height: 1.6; padding: 10px; background: var(--bg-primary); border: 1px solid var(--border); border-radius: 6px; white-space: pre-wrap; }

/* Execution trace */
.trace-legend { font-size: 11px; margin-bottom: 8px; display: flex; gap: 12px; }
.trace-node { border: 1px solid var(--border-light); border-radius: 4px; margin-bottom: 6px; padding: 8px; }
.trace-node-header { font-size: 12px; font-family: monospace; margin-bottom: 4px; }
.request-count { color: var(--text-muted); font-size: 11px; }

/* Request/response pairs */
.request-pair { border: 1px solid var(--border-light); border-radius: 4px; margin: 6px 0; overflow: hidden; }
.request-header { background: #1c3a5e; color: var(--color-info); font-size: 10px; font-weight: 700; padding: 3px 8px; letter-spacing: 0.5px; }
.response-header { background: #2a1f3e; color: var(--color-purple); font-size: 10px; font-weight: 700; padding: 3px 8px; letter-spacing: 0.5px; }
.request-method { font-size: 11px; color: var(--text-primary); padding: 4px 8px; font-family: monospace; font-weight: 600; background: var(--bg-primary); }

/* JSON blocks */
.json-block { font-size: 11px; margin: 4px 0; padding: 8px; background: var(--bg-secondary); border: 1px solid var(--border-light); border-radius: 4px; overflow-x: auto; }
.json-sm { font-size: 10px; }
pre { overflow-x: auto; margin: 0; }
code { color: var(--text-secondary); }

/* Node list */
.node-list { list-style: none; padding: 4px 0; font-size: 12px; font-family: monospace; }
.node-list li { padding: 3px 0; }
.node-mode-mocked { color: var(--color-info); font-weight: 600; }
.node-mode-pinned { color: var(--color-warn); font-weight: 600; }
.node-mode-real { color: var(--color-pass); font-weight: 600; }

/* Node output toggle */
.node-output-toggle { margin: 4px 0; }
.node-output-toggle > summary { cursor: pointer; color: var(--text-muted); font-size: 11px; }

/* Node hint */
.node-hint { margin: 2px 0; }
.node-hint > summary { cursor: pointer; color: var(--text-secondary); font-size: 11px; font-family: monospace; }
.hint-text { color: var(--text-muted); font-size: 11px; padding: 4px 0; line-height: 1.5; }
.subsection-label { color: var(--text-primary); font-size: 11px; font-weight: 600; margin-top: 8px; margin-bottom: 2px; }

/* Category badges */
.category-badge { font-size: 11px; font-weight: 600; padding: 4px 10px; border-radius: 4px; margin-bottom: 8px; }
.category-warn { background: var(--color-warn-bg); color: var(--color-warn); border-left: 3px solid var(--color-warn); }
.category-fail { background: var(--color-fail-bg); color: var(--color-fail); border-left: 3px solid var(--color-fail); }
.category-info { background: #1c3a5e33; color: var(--color-info); border-left: 3px solid var(--color-info); }

/* Utilities */
.muted { color: var(--text-muted); font-size: 12px; }
</style>
</head>
<body>

<h1>Workflow evaluation report</h1>
<p class="subtitle">Generated ${new Date().toLocaleString()} — ${String(totalScenarios)} scenarios across ${String(totalTestCases)} test cases</p>

<div class="dashboard">
	<div class="stat-card">
		<div class="label">Pass rate</div>
		<div class="value${passRate >= 80 ? ' pass' : passRate >= 50 ? ' mixed' : ' fail'}">${String(passRate)}%</div>
	</div>
	<div class="stat-card">
		<div class="label">Passed</div>
		<div class="value pass">${String(passCount)}</div>
	</div>
	<div class="stat-card">
		<div class="label">Failed</div>
		<div class="value${failCount > 0 ? ' fail' : ''}">${String(failCount)}</div>
	</div>
	<div class="stat-card">
		<div class="label">Built</div>
		<div class="value${builtCount === totalTestCases ? ' pass' : ' mixed'}">${String(builtCount)}/${String(totalTestCases)}</div>
	</div>
</div>

<div class="toolbar">
	<button onclick="document.querySelectorAll('.test-case').forEach(e => e.classList.add('expanded'))">Expand all</button>
	<button onclick="document.querySelectorAll('.test-case').forEach(e => e.classList.remove('expanded'))">Collapse all</button>
	<button onclick="document.querySelectorAll('.test-case').forEach(e => { e.style.display = e.classList.contains('pass') ? 'none' : '' }); this.classList.toggle('active')">Show failures only</button>
</div>

${results.map((r, i) => renderTestCase(r, i)).join('')}

</body>
</html>`;
}
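
// Note: the "Show failures only" toolbar button hides only fully-passing
// cards (class "pass"); "mixed" cards with partial failures stay visible.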

// ---------------------------------------------------------------------------
// Write report to disk
// ---------------------------------------------------------------------------

export function writeWorkflowReport(results: WorkflowTestCaseResult[]): string {
	const reportDir = path.join(__dirname, '..', '..', '.data');
	if (!fs.existsSync(reportDir)) {
		fs.mkdirSync(reportDir, { recursive: true });
	}
	const html = generateWorkflowReport(results);
	const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
	const reportPath = path.join(reportDir, `workflow-eval-${timestamp}.html`);
	fs.writeFileSync(reportPath, html);

	// Also write to the stable filename for quick access
	fs.writeFileSync(path.join(reportDir, 'workflow-eval-report.html'), html);

	return reportPath;
}