n8n/packages/@n8n/instance-ai/evaluations/binaryChecks/index.ts
José Braulio González Valido 700b32237f
feat(ai-builder): Surface WHAT-dimension binary checks per built workflow (no-changelog) (#30932)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 12:18:52 +01:00

121 lines
3.8 KiB
TypeScript

// ---------------------------------------------------------------------------
// Binary checks evaluator for instance-ai
// ---------------------------------------------------------------------------
import type {
BinaryCheck,
BinaryCheckContext,
BinaryCheckResult,
CheckOutcome,
CheckStatus,
} from './types';
import type { WorkflowResponse } from '../clients/n8n-client';
import type { Feedback } from '../subagent/types';
import { DETERMINISTIC_CHECKS, LLM_CHECKS } from './checks/index';
/** Evaluator name stamped on every feedback entry produced by `runBinaryChecks`. */
const EVALUATOR_NAME = 'binary-checks';
/** Optional settings accepted by `runBinaryChecks`. */
export interface BinaryChecksOptions {
/**
 * Run only the checks whose names appear in this list. All eligible checks
 * run if this is omitted or empty.
 *
 * A name that matches no registered check makes the run throw. The name of an
 * LLM check is accepted but silently skipped when `ctx.modelId` is not set.
 */
only?: string[];
}
/** Result returned by `runBinaryChecks`. */
export interface BinaryChecksRun {
/**
 * One `metric` entry per check that was not N/A (score 1 for pass, 0 for fail),
 * followed by a single `pass_rate` entry of kind `score`.
 */
feedback: Feedback[];
/** One outcome per check that was run, N/A ones included, in selection order. */
outcomes: CheckOutcome[];
}
/**
 * Runs the selected binary checks against a workflow and summarises the results.
 *
 * Every selected check is started up front and awaited with `Promise.allSettled`,
 * so a check that throws or rejects does not abort the others; it is recorded as
 * a `fail` outcome whose comment is `Error: <message>`.
 *
 * LLM checks are filtered out entirely when `ctx.modelId` is not set (they
 * don't appear in outcomes).
 *
 * @param workflow - Workflow handed to each check's `run` method.
 * @param ctx - Context handed to each check; `ctx.modelId` decides whether LLM checks are eligible.
 * @param options - Optional name filter, see `BinaryChecksOptions.only`.
 * @returns `outcomes` holds one entry per check that ran. `feedback` holds one
 *   `metric` entry per non-N/A outcome plus a trailing `pass_rate` entry whose
 *   score is passed / scored, or 0 when no check was scored.
 * @throws Error if `options.only` contains a name that is not a registered check.
 */
export async function runBinaryChecks(
	workflow: WorkflowResponse,
	ctx: BinaryCheckContext,
	options?: BinaryChecksOptions,
): Promise<BinaryChecksRun> {
	const checks = resolveChecks(options?.only, ctx);
	const settledRuns = await Promise.allSettled(
		checks.map(async (check) => await check.run(workflow, ctx)),
	);

	// Pair each settled result with its check by position.
	const outcomes: CheckOutcome[] = [];
	settledRuns.forEach((run, index) => {
		const check = checks[index];

		if (run.status === 'rejected') {
			const reason: unknown = run.reason;
			const message = reason instanceof Error ? reason.message : String(reason);
			outcomes.push({
				name: check.name,
				description: check.description,
				kind: check.kind,
				dimension: check.dimension,
				status: 'fail',
				comment: `Error: ${message}`,
			});
			return;
		}

		outcomes.push(toOutcome(check, run.value));
	});

	// N/A outcomes are reported in `outcomes` but never scored.
	const scored = outcomes.filter(({ status }) => status !== 'n_a');

	const feedback: Feedback[] = scored.map((outcome) => {
		const entry = {
			evaluator: EVALUATOR_NAME,
			metric: outcome.name,
			score: outcome.status === 'pass' ? 1 : 0,
			kind: 'metric' as const,
		};
		return outcome.comment ? { ...entry, comment: outcome.comment } : entry;
	});

	const scoredTotal = scored.length;
	const passed = scored.filter(({ status }) => status === 'pass').length;
	const skipped = outcomes.length - scoredTotal;
	const skippedSuffix = skipped > 0 ? ` (${String(skipped)} N/A)` : '';

	feedback.push({
		evaluator: EVALUATOR_NAME,
		metric: 'pass_rate',
		score: scoredTotal > 0 ? passed / scoredTotal : 0,
		kind: 'score',
		comment: `${String(passed)}/${String(scoredTotal)} checks passed${skippedSuffix}`,
	});

	return { feedback, outcomes };
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Combines a check's static metadata with the result of running it.
 *
 * Status is `n_a` when the result is explicitly marked `applicable: false`,
 * otherwise `pass` or `fail` according to `result.pass`. The `comment` key is
 * only present on the outcome when the result carries a non-empty comment.
 *
 * @param check - Check whose `name`, `description`, `kind` and `dimension` are copied over.
 * @param result - Result produced by running that check.
 * @returns The outcome record for this check.
 */
function toOutcome(check: BinaryCheck, result: BinaryCheckResult): CheckOutcome {
	let status: CheckStatus;
	if (result.applicable === false) {
		status = 'n_a';
	} else if (result.pass) {
		status = 'pass';
	} else {
		status = 'fail';
	}

	const { name, description, kind, dimension } = check;
	const outcome = { name, description, kind, dimension, status };

	return result.comment ? { ...outcome, comment: result.comment } : outcome;
}
/**
 * Works out which checks should run for this invocation.
 *
 * Deterministic checks are always eligible; LLM checks are eligible only when
 * `ctx.modelId` is set. With no `only` filter (omitted or empty) every eligible
 * check is returned. Otherwise the eligible checks named in `only` are
 * returned, in registration order.
 *
 * @param only - Optional list of check names to restrict the run to.
 * @param ctx - Check context; only `modelId` is read here.
 * @returns The checks to execute.
 * @throws Error if `only` contains a name that matches no registered check.
 */
function resolveChecks(only: string[] | undefined, ctx: BinaryCheckContext): BinaryCheck[] {
	const registered = [...DETERMINISTIC_CHECKS, ...LLM_CHECKS];
	const runnable = ctx.modelId ? registered : DETERMINISTIC_CHECKS;

	if (!only?.length) {
		return runnable;
	}

	// Names are validated against every registered check rather than only the
	// runnable ones, so asking for an LLM check without a modelId skips it
	// instead of raising an error.
	const registeredNames = new Set(registered.map((check) => check.name));
	const unrecognized: string[] = [];
	for (const requestedName of only) {
		if (!registeredNames.has(requestedName)) {
			unrecognized.push(requestedName);
		}
	}

	if (unrecognized.length > 0) {
		const available = [...registeredNames].join(', ');
		throw new Error(`Unknown binary check(s): ${unrecognized.join(', ')}. Available: ${available}`);
	}

	const requested = new Set(only);
	return runnable.filter((check) => requested.has(check.name));
}