n8n/packages/@n8n/instance-ai/evaluations/index.ts
José Braulio González Valido bbe3e2d148
feat(ai-builder): Add per-PR eval regression detection vs LangSmith baseline (#29456)
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-06 08:15:08 +00:00

77 lines
2.3 KiB
TypeScript

// ---------------------------------------------------------------------------
// Public API for the instance-ai workflow evaluation framework
//
// This module exports the domain logic used by the CLI (evaluations/cli/)
// and available for custom orchestration (e.g. LangSmith evaluate).
// ---------------------------------------------------------------------------
// -- Client & Auth --
// Each module below is re-exported through a single statement; names carrying
// the inline `type` modifier are type-only, the remaining names are values.
export {
	N8nClient,
	type WorkflowResponse,
	type WorkflowNodeResponse,
	type ExecutionDetail,
} from './clients/n8n-client';
// -- Test case data --
export { loadWorkflowTestCasesWithFiles, type WorkflowTestCaseWithFile } from './data/workflows';
// -- Credentials --
export { seedCredentials, cleanupCredentials, type SeedResult } from './credentials/seeder';
// -- Runner --
export {
	// All-in-one
	runWorkflowTestCase,
	runWithConcurrency,
	// Split API: build once, run scenarios independently
	buildWorkflow,
	executeScenario,
	cleanupBuild,
	type BuildResult,
	type BuildWorkflowConfig,
} from './harness/runner';
// -- Workflow discovery --
export { snapshotWorkflowIds } from './outcome/workflow-discovery';
// -- Logger --
export { createLogger, type EvalLogger } from './harness/logger';
// -- Types --
// Type-only re-export: the statement-level `export type` form is kept here
// because nothing from './types' is re-exported as a runtime value.
export type {
	WorkflowTestCase,
	TestScenario,
	WorkflowTestCaseResult,
	ScenarioResult,
	ChecklistItem,
	ChecklistResult,
} from './types';
// -- Comparison (regression detection) --
// Bucket comparison entry point plus the per-verdict selectors, together with
// the result shapes they use (marked with the inline `type` modifier).
export {
	compareBuckets,
	byVerdict,
	improvements,
	hardRegressions,
	softRegressions,
	watchList,
	type ComparisonResult,
	type ScenarioComparison,
	type ScenarioCounts,
	type ExperimentBucket,
	type AggregateComparison,
	type FailureCategoryComparison,
} from './comparison/compare';
// Scenario classification and the statistics helpers, with their option and
// result types.
export {
	classifyScenario,
	fishersExactOneSidedLeft,
	wilsonInterval,
	type ScenarioVerdict,
	type ScenarioClassification,
	type ClassifyOptions,
	type TierThresholds,
} from './comparison/statistics';
// Markdown and terminal formatters for a comparison.
export { formatComparisonMarkdown, formatComparisonTerminal } from './comparison/format';
// Baseline lookup and the experiment-name prefix constant.
export {
	fetchBaselineBucket,
	findLatestBaseline,
	BASELINE_EXPERIMENT_PREFIX,
} from './comparison/fetch-baseline';