// n8n/packages/@n8n/instance-ai/evaluations/index.ts

// ---------------------------------------------------------------------------
// Public API for the instance-ai workflow evaluation framework
//
// This module re-exports the domain logic used by the CLI (evaluations/cli/)
// and makes it available for custom orchestration (e.g. LangSmith evaluate).
// ---------------------------------------------------------------------------

// -- Client & Auth --
export { N8nClient } from './clients/n8n-client';
export type { WorkflowResponse, WorkflowNodeResponse, ExecutionDetail } from './clients/n8n-client';

// -- Test case data --
export { loadWorkflowTestCasesWithFiles } from './data/workflows';
export type { WorkflowTestCaseWithFile } from './data/workflows';

// -- Credentials --
export { seedCredentials, cleanupCredentials } from './credentials/seeder';
export type { SeedResult } from './credentials/seeder';

// -- Runner (all-in-one) --
export { runWorkflowTestCase, runWithConcurrency } from './harness/runner';

// -- Runner (split API: build once, run scenarios independently) --
export { buildWorkflow, executeScenario, cleanupBuild } from './harness/runner';
export type { BuildResult, BuildWorkflowConfig } from './harness/runner';

// -- Workflow discovery --
export { snapshotWorkflowIds } from './outcome/workflow-discovery';

// -- Logger --
// The value and the type are re-exported in separate statements; the
// `export type` statement is type-only and is erased from the emitted JS.
export { createLogger } from './harness/logger';
export type { EvalLogger } from './harness/logger';

// -- Types --
export type {
	WorkflowTestCase,
	ExecutionScenario,
	WorkflowTestCaseResult,
	ExecutionScenarioResult,
	ChecklistItem,
	ChecklistResult,
} from './types';

// -- Comparison (regression detection) --
export {
	compareBuckets,
	byVerdict,
	improvements,
	hardRegressions,
	softRegressions,
	watchList,
} from './comparison/compare';
export type {
	ComparisonResult,
	ScenarioComparison,
	ScenarioCounts,
	ExperimentBucket,
	AggregateComparison,
	FailureCategoryComparison,
} from './comparison/compare';
export {
	classifyScenario,
	fishersExactOneSidedLeft,
	wilsonInterval,
} from './comparison/statistics';
export type {
	ScenarioVerdict,
	ScenarioClassification,
	ClassifyOptions,
	TierThresholds,
} from './comparison/statistics';
export { formatComparisonMarkdown, formatComparisonTerminal } from './comparison/format';
export {
	fetchBaselineBucket,
	findLatestBaseline,
	BASELINE_EXPERIMENT_PREFIX,
} from './comparison/fetch-baseline';