Skip to content

Evaluations

Evaluations are built from two worker primitives: SampleExecutor executes a task once per sample, while TrialExecutor executes a parameterized batch of samples — the building block for studies and optimization.

// A task maps an input to an output — synchronously or via a Promise — and receives the run's TaskContext.
type TaskFn<In, Out> = (input: In, context: TaskContext) => Promise<Out> | Out;
/**
 * Runs a task once per sample and applies the configured scorers to each result.
 */
class SampleExecutor<In = unknown, Out = unknown> {
  constructor(options: { task: TaskFn<In, Out>; scorers?: EvalScorerConfig<In, Out>[] });
  /** Execute a single sample request; cancellable via the optional AbortSignal. */
  execute(request: SampleRequest<In>, signal?: AbortSignal): Promise<Sample<In, Out>>;
  /** Execute many sample requests, optionally bounding parallelism with `concurrency`. */
  executeBatch(
    requests: SampleRequest<In>[],
    options?: { signal?: AbortSignal; concurrency?: number }
  ): Promise<Sample<In, Out>[]>;
}
/**
 * Runs a batch of samples as one trial, aggregating scorer metrics into a single
 * objective value (used for studies and optimization).
 */
class TrialExecutor<In = unknown, Out = unknown> {
  constructor(options: {
    task: TaskFn<In, Out>;
    scorers?: EvalScorerConfig<In, Out>[];
    /** Name of the scorer metric that becomes the trial's objective. */
    objectiveMetric: string;
    /** Direction of optimization for the objective; NOTE(review): default not shown here — confirm in package docs. */
    objectiveMode?: 'maximize' | 'minimize';
  });
  /** Run every sample in the batch under the trial's params; cancellable via AbortSignal. */
  execute(
    trial: TrialRequest,
    samples: { input: In; context?: Record<string, unknown> }[],
    signal?: AbortSignal
  ): Promise<TrialResult>;
}
/**
 * Top-level evaluation: configured once, then run to completion.
 * NOTE(review): EvaluationConfig and EvalResult are declared elsewhere — see package types.
 */
class Evaluation {
  constructor(config: EvaluationConfig);
  run(): Promise<EvalResult>;
}
import { SampleExecutor, evalScorer, similarity, type SampleRequest } from '@dreadnode/agents';

// Shape of one evaluation case: the question to ask and the reference answer.
type QaInput = { question: string; expected: string };
type QaOutput = string;

// Task under evaluation: produces a canned answer for the given question.
async function answerTask(input: QaInput): Promise<QaOutput> {
  return `Answer: ${input.question}`;
}

// Score how semantically close the output is to the reference answer.
const scorers = [
  evalScorer<QaInput, QaOutput>('semantic_similarity', ({ input, output }) => {
    const semantic = similarity({ reference: input.expected });
    return semantic.score(output);
  }),
];

const sampleExecutor = new SampleExecutor({ task: answerTask, scorers });

// A single sample request: one input plus bookkeeping fields.
const firstRequest: SampleRequest<QaInput> = {
  id: 'sample-1',
  input: { question: 'What is Dreadnode?', expected: 'Dreadnode is an AI agent platform.' },
  index: 0,
  iteration: 1,
  params: {},
  context: {},
};

const firstSample = await sampleExecutor.execute(firstRequest);
console.log(firstSample.metrics);
import { TrialExecutor, evalScorer, contains } from '@dreadnode/agents';

type PromptInput = string;
type PromptOutput = string;

// Synchronous task: prefixes the prompt with a fixed label.
function respond(input: PromptInput): PromptOutput {
  return `Response: ${input}`;
}

// Metric: does the output mention the word "platform"?
const trialScorers = [
  evalScorer<PromptInput, PromptOutput>('mentions_platform', ({ output }) => {
    return contains({ pattern: 'platform' }).score(output);
  }),
];

// The trial's objective is the 'mentions_platform' metric.
const trialExecutor = new TrialExecutor({
  task: respond,
  scorers: trialScorers,
  objectiveMetric: 'mentions_platform',
});

const trialRequest = { id: 'trial-1', number: 1, params: {} };
const batch = [{ input: 'Dreadnode is an AI platform.' }];

const trialResult = await trialExecutor.execute(trialRequest, batch);
console.log(trialResult.objectiveValue, trialResult.metrics);