Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 1 addition & 12 deletions .github/workflows/evals.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ jobs:
"${TARGET_FLAG[@]}" \
--workers 3 \
--threshold ${{ steps.filter.outputs.threshold }} \
--output .agentv/ci-results/artifacts \
--export .agentv/ci-results/junit.xml
--output .agentv/ci-results/artifacts
EXIT_CODE=$?

echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"
Expand All @@ -104,16 +103,6 @@ jobs:
if: always()
run: bun run scripts/ci-summary.ts .agentv/ci-results >> "$GITHUB_STEP_SUMMARY"

- name: Publish JUnit test results
if: always()
continue-on-error: true
uses: dorny/test-reporter@v1
with:
name: AgentV Eval Results
path: .agentv/ci-results/junit.xml
reporter: java-junit
fail-on-error: false

- name: Upload eval artifacts
if: always()
uses: actions/upload-artifact@v4
Expand Down
5 changes: 2 additions & 3 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,7 @@ Unit tests alone are insufficient for grader changes. After implementing or modi
```bash
# 1. Run the eval, writing results to a sibling *.results.jsonl file
bun apps/cli/src/cli.ts eval examples/path/to/suite.eval.yaml --target azure \
--output examples/path/to/suite.run \
--export examples/path/to/suite.results.jsonl
--output examples/path/to/suite.run

# 2. Assert all expected score ranges pass
bun scripts/check-grader-scores.ts
Expand All @@ -396,7 +395,7 @@ The script auto-discovers `examples/**/*.grader-scores.yaml`, locates the siblin
**To add score checks for a new eval:**
1. Create `<eval-stem>.grader-scores.yaml` next to the eval YAML.
2. Add entries for each `(test_id, grader, range)` you care about — `grader` must match a `scores[].name` value in the JSONL output, and `range.min`/`range.max` default to 0/1 if omitted.
3. Run the eval with `--output <eval-stem>.run --export <eval-stem>.results.jsonl`, then run the script.
3. Run the eval with `--output <eval-stem>.run`, then run the script.

See `examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml` for a concrete example.

Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,7 @@ agentv compare .agentv/results/runs/<timestamp>/index.jsonl

```bash
agentv eval evals/my-eval.yaml --output ./run # writes ./run/index.jsonl
agentv eval evals/my-eval.yaml --export report.html
agentv eval evals/my-eval.yaml --export results.xml # JUnit XML for CI
cat ./run/index.jsonl # JSONL results for scripts/CI
```

## TypeScript SDK
Expand Down
11 changes: 2 additions & 9 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ export const evalRunCommand = command({
out: option({
type: optional(string),
long: 'out',
description: '[Removed: use --output <dir> and --export <file>] Former flat result path',
description: '[Removed: use --output <dir>] Former flat result path',
}),
output: option({
type: optional(string),
Expand All @@ -58,19 +58,13 @@ export const evalRunCommand = command({
outputFormat: option({
type: optional(string),
long: 'output-format',
description: '[Removed: use --export <file>] Run directories always write index.jsonl',
description: '[Removed] Run directories always write index.jsonl',
}),
experiment: option({
type: optional(string),
long: 'experiment',
description: 'Experiment label for canonical run output (default: default)',
}),
export: multioption({
type: array(string),
long: 'export',
description:
'Write additional output file(s). Format inferred from extension: .jsonl, .json, .xml, .yaml, .html (repeatable)',
}),
dryRun: flag({
long: 'dry-run',
description: 'Use mock provider responses instead of real LLM calls',
Expand Down Expand Up @@ -252,7 +246,6 @@ export const evalRunCommand = command({
output: args.output,
outputFormat: args.outputFormat,
experiment: args.experiment,
export: args.export,
dryRun: args.dryRun,
dryRunDelay: args.dryRunDelay,
dryRunDelayMin: args.dryRunDelayMin,
Expand Down
52 changes: 0 additions & 52 deletions apps/cli/src/commands/eval/json-writer.ts

This file was deleted.

109 changes: 0 additions & 109 deletions apps/cli/src/commands/eval/junit-writer.ts

This file was deleted.

68 changes: 1 addition & 67 deletions apps/cli/src/commands/eval/output-writer.ts
Original file line number Diff line number Diff line change
@@ -1,81 +1,15 @@
import path from 'node:path';

import type { EvaluationResult } from '@agentv/core';

import { HtmlWriter } from './html-writer.js';
import { JsonWriter } from './json-writer.js';
import { JsonlWriter } from './jsonl-writer.js';
import { JunitWriter } from './junit-writer.js';
import { YamlWriter } from './yaml-writer.js';

export type OutputFormat = 'jsonl' | 'yaml' | 'html';

export interface OutputWriter {
append(result: EvaluationResult): Promise<void>;
close(): Promise<void>;
}

export interface WriterOptions {
readonly threshold?: number;
}

export async function createOutputWriter(
filePath: string,
format: OutputFormat,
options?: { append?: boolean },
): Promise<OutputWriter> {
switch (format) {
case 'jsonl':
return JsonlWriter.open(filePath, { append: options?.append });
case 'yaml':
return YamlWriter.open(filePath);
case 'html':
return HtmlWriter.open(filePath);
default: {
const exhaustiveCheck: never = format;
throw new Error(`Unsupported output format: ${exhaustiveCheck}`);
}
}
}

const SUPPORTED_EXTENSIONS = new Set(['.jsonl', '.json', '.xml', '.yaml', '.yml', '.html', '.htm']);

export function createWriterFromPath(
filePath: string,
options?: WriterOptions,
): Promise<OutputWriter> {
const ext = path.extname(filePath).toLowerCase();
switch (ext) {
case '.jsonl':
return JsonlWriter.open(filePath);
case '.json':
return JsonWriter.open(filePath);
case '.xml':
return JunitWriter.open(filePath, { threshold: options?.threshold });
case '.yaml':
case '.yml':
return YamlWriter.open(filePath);
case '.html':
case '.htm':
return HtmlWriter.open(filePath);
default:
throw new Error(
`Unsupported output file extension "${ext}". Supported: ${[...SUPPORTED_EXTENSIONS].join(', ')}`,
);
}
}

export async function createMultiWriter(
filePaths: readonly string[],
options?: WriterOptions,
): Promise<OutputWriter> {
const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp, options)));
return {
async append(result: EvaluationResult): Promise<void> {
await Promise.all(writers.map((w) => w.append(result)));
},
async close(): Promise<void> {
await Promise.all(writers.map((w) => w.close()));
},
};
return JsonlWriter.open(filePath, { append: options?.append });
}
Loading
Loading