Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 0 additions & 85 deletions apps/cli/src/commands/eval/benchmark-writer.ts

This file was deleted.

7 changes: 0 additions & 7 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -179,12 +179,6 @@ export const evalRunCommand = command({
long: 'strict',
description: 'Exit with error on version mismatch (instead of warning)',
}),
benchmarkJson: option({
type: optional(string),
long: 'benchmark-json',
description:
'[Deprecated: benchmark.json is included in artifact dir] Write Agent Skills benchmark.json to the specified path',
}),
artifacts: option({
type: optional(string),
long: 'artifacts',
Expand Down Expand Up @@ -282,7 +276,6 @@ export const evalRunCommand = command({
resume: args.resume,
rerunFailed: args.rerunFailed,
strict: args.strict,
benchmarkJson: args.benchmarkJson,
artifacts: args.artifacts,
graderTarget: args.graderTarget,
model: args.model,
Expand Down
18 changes: 0 additions & 18 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ import {
writeArtifactsFromResults,
writeInitialBenchmarkArtifact,
} from './artifact-writer.js';
import { writeBenchmarkJson } from './benchmark-writer.js';
import { loadEnvFromHierarchy } from './env.js';
import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
Expand Down Expand Up @@ -113,8 +112,6 @@ interface NormalizedOptions {
readonly workspaceMode?: 'pooled' | 'temp' | 'static';
readonly workspacePath?: string;
readonly keepWorkspaces: boolean;
/** Deprecated: benchmark.json is always written to artifact dir */
readonly benchmarkJson?: string;
/** Removed: use --output instead */
readonly artifacts?: string;
/** Removed: the run directory always uses index.jsonl */
Expand Down Expand Up @@ -461,7 +458,6 @@ function normalizeOptions(
normalizeBoolean(rawOptions.keepWorkspaces) ||
yamlExecution?.keep_workspaces === true ||
config?.execution?.keepWorkspaces === true,
benchmarkJson: normalizeString(rawOptions.benchmarkJson),
artifacts: normalizeString(rawOptions.artifacts),
outputFormat: normalizeString(rawOptions.outputFormat),
graderTarget: normalizeString(rawOptions.graderTarget),
Expand Down Expand Up @@ -1250,13 +1246,6 @@ export async function runEvalCommand(
console.log(`Repository root: ${repoRoot}`);
}

// Emit deprecation warnings for remaining legacy flags.
if (options.benchmarkJson) {
console.warn(
'Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory.',
);
}

// Resolve artifact directory (runDir) and primary output path.
// Precedence: --output > config output.dir > default
const explicitDir = options.outputDir;
Expand Down Expand Up @@ -1776,13 +1765,6 @@ export async function runEvalCommand(
console.log(formatMatrixSummary(summaryResults));
}

// Write Agent Skills benchmark.json if requested (deprecated flag — backward compat)
if (options.benchmarkJson && allResults.length > 0) {
const benchmarkPath = path.resolve(options.benchmarkJson);
await writeBenchmarkJson(benchmarkPath, allResults);
console.log(`Benchmark written to: ${benchmarkPath}`);
}

// Write artifacts to the run directory (always, not conditional on flags)
if (allResults.length > 0) {
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';
Expand Down
74 changes: 0 additions & 74 deletions apps/cli/test/commands/eval/benchmark-writer.test.ts

This file was deleted.

31 changes: 31 additions & 0 deletions apps/cli/test/eval.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -557,4 +557,35 @@ describe('agentv eval CLI', () => {
.toLowerCase();
expect(transcriptHelp).not.toContain('cache');
}, 30_000);

// The `eval run --help` text must not list the removed `--benchmark-json`
// flag, but must still mention `--output` and `benchmark.json`.
it('omits removed benchmark JSON export flag from help', async () => {
  // `reject: false` keeps execa from throwing on a non-zero exit so the
  // assertions below always see whatever the CLI printed.
  const { stdout, stderr } = await execa(
    'bun',
    ['--no-env-file', CLI_ENTRY, 'eval', 'run', '--help'],
    {
      cwd: projectRoot,
      env: { ...process.env, CI: 'true' },
      reject: false,
    },
  );

  // Check both streams together; the test does not care which one carries the help.
  const combinedHelp = `${stdout}\n${stderr}`;
  expect(combinedHelp).not.toContain('--benchmark-json');
  for (const expectedText of ['--output', 'benchmark.json']) {
    expect(combinedHelp).toContain(expectedText);
  }
}, 30_000);

// Passing the removed `--benchmark-json` flag must fail the run: the CLI is
// expected to exit non-zero and name the flag under "Unknown arguments".
it('rejects the removed benchmark JSON export flag as an unknown argument', async () => {
  const fixture = await createFixture();
  try {
    // Path the removed flag would have pointed at, inside the fixture directory.
    const benchmarkPath = path.join(fixture.baseDir, 'benchmark.json');
    const cliResult = await runCli(fixture, [
      'eval',
      fixture.testFilePath,
      '--benchmark-json',
      benchmarkPath,
    ]);

    expect(cliResult.exitCode).not.toBe(0);

    // The error may land on either stream, so inspect both together.
    const combinedOutput = `${cliResult.stdout}\n${cliResult.stderr}`;
    for (const expectedText of ['Unknown arguments', '--benchmark-json']) {
      expect(combinedOutput).toContain(expectedText);
    }
  } finally {
    // Always remove the fixture directory, even when an assertion fails.
    await rm(fixture.baseDir, { recursive: true, force: true });
  }
}, 30_000);
});
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ If you've been using the Agent Skills skill-creator workflow, AgentV reads your
| `evals.json` | `agentv eval evals.json` | Direct — no conversion needed |
| `claude -p "prompt"` | `agentv eval evals.json --target claude` | Same eval, richer engine |
| `grading.json` (read) | `<test-id>/grading.json` (write) | Same per-test schema, AgentV writes one grading file per test case |
| `benchmark.json` (read) | `benchmark.json` (write) | Same schema, AgentV produces it |
| `benchmark.json` (read) | `<output>/benchmark.json` (write) | AgentV writes the canonical run summary; convert it in a wrapper if another tool needs a narrower compatibility shape |
| n/a | `index.jsonl` (write) | AgentV-specific per-test manifest for filtering, retry, and replay workflows |
| with-skill vs without-skill | `--target baseline --target candidate` | Structured comparison |
| Graduate to richer evals | `agentv convert evals.json` → EVAL.yaml | Adds workspace, code graders, etc. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,19 +107,23 @@ The rest of the bundle follows the same pattern:

## Benchmark output

Generate an Agent Skills compatible `benchmark.json` alongside the standard result JSONL. The `benchmark.json` is automatically written to the artifact directory:
AgentV writes the run's `benchmark.json` alongside the standard result JSONL. The file is written automatically to the artifact directory:

```bash
agentv eval evals.json --target claude --output ./results
# benchmark.json is written to ./results/benchmark.json
```

The benchmark uses AgentV's pass threshold (score >= 0.8) to map continuous scores to the binary pass/fail that Agent Skills `pass_rate` expects:
The benchmark applies AgentV's pass threshold (score >= 0.8) to compute each target's `pass_rate`, and also reports timing and token summaries:

```json
{
"metadata": {
"targets": ["claude"],
"tests_run": ["example-test"]
},
"run_summary": {
"with_skill": {
"claude": {
"pass_rate": {"mean": 0.83, "stddev": 0.06},
"time_seconds": {"mean": 45.0, "stddev": 12.0},
"tokens": {"mean": 3800, "stddev": 400}
Expand All @@ -128,6 +132,8 @@ The benchmark uses AgentV's pass threshold (score >= 0.8) to map continuous scor
}
```

If another tool needs a different benchmark shape, treat the `--output` directory as the source of truth and convert `<output>/benchmark.json` in a wrapper.

## Converting to EVAL.yaml

When you're ready to graduate, convert your evals.json to EVAL.yaml:
Expand Down
4 changes: 2 additions & 2 deletions docs/plans/2026-06-09-eval-output-surface.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ The eval run command currently exposes several overlapping ways to choose where
- `--out <path>` is deprecated and currently treated as a file path whose dirname becomes the artifact directory.
- `--artifacts <dir>` is deprecated and currently aliases the artifact directory.
- `--output-format` is deprecated and ignored because run directories always use `index.jsonl`.
- `--benchmark-json` is deprecated, still writes an extra Agent Skills compatibility file, and is outside this cleanup's requested removal set.
- `--benchmark-json` was deprecated and wrote an extra Agent Skills compatibility file; it was outside this cleanup's requested removal set. A follow-up cleanup removes the flag and keeps the run directory's `benchmark.json` as the canonical output.
- Dashboard launch paths already pass `--output <dir>` and expect `<dir>/index.jsonl`.
- Repository docs/examples still contain old `agentv eval --out <file>` guidance in compare workflows, grader-score helper comments, and local scripts.

Expand Down Expand Up @@ -51,7 +51,7 @@ Removed now:

Warned/scheduled:

- `--benchmark-json` remains deprecated for now because the Bead did not list it as a known surface and it writes a specialized compatibility artifact. Follow-up cleanup should remove it after a separate audit.
- `--benchmark-json` is removed by the follow-up cleanup after auditing for consumers; use `--output <dir>` and read `<dir>/benchmark.json` instead of requesting a second benchmark file.

## Migration

Expand Down
Loading