EntityProcess · christso · Jun 10, 2026 · Jun 10, 2026
diff --git a/apps/cli/src/commands/eval/benchmark-writer.ts b/apps/cli/src/commands/eval/benchmark-writer.ts
diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
@@ -179,12 +179,6 @@ export const evalRunCommand = command({
       long: 'strict',
       description: 'Exit with error on version mismatch (instead of warning)',
     }),
-    benchmarkJson: option({
-      type: optional(string),
-      long: 'benchmark-json',
-      description:
-        '[Deprecated: benchmark.json is included in artifact dir] Write Agent Skills benchmark.json to the specified path',
-    }),
     artifacts: option({
       type: optional(string),
       long: 'artifacts',
@@ -282,7 +276,6 @@ export const evalRunCommand = command({
       resume: args.resume,
       rerunFailed: args.rerunFailed,
       strict: args.strict,
-      benchmarkJson: args.benchmarkJson,
       artifacts: args.artifacts,
       graderTarget: args.graderTarget,
       model: args.model,

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -40,7 +40,6 @@ import {
   writeArtifactsFromResults,
   writeInitialBenchmarkArtifact,
 } from './artifact-writer.js';
-import { writeBenchmarkJson } from './benchmark-writer.js';
 import { loadEnvFromHierarchy } from './env.js';
 import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js';
 import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js';
@@ -113,8 +112,6 @@ interface NormalizedOptions {
   readonly workspaceMode?: 'pooled' | 'temp' | 'static';
   readonly workspacePath?: string;
   readonly keepWorkspaces: boolean;
-  /** Deprecated: benchmark.json is always written to artifact dir */
-  readonly benchmarkJson?: string;
   /** Removed: use --output instead */
   readonly artifacts?: string;
   /** Removed: the run directory always uses index.jsonl */
@@ -461,7 +458,6 @@ function normalizeOptions(
       normalizeBoolean(rawOptions.keepWorkspaces) ||
       yamlExecution?.keep_workspaces === true ||
       config?.execution?.keepWorkspaces === true,
-    benchmarkJson: normalizeString(rawOptions.benchmarkJson),
     artifacts: normalizeString(rawOptions.artifacts),
     outputFormat: normalizeString(rawOptions.outputFormat),
     graderTarget: normalizeString(rawOptions.graderTarget),
@@ -1250,13 +1246,6 @@ export async function runEvalCommand(
     console.log(`Repository root: ${repoRoot}`);
   }
 
-  // Emit deprecation warnings for remaining legacy flags.
-  if (options.benchmarkJson) {
-    console.warn(
-      'Warning: --benchmark-json is deprecated. benchmark.json is always written to the artifact directory.',
-    );
-  }
-
   // Resolve artifact directory (runDir) and primary output path.
   // Precedence: --output > config output.dir > default
   const explicitDir = options.outputDir;
@@ -1776,13 +1765,6 @@ export async function runEvalCommand(
       console.log(formatMatrixSummary(summaryResults));
     }
 
-    // Write Agent Skills benchmark.json if requested (deprecated flag — backward compat)
-    if (options.benchmarkJson && allResults.length > 0) {
-      const benchmarkPath = path.resolve(options.benchmarkJson);
-      await writeBenchmarkJson(benchmarkPath, allResults);
-      console.log(`Benchmark written to: ${benchmarkPath}`);
-    }
-
     // Write artifacts to the run directory (always, not conditional on flags)
     if (allResults.length > 0) {
       const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : '';

diff --git a/apps/cli/test/commands/eval/benchmark-writer.test.ts b/apps/cli/test/commands/eval/benchmark-writer.test.ts
diff --git a/apps/cli/test/eval.integration.test.ts b/apps/cli/test/eval.integration.test.ts
@@ -557,4 +557,35 @@ describe('agentv eval CLI', () => {
       .toLowerCase();
     expect(transcriptHelp).not.toContain('cache');
   }, 30_000);
+
+  it('omits removed benchmark JSON export flag from help', async () => {
+    const result = await execa('bun', ['--no-env-file', CLI_ENTRY, 'eval', 'run', '--help'], {
+      cwd: projectRoot,
+      env: { ...process.env, CI: 'true' },
+      reject: false, // inspect the result instead of throwing on a non-zero exit
+    });
+    const helpText = `${result.stdout}\n${result.stderr}`; // help may land on either stream
+    expect(helpText).not.toContain('--benchmark-json'); // the removed flag must not be advertised
+    expect(helpText).toContain('--output');
+    expect(helpText).toContain('benchmark.json'); // NOTE(review): presumably from another option's help text — confirm
+  }, 30_000);
+
+  it('rejects the removed benchmark JSON export flag as an unknown argument', async () => {
+    const fixture = await createFixture();
+    try {
+      const result = await runCli(fixture, [
+        'eval',
+        fixture.testFilePath,
+        '--benchmark-json', // the removed flag under test
+        path.join(fixture.baseDir, 'benchmark.json'), // path value the removed flag used to accept
+      ]);
+
+      expect(result.exitCode).not.toBe(0); // the run must fail rather than silently ignore the flag
+      const output = `${result.stdout}\n${result.stderr}`; // the error may land on either stream
+      expect(output).toContain('Unknown arguments');
+      expect(output).toContain('--benchmark-json');
+    } finally {
+      await rm(fixture.baseDir, { recursive: true, force: true }); // always remove the fixture directory
+    }
+  }, 30_000);
 });
diff --git a/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/docs/guides/skill-improvement-workflow.mdx
@@ -255,7 +255,7 @@ If you've been using the Agent Skills skill-creator workflow, AgentV reads your
 | `evals.json` | `agentv eval evals.json` | Direct — no conversion needed |
 | `claude -p "prompt"` | `agentv eval evals.json --target claude` | Same eval, richer engine |
 | `grading.json` (read) | `<test-id>/grading.json` (write) | Same per-test schema, AgentV writes one grading file per test case |
-| `benchmark.json` (read) | `benchmark.json` (write) | Same schema, AgentV produces it |
+| `benchmark.json` (read) | `<output>/benchmark.json` (write) | AgentV writes the canonical run summary; convert it in a wrapper if another tool needs a narrower compatibility shape |
 | n/a | `index.jsonl` (write) | AgentV-specific per-test manifest for filtering, retry, and replay workflows |
 | with-skill vs without-skill | `--target baseline --target candidate` | Structured comparison |
 | Graduate to richer evals | `agentv convert evals.json` → EVAL.yaml | Adds workspace, code graders, etc. |

diff --git a/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx b/apps/web/src/content/docs/docs/integrations/agent-skills-evals.mdx
@@ -107,19 +107,23 @@ The rest of the bundle follows the same pattern:
 
 ## Benchmark output
 
-Generate an Agent Skills compatible `benchmark.json` alongside the standard result JSONL. The `benchmark.json` is automatically written to the artifact directory:
+AgentV writes the run's `benchmark.json` alongside the standard result JSONL. No extra flag is needed; the file is written automatically to the artifact directory:
 
 ```bash
 agentv eval evals.json --target claude --output ./results
 # benchmark.json is written to ./results/benchmark.json
 ```
 
-The benchmark uses AgentV's pass threshold (score >= 0.8) to map continuous scores to the binary pass/fail that Agent Skills `pass_rate` expects:
+The benchmark applies AgentV's pass threshold (score >= 0.8) to compute each target's `pass_rate`, and also reports per-target timing and token summaries:
 
 ```json
 {
+  "metadata": {
+    "targets": ["claude"],
+    "tests_run": ["example-test"]
+  },
   "run_summary": {
-    "with_skill": {
+    "claude": {
       "pass_rate": {"mean": 0.83, "stddev": 0.06},
       "time_seconds": {"mean": 45.0, "stddev": 12.0},
       "tokens": {"mean": 3800, "stddev": 400}
@@ -128,6 +132,8 @@ The benchmark uses AgentV's pass threshold (score >= 0.8) to map continuous scor
 }
 ```
 
+If another tool needs a different benchmark shape, treat `<output>/benchmark.json` (written via `--output`) as the source of truth and convert it in a wrapper script.
+
 ## Converting to EVAL.yaml
 
 When you're ready to graduate, convert your evals.json to EVAL.yaml:

diff --git a/docs/plans/2026-06-09-eval-output-surface.md b/docs/plans/2026-06-09-eval-output-surface.md
@@ -14,7 +14,7 @@ The eval run command currently exposes several overlapping ways to choose where
 - `--out <path>` is deprecated and currently treated as a file path whose dirname becomes the artifact directory.
 - `--artifacts <dir>` is deprecated and currently aliases the artifact directory.
 - `--output-format` is deprecated and ignored because run directories always use `index.jsonl`.
-- `--benchmark-json` is deprecated, still writes an extra Agent Skills compatibility file, and is outside this cleanup's requested removal set.
+- `--benchmark-json` was deprecated and wrote an extra Agent Skills compatibility file to a caller-chosen path. It was outside this cleanup's requested removal set; a follow-up cleanup removes the flag and keeps the run directory's `benchmark.json` as the canonical output.
 - Dashboard launch paths already pass `--output <dir>` and expect `<dir>/index.jsonl`.
 - Repository docs/examples still contain old `agentv eval --out <file>` guidance in compare workflows, grader-score helper comments, and local scripts.
 
@@ -51,7 +51,7 @@ Removed now:
 
 Warned/scheduled:
 
-- `--benchmark-json` remains deprecated for now because the Bead did not list it as a known surface and it writes a specialized compatibility artifact. Follow-up cleanup should remove it after a separate audit.
+- `--benchmark-json` was initially only deprecated by this cleanup; the follow-up cleanup has since removed it after auditing for consumers. Use `--output <dir>` and read `<dir>/benchmark.json` instead of requesting a second benchmark file.
 
 ## Migration