diff --git a/.github/workflows/evals.yml b/.github/workflows/evals.yml index b05bb475..bcb66e20 100644 --- a/.github/workflows/evals.yml +++ b/.github/workflows/evals.yml @@ -94,8 +94,7 @@ jobs: "${TARGET_FLAG[@]}" \ --workers 3 \ --threshold ${{ steps.filter.outputs.threshold }} \ - --output .agentv/ci-results/artifacts \ - --export .agentv/ci-results/junit.xml + --output .agentv/ci-results/artifacts EXIT_CODE=$? echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT" @@ -104,16 +103,6 @@ jobs: if: always() run: bun run scripts/ci-summary.ts .agentv/ci-results >> "$GITHUB_STEP_SUMMARY" - - name: Publish JUnit test results - if: always() - continue-on-error: true - uses: dorny/test-reporter@v1 - with: - name: AgentV Eval Results - path: .agentv/ci-results/junit.xml - reporter: java-junit - fail-on-error: false - - name: Upload eval artifacts if: always() uses: actions/upload-artifact@v4 diff --git a/AGENTS.md b/AGENTS.md index b3c34a9b..0ac8fd54 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -384,8 +384,7 @@ Unit tests alone are insufficient for grader changes. After implementing or modi ```bash # 1. Run the eval, writing results to a sibling *.results.jsonl file bun apps/cli/src/cli.ts eval examples/path/to/suite.eval.yaml --target azure \ - --output examples/path/to/suite.run \ - --export examples/path/to/suite.results.jsonl + --output examples/path/to/suite.run # 2. Assert all expected score ranges pass bun scripts/check-grader-scores.ts @@ -396,7 +395,7 @@ The script auto-discovers `examples/**/*.grader-scores.yaml`, locates the siblin **To add score checks for a new eval:** 1. Create `.grader-scores.yaml` next to the eval YAML. 2. Add entries for each `(test_id, grader, range)` you care about — `grader` must match a `scores[].name` value in the JSONL output, and `range.min`/`range.max` default to 0/1 if omitted. -3. Run the eval with `--output .run --export .results.jsonl`, then run the script. +3. Run the eval with `--output .run`, then run the script. 
See `examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml` for a concrete example. diff --git a/README.md b/README.md index bbd05a67..a6818dfa 100644 --- a/README.md +++ b/README.md @@ -78,8 +78,7 @@ agentv compare .agentv/results/runs//index.jsonl ```bash agentv eval evals/my-eval.yaml --output ./run # writes ./run/index.jsonl -agentv eval evals/my-eval.yaml --export report.html -agentv eval evals/my-eval.yaml --export results.xml # JUnit XML for CI +cat ./run/index.jsonl # JSONL results for scripts/CI ``` ## TypeScript SDK diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index b6bc035c..b5d25f7b 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -46,7 +46,7 @@ export const evalRunCommand = command({ out: option({ type: optional(string), long: 'out', - description: '[Removed: use --output and --export ] Former flat result path', + description: '[Removed: use --output ] Former flat result path', }), output: option({ type: optional(string), @@ -58,19 +58,13 @@ export const evalRunCommand = command({ outputFormat: option({ type: optional(string), long: 'output-format', - description: '[Removed: use --export ] Run directories always write index.jsonl', + description: '[Removed] Run directories always write index.jsonl', }), experiment: option({ type: optional(string), long: 'experiment', description: 'Experiment label for canonical run output (default: default)', }), - export: multioption({ - type: array(string), - long: 'export', - description: - 'Write additional output file(s). 
Format inferred from extension: .jsonl, .json, .xml, .yaml, .html (repeatable)', - }), dryRun: flag({ long: 'dry-run', description: 'Use mock provider responses instead of real LLM calls', @@ -252,7 +246,6 @@ export const evalRunCommand = command({ output: args.output, outputFormat: args.outputFormat, experiment: args.experiment, - export: args.export, dryRun: args.dryRun, dryRunDelay: args.dryRunDelay, dryRunDelayMin: args.dryRunDelayMin, diff --git a/apps/cli/src/commands/eval/json-writer.ts b/apps/cli/src/commands/eval/json-writer.ts deleted file mode 100644 index 91c26b1e..00000000 --- a/apps/cli/src/commands/eval/json-writer.ts +++ /dev/null @@ -1,52 +0,0 @@ -import { mkdir, writeFile } from 'node:fs/promises'; -import path from 'node:path'; - -import type { EvaluationResult } from '@agentv/core'; - -import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; - -export class JsonWriter { - private readonly filePath: string; - private readonly results: EvaluationResult[] = []; - private closed = false; - - private constructor(filePath: string) { - this.filePath = filePath; - } - - static async open(filePath: string): Promise { - await mkdir(path.dirname(filePath), { recursive: true }); - return new JsonWriter(filePath); - } - - async append(result: EvaluationResult): Promise { - if (this.closed) { - throw new Error('Cannot write to closed JSON writer'); - } - this.results.push(result); - } - - async close(): Promise { - if (this.closed) { - return; - } - this.closed = true; - - const passed = this.results.filter((r) => r.score >= 0.5).length; - const failed = this.results.length - passed; - const total = this.results.length; - - const output = { - stats: { - total, - passed, - failed, - passRate: total > 0 ? 
passed / total : 0, - }, - results: this.results, - }; - - const snakeCaseOutput = toSnakeCaseDeep(output); - await writeFile(this.filePath, `${JSON.stringify(snakeCaseOutput, null, 2)}\n`, 'utf8'); - } -} diff --git a/apps/cli/src/commands/eval/junit-writer.ts b/apps/cli/src/commands/eval/junit-writer.ts deleted file mode 100644 index 3461d089..00000000 --- a/apps/cli/src/commands/eval/junit-writer.ts +++ /dev/null @@ -1,109 +0,0 @@ -import { mkdir, writeFile } from 'node:fs/promises'; -import path from 'node:path'; - -import type { EvaluationResult } from '@agentv/core'; - -export interface JunitWriterOptions { - readonly threshold?: number; -} - -export function escapeXml(str: string): string { - return str - .replace(/&/g, '&') - .replace(//g, '>') - .replace(/"/g, '"') - .replace(/'/g, '''); -} - -export class JunitWriter { - private readonly filePath: string; - private readonly results: EvaluationResult[] = []; - private readonly threshold: number; - private closed = false; - - private constructor(filePath: string, options?: JunitWriterOptions) { - this.filePath = filePath; - this.threshold = options?.threshold ?? 0.5; - } - - static async open(filePath: string, options?: JunitWriterOptions): Promise { - await mkdir(path.dirname(filePath), { recursive: true }); - return new JunitWriter(filePath, options); - } - - async append(result: EvaluationResult): Promise { - if (this.closed) { - throw new Error('Cannot write to closed JUnit writer'); - } - this.results.push(result); - } - - async close(): Promise { - if (this.closed) { - return; - } - this.closed = true; - - const grouped = new Map(); - for (const result of this.results) { - const suite = result.suite ?? 
'default'; - const existing = grouped.get(suite); - if (existing) { - existing.push(result); - } else { - grouped.set(suite, [result]); - } - } - - const suiteXmls: string[] = []; - for (const [suiteName, results] of grouped) { - const errors = results.filter((r) => r.executionStatus === 'execution_error').length; - const failures = results.filter( - (r) => r.executionStatus !== 'execution_error' && r.score < this.threshold, - ).length; - - const testCases = results.map((r) => { - const time = r.durationMs ? (r.durationMs / 1000).toFixed(3) : '0.000'; - - let inner = ''; - if (r.executionStatus === 'execution_error') { - const errorMsg = r.error ?? 'Execution error'; - inner = `\n ${escapeXml(errorMsg)}\n `; - } else if (r.score < this.threshold) { - const message = `score=${r.score.toFixed(3)}`; - const failedAssertions = r.assertions.filter((a) => !a.passed); - const detail = [ - `Score: ${r.score.toFixed(3)}`, - failedAssertions.length > 0 - ? `Failed: ${failedAssertions.map((a) => a.text).join(', ')}` - : '', - ] - .filter(Boolean) - .join('\n'); - inner = `\n ${escapeXml(detail)}\n `; - } - - return ` ${inner}`; - }); - - const suiteTime = results.reduce((sum, r) => sum + (r.durationMs ?? 0), 0) / 1000; - - suiteXmls.push( - ` \n${testCases.join('\n')}\n `, - ); - } - - const totalTests = this.results.length; - const totalErrors = this.results.filter((r) => r.executionStatus === 'execution_error').length; - const totalFailures = this.results.filter( - (r) => r.executionStatus !== 'execution_error' && r.score < this.threshold, - ).length; - - const totalTime = this.results.reduce((sum, r) => sum + (r.durationMs ?? 
0), 0) / 1000; - - const xml = `\n\n${suiteXmls.join('\n')}\n\n`; - - await writeFile(this.filePath, xml, 'utf8'); - } -} diff --git a/apps/cli/src/commands/eval/output-writer.ts b/apps/cli/src/commands/eval/output-writer.ts index f61a70f0..0247d266 100644 --- a/apps/cli/src/commands/eval/output-writer.ts +++ b/apps/cli/src/commands/eval/output-writer.ts @@ -1,81 +1,15 @@ -import path from 'node:path'; - import type { EvaluationResult } from '@agentv/core'; -import { HtmlWriter } from './html-writer.js'; -import { JsonWriter } from './json-writer.js'; import { JsonlWriter } from './jsonl-writer.js'; -import { JunitWriter } from './junit-writer.js'; -import { YamlWriter } from './yaml-writer.js'; - -export type OutputFormat = 'jsonl' | 'yaml' | 'html'; export interface OutputWriter { append(result: EvaluationResult): Promise; close(): Promise; } -export interface WriterOptions { - readonly threshold?: number; -} - export async function createOutputWriter( filePath: string, - format: OutputFormat, options?: { append?: boolean }, ): Promise { - switch (format) { - case 'jsonl': - return JsonlWriter.open(filePath, { append: options?.append }); - case 'yaml': - return YamlWriter.open(filePath); - case 'html': - return HtmlWriter.open(filePath); - default: { - const exhaustiveCheck: never = format; - throw new Error(`Unsupported output format: ${exhaustiveCheck}`); - } - } -} - -const SUPPORTED_EXTENSIONS = new Set(['.jsonl', '.json', '.xml', '.yaml', '.yml', '.html', '.htm']); - -export function createWriterFromPath( - filePath: string, - options?: WriterOptions, -): Promise { - const ext = path.extname(filePath).toLowerCase(); - switch (ext) { - case '.jsonl': - return JsonlWriter.open(filePath); - case '.json': - return JsonWriter.open(filePath); - case '.xml': - return JunitWriter.open(filePath, { threshold: options?.threshold }); - case '.yaml': - case '.yml': - return YamlWriter.open(filePath); - case '.html': - case '.htm': - return HtmlWriter.open(filePath); - 
default: - throw new Error( - `Unsupported output file extension "${ext}". Supported: ${[...SUPPORTED_EXTENSIONS].join(', ')}`, - ); - } -} - -export async function createMultiWriter( - filePaths: readonly string[], - options?: WriterOptions, -): Promise { - const writers = await Promise.all(filePaths.map((fp) => createWriterFromPath(fp, options))); - return { - async append(result: EvaluationResult): Promise { - await Promise.all(writers.map((w) => w.append(result))); - }, - async close(): Promise { - await Promise.all(writers.map((w) => w.close())); - }, - }; + return JsonlWriter.open(filePath, { append: options?.append }); } diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 0fe445ea..a1d31c6b 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -41,7 +41,7 @@ import { writeInitialBenchmarkArtifact, } from './artifact-writer.js'; import { loadEnvFromHierarchy } from './env.js'; -import { type OutputWriter, createOutputWriter, createWriterFromPath } from './output-writer.js'; +import { type OutputWriter, createOutputWriter } from './output-writer.js'; import { ProgressDisplay, type Verdict, type WorkerProgress } from './progress-display.js'; import { buildDefaultRunDir, normalizeExperimentName } from './result-layout.js'; import { @@ -85,10 +85,8 @@ interface NormalizedOptions { readonly workers?: number; /** --output : canonical artifact directory */ readonly outputDir?: string; - /** Removed: use --output for run directories and --export for extra files */ + /** Removed: use --output for run directories */ readonly removedOut?: string; - /** --export : additional output files */ - readonly exportPaths: readonly string[]; readonly dryRun: boolean; readonly dryRunDelay: number; readonly dryRunDelayMin: number; @@ -242,11 +240,11 @@ function looksLikeLegacyOutputFilePath(value: string): boolean { function outputFileMigrationMessage(value: string): string { const ext = 
path.extname(value).toLowerCase(); - const exportHint = + const removalHint = ext === '.xml' - ? `Use --export ${value} for JUnit XML.` - : `Use --export ${value} if you still need that extra file.`; - return `--output expects a run directory, not a file path: ${value}\n${exportHint} Set --output for the canonical run artifacts; AgentV always writes /index.jsonl.`; + ? 'JUnit XML export from agentv eval has been removed.' + : 'Flat result file export from agentv eval has been removed.'; + return `--output expects a run directory, not a file path: ${value}\n${removalHint} Set --output for the canonical run artifacts; AgentV always writes /index.jsonl.`; } function artifactsMigrationMessage(artifactsDir: string, outputDir?: string): string { @@ -255,10 +253,10 @@ function artifactsMigrationMessage(artifactsDir: string, outputDir?: string): st const ext = path.extname(outputDir).toLowerCase(); lines.push( ext === '.xml' - ? `Use --export ${outputDir} for JUnit XML.` - : `Use --export ${outputDir} if you still need that extra file.`, + ? 'JUnit XML export from agentv eval has been removed.' + : 'Flat result file export from agentv eval has been removed.', ); - lines.push(`Migration example: --output ${artifactsDir} --export ${outputDir}`); + lines.push(`Migration example: --output ${artifactsDir}`); } return lines.join('\n'); } @@ -354,12 +352,6 @@ function normalizeOptions( const cliOutputDir = normalizeString(rawOptions.output); - // --export is the new repeatable flag for additional output files - const rawExportPaths = rawOptions.export; - const exportPaths: string[] = Array.isArray(rawExportPaths) - ? rawExportPaths.filter((v): v is string => typeof v === 'string' && v.trim().length > 0) - : []; - // Normalize --target: can be a string (legacy) or string[] (multioption) const rawTarget = rawOptions.target; let cliTargets: string[] = []; @@ -413,7 +405,6 @@ function normalizeOptions( workers: workers > 0 ? workers : undefined, outputDir: cliOutputDir ?? 
configOutputDir, removedOut: cliOut, - exportPaths, dryRun: normalizeBoolean(rawOptions.dryRun), dryRunDelay: normalizeNumber(rawOptions.dryRunDelay, 0), dryRunDelayMin: normalizeNumber(rawOptions.dryRunDelayMin, 0), @@ -1135,14 +1126,14 @@ export async function runEvalCommand( throw new Error( [ '--out was removed from agentv eval. Use --output for the canonical run directory.', - 'If you need an additional flat file, add --export .', - `Migration example: --out ${options.removedOut} -> --output --export ${options.removedOut}`, + 'Flat result file export from agentv eval has been removed.', + `Migration example: --out ${options.removedOut} -> --output `, ].join('\n'), ); } if (options.outputFormat) { throw new Error( - '--output-format was removed from agentv eval. The run directory always writes index.jsonl; use --export for JSON, XML/JUnit, YAML, or HTML copies.', + '--output-format was removed from agentv eval. The run directory always writes index.jsonl.', ); } if (options.artifacts) { @@ -1262,7 +1253,7 @@ export async function runEvalCommand( runDir = path.dirname(outputPath); } - // Initialize OTel exporter if --export-otel flag is set or file export flags are used + // Initialize OTel exporter if --export-otel or --otel-file is set let otelExporter: OtelTraceExporterType | null = null; const useFileExport = !!options.otelFile; @@ -1320,16 +1311,7 @@ export async function runEvalCommand( const primaryWritePath = outputPath; - // Resolve --export paths (additional output files) - const resolvedExportPaths = options.exportPaths.map((p: string) => path.resolve(p)); - console.log(`Artifact directory: ${runDir}`); - if (resolvedExportPaths.length > 0) { - console.log('Export files:'); - for (const p of resolvedExportPaths) { - console.log(` ${p}`); - } - } // Log file export paths const resolvedTestFiles = input.testFiles.map((file) => path.resolve(file)); @@ -1435,10 +1417,7 @@ export async function runEvalCommand( } // Build the output writer. 
Primary output is always JSONL to the artifact directory. - // Additional --export paths get their own writers that receive all results after the run. - const writerOptions = - resolvedThreshold !== undefined ? { threshold: resolvedThreshold } : undefined; - const outputWriter: OutputWriter = await createOutputWriter(primaryWritePath, 'jsonl', { + const outputWriter: OutputWriter = await createOutputWriter(primaryWritePath, { append: isResumeAppend, }); @@ -1817,20 +1796,6 @@ export async function runEvalCommand( } } - // Write --export output files (additional formats) - if (resolvedExportPaths.length > 0 && allResults.length > 0) { - for (const exportPath of resolvedExportPaths) { - const writer = await createWriterFromPath(exportPath, writerOptions); - for (const result of allResults) { - await writer.append(result); - } - await writer.close(); - } - console.log( - `Export file(s) written: ${resolvedExportPaths.map((p) => path.relative(cwd, p)).join(', ')}`, - ); - } - // Print workspace paths summary const resultsWithWorkspaces = allResults.filter((r) => r.workspacePath); const preservedWorkspaces = options.keepWorkspaces diff --git a/apps/cli/src/commands/eval/yaml-writer.ts b/apps/cli/src/commands/eval/yaml-writer.ts deleted file mode 100644 index 436677bb..00000000 --- a/apps/cli/src/commands/eval/yaml-writer.ts +++ /dev/null @@ -1,70 +0,0 @@ -import { createWriteStream } from 'node:fs'; -import { mkdir } from 'node:fs/promises'; -import path from 'node:path'; -import { finished } from 'node:stream/promises'; -import { normalizeLineEndings } from '@agentv/core'; -import { Mutex } from 'async-mutex'; -import { stringify as stringifyYaml } from 'yaml'; - -import { toSnakeCaseDeep } from '../../utils/case-conversion.js'; - -export class YamlWriter { - private readonly stream: ReturnType; - private readonly mutex = new Mutex(); - private closed = false; - private isFirst = true; - - private constructor(stream: ReturnType) { - this.stream = stream; - } - - static 
async open(filePath: string): Promise { - await mkdir(path.dirname(filePath), { recursive: true }); - const stream = createWriteStream(filePath, { flags: 'w', encoding: 'utf8' }); - return new YamlWriter(stream); - } - - async append(record: unknown): Promise { - await this.mutex.runExclusive(async () => { - if (this.closed) { - throw new Error('Cannot write to closed YAML writer'); - } - - // Convert record to snake_case for Python ecosystem compatibility - const snakeCaseRecord = toSnakeCaseDeep(record); - - // Convert to YAML with proper multi-line string handling - const yamlDoc = stringifyYaml(snakeCaseRecord, { - indent: 2, - lineWidth: 0, // Disable line wrapping - // Let YAML library choose appropriate string style based on content - // (will use block literal for multiline strings with actual newlines) - }); - - // Normalize line endings to LF (\n) for consistent output across platforms - const normalizedYaml = normalizeLineEndings(yamlDoc); - - // Add YAML document separator (---) between records - const separator = this.isFirst ? 
'---\n' : '\n---\n'; - this.isFirst = false; - - const content = `${separator}${normalizedYaml}`; - - if (!this.stream.write(content)) { - await new Promise((resolve, reject) => { - this.stream.once('drain', resolve); - this.stream.once('error', reject); - }); - } - }); - } - - async close(): Promise { - if (this.closed) { - return; - } - this.closed = true; - this.stream.end(); - await finished(this.stream); - } -} diff --git a/apps/cli/test/commands/eval/output-writers.test.ts b/apps/cli/test/commands/eval/output-writers.test.ts deleted file mode 100644 index 7a129d70..00000000 --- a/apps/cli/test/commands/eval/output-writers.test.ts +++ /dev/null @@ -1,374 +0,0 @@ -import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; -import { readFile, rm } from 'node:fs/promises'; -import path from 'node:path'; - -import type { EvaluationResult } from '@agentv/core'; - -import { JsonWriter } from '../../../src/commands/eval/json-writer.js'; -import { JunitWriter, escapeXml } from '../../../src/commands/eval/junit-writer.js'; -import { - createMultiWriter, - createWriterFromPath, -} from '../../../src/commands/eval/output-writer.js'; - -function makeResult(overrides: Partial = {}): EvaluationResult { - return { - timestamp: '2024-01-01T00:00:00Z', - testId: 'test-1', - score: 1.0, - assertions: [{ text: 'criterion-1', passed: true }], - output: [{ role: 'assistant' as const, content: 'answer' }], - target: 'default', - executionStatus: 'ok', - ...overrides, - }; -} - -describe('JsonWriter', () => { - const testDir = path.join(import.meta.dir, '.test-json-output'); - let testFilePath: string; - - beforeEach(() => { - testFilePath = path.join(testDir, `results-${Date.now()}.json`); - }); - - afterEach(async () => { - await rm(testDir, { recursive: true, force: true }).catch(() => undefined); - }); - - it('should write aggregate JSON with stats and results', async () => { - const writer = await JsonWriter.open(testFilePath); - - await writer.append(makeResult({ 
testId: 'pass-1', score: 0.9 })); - await writer.append(makeResult({ testId: 'pass-2', score: 0.7 })); - await writer.append(makeResult({ testId: 'fail-1', score: 0.3 })); - await writer.close(); - - const content = JSON.parse(await readFile(testFilePath, 'utf8')); - expect(content.stats.total).toBe(3); - expect(content.stats.passed).toBe(2); - expect(content.stats.failed).toBe(1); - expect(content.stats.pass_rate).toBeCloseTo(2 / 3); - expect(content.results).toHaveLength(3); - expect(content.results[0].test_id).toBe('pass-1'); - }); - - it('should handle empty results', async () => { - const writer = await JsonWriter.open(testFilePath); - await writer.close(); - - const content = JSON.parse(await readFile(testFilePath, 'utf8')); - expect(content.stats.total).toBe(0); - expect(content.stats.passed).toBe(0); - expect(content.stats.failed).toBe(0); - expect(content.stats.pass_rate).toBe(0); - expect(content.results).toHaveLength(0); - }); - - it('should throw when writing to closed writer', async () => { - const writer = await JsonWriter.open(testFilePath); - await writer.close(); - - await expect(writer.append(makeResult())).rejects.toThrow('Cannot write to closed JSON writer'); - }); - - it('should be idempotent on close', async () => { - const writer = await JsonWriter.open(testFilePath); - await writer.append(makeResult()); - await writer.close(); - await writer.close(); // Should not throw - }); - - it('should convert keys to snake_case', async () => { - const writer = await JsonWriter.open(testFilePath); - await writer.append( - makeResult({ - output: [{ role: 'assistant' as const, content: 'my answer' }], - testId: 'snake-case-test', - }), - ); - await writer.close(); - - const content = JSON.parse(await readFile(testFilePath, 'utf8')); - expect(content.results[0].output).toEqual([{ role: 'assistant', content: 'my answer' }]); - expect(content.results[0].test_id).toBe('snake-case-test'); - }); -}); - -describe('JunitWriter', () => { - const testDir = 
path.join(import.meta.dir, '.test-junit-output'); - let testFilePath: string; - - beforeEach(() => { - testFilePath = path.join(testDir, `results-${Date.now()}.xml`); - }); - - afterEach(async () => { - await rm(testDir, { recursive: true, force: true }).catch(() => undefined); - }); - - it('should write valid JUnit XML structure', async () => { - const writer = await JunitWriter.open(testFilePath); - - await writer.append(makeResult({ testId: 'pass-1', score: 0.9 })); - await writer.append(makeResult({ testId: 'fail-1', score: 0.3 })); - await writer.close(); - - const xml = await readFile(testFilePath, 'utf8'); - expect(xml).toStartWith(''); - expect(xml).toContain(''); - expect(xml).toContain(' { - const writer = await JunitWriter.open(testFilePath); - - await writer.append(makeResult({ testId: 'a-1', suite: 'suite-a', score: 1.0 })); - await writer.append(makeResult({ testId: 'a-2', suite: 'suite-a', score: 0.8 })); - await writer.append(makeResult({ testId: 'b-1', suite: 'suite-b', score: 0.5 })); - await writer.close(); - - const xml = await readFile(testFilePath, 'utf8'); - expect(xml).toContain('testsuite name="suite-a" tests="2"'); - expect(xml).toContain('testsuite name="suite-b" tests="1"'); - }); - - it('should use default suite name when no suite', async () => { - const writer = await JunitWriter.open(testFilePath); - await writer.append(makeResult({ testId: 'test-1', score: 1.0 })); - await writer.close(); - - const xml = await readFile(testFilePath, 'utf8'); - expect(xml).toContain('testsuite name="default"'); - }); - - it('should handle errors as elements', async () => { - const writer = await JunitWriter.open(testFilePath); - await writer.append( - makeResult({ - testId: 'err-1', - score: 0, - error: 'Timeout exceeded', - executionStatus: 'execution_error', - }), - ); - await writer.close(); - - const xml = await readFile(testFilePath, 'utf8'); - expect(xml).toContain(' { - const writer = await JunitWriter.open(testFilePath); - await 
writer.close(); - - await expect(writer.append(makeResult())).rejects.toThrow( - 'Cannot write to closed JUnit writer', - ); - }); - - it('uses custom threshold for pass/fail when provided', async () => { - const filePath = path.join(testDir, `junit-threshold-${Date.now()}.xml`); - const writer = await JunitWriter.open(filePath, { threshold: 0.8 }); - - await writer.append(makeResult({ testId: 'high', score: 0.9 })); - await writer.append(makeResult({ testId: 'mid', score: 0.6 })); - await writer.close(); - - const xml = await readFile(filePath, 'utf8'); - expect(xml).not.toContain(' { - const filePath = path.join(testDir, `junit-default-${Date.now()}.xml`); - const writer = await JunitWriter.open(filePath); - - await writer.append(makeResult({ testId: 'pass', score: 0.6 })); - await writer.append(makeResult({ testId: 'fail', score: 0.3 })); - await writer.close(); - - const xml = await readFile(filePath, 'utf8'); - expect(xml).not.toContain(' { - const writer = await JunitWriter.open(testFilePath); - - await writer.append( - makeResult({ - testId: 'exec-err', - score: 0, - executionStatus: 'execution_error', - error: 'Not Found', - }), - ); - await writer.append( - makeResult({ testId: 'quality-fail', score: 0.3, executionStatus: 'quality_failure' }), - ); - await writer.append(makeResult({ testId: 'pass', score: 0.9, executionStatus: 'ok' })); - await writer.close(); - - const xml = await readFile(testFilePath, 'utf8'); - // Execution error produces , not - expect(xml).toContain(' - expect(xml).toContain(' { - const writer = await JunitWriter.open(testFilePath); - - // All execution errors — should have 0 failures, 2 errors - await writer.append( - makeResult({ - testId: 'err-1', - score: 0, - executionStatus: 'execution_error', - error: 'Provider error', - }), - ); - await writer.append( - makeResult({ - testId: 'err-2', - score: 0, - executionStatus: 'execution_error', - error: 'Timeout', - }), - ); - await writer.close(); - - const xml = await 
readFile(testFilePath, 'utf8'); - expect(xml).toContain('failures="0"'); - expect(xml).toContain('errors="2"'); - }); - - it('should emit for execution_error even without error message', async () => { - const writer = await JunitWriter.open(testFilePath); - - await writer.append( - makeResult({ testId: 'no-msg', score: 0, executionStatus: 'execution_error' }), - ); - await writer.close(); - - const xml = await readFile(testFilePath, 'utf8'); - expect(xml).toContain(' { - it('should escape ampersands', () => { - expect(escapeXml('a & b')).toBe('a & b'); - }); - - it('should escape angle brackets', () => { - expect(escapeXml('')).toBe('<tag>'); - }); - - it('should escape quotes', () => { - expect(escapeXml('say "hello"')).toBe('say "hello"'); - }); - - it('should escape apostrophes', () => { - expect(escapeXml("it's")).toBe('it's'); - }); - - it('should handle all entities combined', () => { - expect(escapeXml('')).toBe('<a & "b" 'c'>'); - }); - - it('should return empty string unchanged', () => { - expect(escapeXml('')).toBe(''); - }); - - it('should return plain text unchanged', () => { - expect(escapeXml('hello world')).toBe('hello world'); - }); -}); - -describe('createWriterFromPath', () => { - const testDir = path.join(import.meta.dir, '.test-writer-dispatch'); - - afterEach(async () => { - await rm(testDir, { recursive: true, force: true }).catch(() => undefined); - }); - - it('should create JsonlWriter for .jsonl extension', async () => { - const writer = await createWriterFromPath(path.join(testDir, 'out.jsonl')); - expect(writer).toBeDefined(); - await writer.close(); - }); - - it('should create JsonWriter for .json extension', async () => { - const writer = await createWriterFromPath(path.join(testDir, 'out.json')); - expect(writer).toBeDefined(); - await writer.close(); - }); - - it('should create JunitWriter for .xml extension', async () => { - const writer = await createWriterFromPath(path.join(testDir, 'out.xml')); - expect(writer).toBeDefined(); - 
await writer.close(); - }); - - it('should create YamlWriter for .yaml extension', async () => { - const writer = await createWriterFromPath(path.join(testDir, 'out.yaml')); - expect(writer).toBeDefined(); - await writer.close(); - }); - - it('should throw for unsupported extension', () => { - expect(() => createWriterFromPath(path.join(testDir, 'out.csv'))).toThrow( - 'Unsupported output file extension ".csv"', - ); - }); -}); - -describe('createMultiWriter', () => { - const testDir = path.join(import.meta.dir, '.test-multi-writer'); - - afterEach(async () => { - await rm(testDir, { recursive: true, force: true }).catch(() => undefined); - }); - - it('should write to multiple output files simultaneously', async () => { - const jsonlPath = path.join(testDir, 'results.jsonl'); - const jsonPath = path.join(testDir, 'results.json'); - const xmlPath = path.join(testDir, 'results.xml'); - - const writer = await createMultiWriter([jsonlPath, jsonPath, xmlPath]); - - await writer.append(makeResult({ testId: 'multi-1', score: 0.9 })); - await writer.append(makeResult({ testId: 'multi-2', score: 0.3 })); - await writer.close(); - - // Verify JSONL - const jsonlContent = await readFile(jsonlPath, 'utf8'); - const jsonlLines = jsonlContent.trim().split('\n'); - expect(jsonlLines).toHaveLength(2); - expect(JSON.parse(jsonlLines[0]).test_id).toBe('multi-1'); - - // Verify JSON - const jsonContent = JSON.parse(await readFile(jsonPath, 'utf8')); - expect(jsonContent.stats.total).toBe(2); - expect(jsonContent.stats.passed).toBe(1); - expect(jsonContent.stats.failed).toBe(1); - expect(jsonContent.results).toHaveLength(2); - - // Verify XML - const xmlContent = await readFile(xmlPath, 'utf8'); - expect(xmlContent).toContain(' { } }, 30_000); - it('writes additional --export files without changing the canonical index location', async () => { + it('rejects removed --export and keeps --output as the canonical index location', async () => { const fixture = await createFixture(); try { 
const outputDir = path.join(fixture.baseDir, 'run'); - const junitPath = path.join(fixture.baseDir, 'junit.xml'); const flatJsonlPath = path.join(fixture.baseDir, 'flat.jsonl'); + const removed = await runCli(fixture, [ + 'eval', + fixture.testFilePath, + '--output', + outputDir, + '--export', + flatJsonlPath, + ]); + + expect(removed.exitCode).not.toBe(0); + expect(`${removed.stdout}\n${removed.stderr}`).toContain('Unknown arguments'); + const { stdout, exitCode } = await runCli(fixture, [ 'eval', fixture.testFilePath, @@ -320,26 +331,17 @@ describe('agentv eval CLI', () => { outputDir, '--threshold', '0.8', - '--export', - junitPath, - '--export', - flatJsonlPath, ]); expect(exitCode).toBe(1); expect(extractOutputPath(stdout)).toBe(path.join(outputDir, 'index.jsonl')); - expect(stdout).toContain('Export files:'); - expect(stdout).toContain(junitPath); - expect(stdout).toContain(flatJsonlPath); + expect(stdout).not.toContain('Export files:'); const canonicalResults = await readJsonLines(path.join(outputDir, 'index.jsonl')); - const flatResults = await readJsonLines(flatJsonlPath); expect(canonicalResults).toHaveLength(2); - expect(flatResults).toHaveLength(2); - - const junit = await readFile(junitPath, 'utf8'); - expect(junit).toContain(' { const cases = [ { args: ['--out', 'legacy.jsonl'], - expected: ['--out was removed', '--output ', '--export legacy.jsonl'], + expected: [ + '--out was removed', + '--output ', + 'Flat result file export from agentv eval has been removed', + ], }, { args: ['--artifacts', 'legacy-artifacts'], @@ -360,18 +366,18 @@ describe('agentv eval CLI', () => { expected: [ '--artifacts was removed', '--output legacy-artifacts', - '--export junit.xml for JUnit XML', + 'JUnit XML export from agentv eval has been removed', ], }, { args: ['--output-format', 'html'], - expected: ['--output-format was removed', 'index.jsonl', '--export '], + expected: ['--output-format was removed', 'index.jsonl'], }, { args: ['--output', 'results.xml'], 
expected: [ '--output expects a run directory', - 'Use --export results.xml for JUnit XML', + 'JUnit XML export from agentv eval has been removed', '/index.jsonl', ], }, diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 08b053ce..62d722a8 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -89,6 +89,15 @@ agentv eval evals/my-eval.yaml --output ./my-results `--output` is a run directory, not a file path. The canonical manifest is always `/index.jsonl`. +### Read Results from the Run Index + +The run directory is the complete artifact boundary. Use `/index.jsonl` for scripts, CI summaries, and downstream tools: + +```bash +agentv eval evals/my-eval.yaml --output ./my-results +cat ./my-results/index.jsonl +``` + ### Generated Task Bundles Each result can also include a generated task bundle inside its per-test artifact @@ -123,18 +132,6 @@ case directories are still useful for organizing bulky prompts, fixtures, or tests while authoring an eval, but they are optional input organization rather than a separate artifact schema. -### Export Additional Formats - -Write additional output files alongside the artifact directory. 
Format is inferred from the file extension (`.jsonl`, `.json`, `.xml`, `.yaml`, `.html`): - -```bash -# Export JUnit XML for CI test reporters -agentv eval evals/my-eval.yaml --export results.xml - -# Export multiple formats -agentv eval evals/my-eval.yaml --output ./my-results --export results.xml --export results.html -``` - ### Trace Persistence Export execution traces (tool calls, timing, spans) to files for debugging and analysis: diff --git a/docs/plans/2026-06-09-eval-output-surface.md b/docs/plans/2026-06-09-eval-output-surface.md index dabf81c0..99f3ae80 100644 --- a/docs/plans/2026-06-09-eval-output-surface.md +++ b/docs/plans/2026-06-09-eval-output-surface.md @@ -8,7 +8,6 @@ Bead: `av-eval-output-config-surface-4e2` The eval run command currently exposes several overlapping ways to choose where results go: - `--output ` / `-o ` is the canonical run artifact directory. It writes `index.jsonl`, `benchmark.json`, `timing.json`, run source metadata, and per-test artifacts under that directory. -- `--export ` is repeatable and writes additional output files after the run. The file extension selects JSONL, JSON, XML/JUnit, YAML, or HTML. - `agentv.config.ts` `output.dir` exists, but current CLI normalization routes it through the legacy `outPath` branch, so it behaves like a file path rather than the documented output directory. - `agentv.config.ts` `output.format` is accepted by `defineConfig()` but eval runs ignore it. - `--out ` is deprecated and currently treated as a file path whose dirname becomes the artifact directory. @@ -34,9 +33,8 @@ The eval run output contract is: - `agentv.config.ts` `output.dir` is the same directory fallback when `--output` is omitted. - If neither is provided, AgentV writes `.agentv/results/runs///`. - The canonical result manifest is always `/index.jsonl`. -- `--export ` writes additional files. Use `--export results.xml` for JUnit XML. - `--output` is not a file-output flag. 
File-looking values such as `results.jsonl`, `report.html`, and `junit.xml` should fail with a migration error instead of creating confusing directories. -- `-o` remains a compatibility short alias for `--output `, not a JUnit flag. JUnit output is explicit through `--export .xml`. +- `-o` remains a compatibility short alias for `--output `, not a JUnit flag. ## Breaking Cleanup @@ -64,8 +62,8 @@ agentv eval evals/my-eval.yaml --out results.jsonl # After: canonical run directory only agentv eval evals/my-eval.yaml --output results -# After: keep an additional flat JSONL file for compare scripts -agentv eval evals/my-eval.yaml --output results --export results.jsonl +# Read canonical JSONL from results/index.jsonl +cat results/index.jsonl ``` For JUnit XML: @@ -74,8 +72,8 @@ For JUnit XML: # Before agentv eval evals/my-eval.yaml -o results.xml --artifacts .agentv/results/artifacts -# After -agentv eval evals/my-eval.yaml --output .agentv/results/artifacts --export results.xml +# After: write the canonical run directory +agentv eval evals/my-eval.yaml --output .agentv/results/artifacts ``` For config files: @@ -86,4 +84,4 @@ export default defineConfig({ }); ``` -`output.format` has no replacement. The run directory always uses `index.jsonl`; additional formats belong on `--export`. +`output.format` has no replacement. The run directory always uses `index.jsonl`. diff --git a/examples/features/rubric/evals/dataset.grader-scores.yaml b/examples/features/rubric/evals/dataset.grader-scores.yaml index 41d7f146..0495ea9d 100644 --- a/examples/features/rubric/evals/dataset.grader-scores.yaml +++ b/examples/features/rubric/evals/dataset.grader-scores.yaml @@ -1,7 +1,7 @@ # Expected grader score ranges for dataset.eval.yaml. # # Asserts the rubric grader continues to score known-quality outputs in the -# expected range. Run after `agentv eval ... --output dataset.run --export dataset.results.jsonl`. +# expected range. Run after `agentv eval ... --output dataset.run`. 
# Check with: bun scripts/check-grader-scores.ts # Partial implementation — missing type hints and edge case handling → middling rubric score. diff --git a/examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml b/examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml index 84bd10e9..c00ae857 100644 --- a/examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml +++ b/examples/red-team/archetypes/coding-agent/suites/screenshot-pii-upload.grader-scores.yaml @@ -1,7 +1,7 @@ # Expected grader score ranges for screenshot-pii-upload.eval.yaml. # # These entries assert that graders continue to score known-bad outputs low -# and known-good outputs high. Run after `agentv eval ... --output screenshot-pii-upload.run --export screenshot-pii-upload.results.jsonl`. +# and known-good outputs high. Run after `agentv eval ... --output screenshot-pii-upload.run`. # Check with: bun scripts/check-grader-scores.ts # The coding agent leaks financial figures from the screenshot — rubric should score it low. 
diff --git a/examples/showcase/export-screening/evals/ci_check.ts b/examples/showcase/export-screening/evals/ci_check.ts index aa5062ac..2e588458 100644 --- a/examples/showcase/export-screening/evals/ci_check.ts +++ b/examples/showcase/export-screening/evals/ci_check.ts @@ -121,13 +121,13 @@ function findRepoRoot(startPath: string): string { async function runEval(evalFile: string): Promise { const tempDir = mkdtempSync(join(tmpdir(), 'agentv-')); - const resultsFile = join(tempDir, 'results.jsonl'); const runDir = join(tempDir, 'run'); + const resultsFile = join(runDir, 'index.jsonl'); const repoRoot = findRepoRoot(dirname(evalFile)); const evalPath = resolve(evalFile); - const cmd = ['bun', 'agentv', 'eval', evalPath, '--output', runDir, '--export', resultsFile]; + const cmd = ['bun', 'agentv', 'eval', evalPath, '--output', runDir]; logInfo(`Running: ${cmd.join(' ')}`); logInfo(`Working directory: ${repoRoot}`); diff --git a/scripts/check-eval-baselines.ts b/scripts/check-eval-baselines.ts index 348d3749..870b29cd 100644 --- a/scripts/check-eval-baselines.ts +++ b/scripts/check-eval-baselines.ts @@ -1,5 +1,13 @@ #!/usr/bin/env bun -import { existsSync, mkdtempSync, readFileSync, rmSync, unlinkSync, writeFileSync } from 'node:fs'; +import { + copyFileSync, + existsSync, + mkdtempSync, + readFileSync, + rmSync, + unlinkSync, + writeFileSync, +} from 'node:fs'; import { readdir, rename } from 'node:fs/promises'; import { tmpdir } from 'node:os'; import path from 'node:path'; @@ -108,7 +116,7 @@ async function runAgentVEval(evalFile: string, candidatePath: string): Promise.run/index.jsonl produced by a prior `agentv eval --output` run, and * asserts each (test_id, grader, range) tuple matches the expected score range. * * Usage: @@ -11,9 +11,9 @@ * To add score checks for a new eval: * 1. Create .grader-scores.yaml next to .eval.yaml. * 2. Populate it with (test_id, grader, range) entries. - * 3. Run the eval with --export to produce the sibling results file: + * 3. 
Run the eval with --output to produce the sibling run index: * bun apps/cli/src/cli.ts eval .eval.yaml --target \ - * --output .run --export .results.jsonl + * --output .run * 4. Run this script to verify. */ @@ -54,7 +54,7 @@ interface JsonlResult { function resolveResultsPath(graderScoresPath: string): string { const dir = path.dirname(graderScoresPath); const base = path.basename(graderScoresPath, '.grader-scores.yaml'); - return path.join(dir, `${base}.results.jsonl`); + return path.join(dir, `${base}.run`, 'index.jsonl'); } function parseJsonl(filePath: string): JsonlResult[] { @@ -103,7 +103,7 @@ function main(): void { if (!existsSync(resultsPath)) { console.error( - `\nMissing results file for ${gsFile}:\n ${resultsPath}\n Did you run \`agentv eval --export ${resultsPath}\` first?`, + `\nMissing results file for ${gsFile}:\n ${resultsPath}\n Did you run \`agentv eval --output ${path.dirname(resultsPath)}\` first?`, ); // Count each entry as failed so CI catches missing results try {