Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions packages/core/test/evaluation/orchestrator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3249,7 +3249,10 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id
],
});

// Use a slow evaluator to ensure measurable duration
// Use an async evaluator so timing spans a real awaited grader call.
// Timer implementations may resume a millisecond early under CI load, so
// the contract here is that timing is present, positive, and internally
// consistent with the recorded timestamps rather than equal to the delay.
const slowEvaluatorRegistry = {
'llm-grader': {
kind: 'llm-grader',
Expand Down Expand Up @@ -3280,8 +3283,8 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id
expect(result.scores).toHaveLength(1);
const graderScore = result.scores?.[0];

// durationMs should be present and reflect real wall-clock time
expect(graderScore?.durationMs).toBeGreaterThanOrEqual(50);
// durationMs should be present and reflect elapsed wall-clock time
expect(graderScore?.durationMs).toBeGreaterThan(0);

// startedAt and endedAt should be valid ISO 8601 UTC strings
expect(graderScore?.startedAt).toBeDefined();
Expand Down Expand Up @@ -3329,7 +3332,7 @@ fs.writeFileSync(path.join(payload.workspace_path, 'hook.txt'), payload.test_id
const graderScore = result.scores?.[0];

// Timing should still be present even on failure
expect(graderScore?.durationMs).toBeGreaterThanOrEqual(20);
expect(graderScore?.durationMs).toBeGreaterThan(0);
expect(graderScore?.startedAt).toBeDefined();
expect(graderScore?.endedAt).toBeDefined();
});
Expand Down
Loading