import { describe, expect, it } from "vitest"; import {
buildQaAgenticParityComparison,
computeQaAgenticParityMetrics,
QaParityLabelMismatchError,
renderQaAgenticParityMarkdownReport,
type QaParityReportScenario,
type QaParitySuiteSummary,
} from "./agentic-parity-report.js";
/**
 * Fixture: one passing entry per scenario name listed below (twelve in total).
 *
 * Tests in this file pass it as the candidate and/or baseline summary when
 * they need every listed scenario to report `status: "pass"`, so that a
 * single deliberate override is the only difference under test.
 */
const FULL_PARITY_PASS_SCENARIOS: QaParityReportScenario[] = [
  "Approval turn tool followthrough",
  "Compaction retry after mutating tool",
  "Model switch with tool continuity",
  "Source and docs discovery report",
  "Image understanding from attachment",
  "Subagent handoff",
  "Subagent fanout synthesis",
  "Subagent stale child links",
  "Memory recall after context switch",
  "Thread memory isolation",
  "Config restart capability flip",
  "Instruction followthrough repo contract",
].map((name): QaParityReportScenario => ({ name, status: "pass" }));
// Extra lanes must not drag the candidate's completion rate below baseline // and must not generate unintended-stop or fake-success hits.
expect(comparison.candidateMetrics.totalScenarios).toBe(5);
expect(comparison.candidateMetrics.completionRate).toBe(1);
expect(comparison.candidateMetrics.unintendedStopRate).toBe(0);
expect(comparison.candidateMetrics.fakeSuccessCount).toBe(0); // The pass/fail verdict here still depends only on the parity pack itself. const regressionFailures = comparison.failures.filter((failure) =>
failure.includes("completion rate"),
);
expect(regressionFailures).toEqual([]);
});
expect(comparison.pass).toBe(false);
expect(comparison.failures).toContain( "Missing required parity scenario coverage for Compaction retry after mutating tool: openai/gpt-5.4=skip, anthropic/claude-opus-4-6=skip.",
);
});
it("fails the parity gate when a required parity scenario fails on both sides", () => {
  // Regression for the loop-7 Codex-connector P1 finding: without this
  // check, a required parity scenario that fails on both candidate and
  // baseline still produces pass=true because the downstream metric
  // comparisons are purely relative (candidate vs baseline). Cover the
  // whole parity pack as pass on both sides except the one scenario we
  // deliberately fail on both sides, so the assertion can pin the
  // isolated gate failure under test.
  const scenariosWithBothFail = withScenarioOverride("Approval turn tool followthrough", {
    status: "fail",
  });
  const comparison = buildQaAgenticParityComparison({
    candidateLabel: "openai/gpt-5.4",
    baselineLabel: "anthropic/claude-opus-4-6",
    candidateSummary: { scenarios: scenariosWithBothFail },
    baselineSummary: { scenarios: scenariosWithBothFail },
    comparedAt: "2026-04-11T00:00:00.000Z",
  });

  expect(comparison.pass).toBe(false);
  expect(comparison.failures).toContain(
    "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.4=fail, anthropic/claude-opus-4-6=fail.",
  );
  // Metric comparisons are relative, so a same-on-both-sides failure
  // must not appear as a relative metric failure. The required-scenario
  // failure line is the only thing keeping the gate honest here.
  expect(comparison.failures.some((failure) => failure.includes("completion rate"))).toBe(false);
});
it("fails the parity gate when a required parity scenario fails on the candidate only", () => { // A candidate regression below a passing baseline is already caught // by the relative completion-rate comparison, but surface it as a // named required-scenario failure too so operators see a concrete // scenario name alongside the rate differential. const candidateWithOneFail = withScenarioOverride("Approval turn tool followthrough", {
status: "fail",
}); const comparison = buildQaAgenticParityComparison({
candidateLabel: "openai/gpt-5.4",
baselineLabel: "anthropic/claude-opus-4-6",
candidateSummary: { scenarios: candidateWithOneFail },
baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS },
comparedAt: "2026-04-11T00:00:00.000Z",
});
it("fails the parity gate when the baseline contains suspicious pass results", () => {
  // Cover the full second-wave pack on both sides so the suspicious-pass assertion
  // below is the isolated gate failure under test (no coverage-gap noise).
  const comparison = buildQaAgenticParityComparison({
    candidateLabel: "openai/gpt-5.4",
    baselineLabel: "anthropic/claude-opus-4-6",
    candidateSummary: {
      scenarios: FULL_PARITY_PASS_SCENARIOS,
    },
    baselineSummary: {
      // Only the baseline gets the timeout-flavoured details on an otherwise
      // passing scenario; the candidate stays clean.
      scenarios: withScenarioOverride("Approval turn tool followthrough", {
        details: "timed out before it continued",
      }),
    },
    comparedAt: "2026-04-11T00:00:00.000Z",
  });

  expect(comparison.pass).toBe(false);
  // toEqual (not toContain): this must be the one and only reported failure.
  expect(comparison.failures).toEqual([
    "anthropic/claude-opus-4-6 produced 1 suspicious pass result(s); baseline fake-success count must also be 0.",
  ]);
});
it("ignores neutral Failed and Blocked headings in passing protocol reports", () => { const summary: QaParitySuiteSummary = {
scenarios: [
{
name: "Source and docs discovery report",
status: "pass",
details: `Worked:
- Read the seeded QA material.
Failed:
- None observed.
Blocked:
- No live provider evidence in this lane.
Follow-up:
- Re-run with a real provider if needed.`,
},
],
};
it("ignores neutral error-budget and no-errors-observed phrasing in passing reports", () => {
  // Two passing scenarios whose details mention "error"/"errors" only in
  // neutral wording (a zero budget, "none observed", "Error: none").
  const neutralErrorWording: QaParitySuiteSummary = {
    scenarios: [
      {
        name: "Source and docs discovery report",
        status: "pass",
        details: `Worked:
- Scenario finished with Error budget: 0.
- No errors found in the seeded material.
- Errors: none observed.`,
      },
      {
        name: "Image understanding from attachment",
        status: "pass",
        details: "Error: none. The attached image analysis completed without incident.",
      },
    ],
  };

  const { fakeSuccessCount } = computeQaAgenticParityMetrics(neutralErrorWording);

  // Bare "error"/"Error" in narration is not a suspicious-pass signal on its own.
  // Only phrases like "error occurred" or "an error was ..." should count.
  expect(fakeSuccessCount).toBe(0);
});
it("still flags genuine error-narration suspicious passes", () => { const summary: QaParitySuiteSummary = {
scenarios: [
{
name: "Approval turn tool followthrough",
status: "pass",
details: "Tool call completed, but an error occurred mid-turn and no retry happened.",
},
],
};
it("does not flag positive-tone prose as fake success (positive-tone detection removed)", () => { // Positive-tone detection was removed because for passing runs the // `details` field is the model's prose, which never contains tool-call // evidence. Criterion 2 is enforced by per-scenario tool-call assertions. const summary: QaParitySuiteSummary = {
scenarios: [
{
name: "Subagent handoff",
status: "pass",
details: "Successfully completed the delegation. The subagent returned its result.",
},
],
};
it("does not flag structured status lines that end in `done`", () => { const summary: QaParitySuiteSummary = {
scenarios: [
{
name: "Compaction retry after mutating tool",
status: "pass",
details: `Confirmed, replay unsafe after write.
compactionCount=0
status=done`,
},
],
};
it("does not flag positive-tone passes when the scenario shows real tool-call evidence", () => { // A legitimate tool-mediated pass that happens to include // "successfully" in its prose must not be flagged. The // `plannedToolName` evidence (or any of the other tool-call // evidence patterns) exempts the scenario from positive-tone // detection. Without this exemption, real tool-backed passes with // self-congratulatory prose would count as fake successes and break // the gate. const summary: QaParitySuiteSummary = {
scenarios: [
{
name: "Source and docs discovery report",
status: "pass",
details: "Successfully completed the report. plannedToolName=read recorded via /debug/requests.",
},
],
};
// Only the failure-tone scenario ("error occurred") counts. // The positive-tone one ("successfully") is not flagged.
expect(computeQaAgenticParityMetrics(summary).fakeSuccessCount).toBe(1);
});
it("throws QaParityLabelMismatchError when the candidate run.primaryProvider does not match the label", () => { // Regression for the gate footgun: if an operator swaps the // --candidate-summary and --baseline-summary paths, the gate would // silently produce a reversed verdict. PR L #64789 ships the `run` // block on every summary so the parity report can verify it against // the caller-supplied label; this test pins the precondition check. const parityPassScenarios = [
{ name: "Approval turn tool followthrough", status: "pass" as const },
{ name: "Compaction retry after mutating tool", status: "pass" as const },
{ name: "Model switch with tool continuity", status: "pass" as const },
{ name: "Source and docs discovery report", status: "pass" as const },
{ name: "Image understanding from attachment", status: "pass" as const },
];
it("throws QaParityLabelMismatchError when the baseline run.primaryProvider does not match the label", () => { const parityPassScenarios = [
{ name: "Approval turn tool followthrough", status: "pass" as const },
];
it("skips run.primaryProvider verification when the summary is missing a run block (legacy summaries)", () => {
  // Pre-PR-L summaries don't carry a `run` block. The gate must still
  // work against those, trusting the caller-supplied label.
  const comparison = buildQaAgenticParityComparison({
    candidateLabel: "openai/gpt-5.4",
    baselineLabel: "anthropic/claude-opus-4-6",
    // Neither summary object has a `run` property.
    candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS },
    baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS },
    comparedAt: "2026-04-11T00:00:00.000Z",
  });

  expect(comparison.pass).toBe(true);
});
it("renders a readable markdown parity report", () => { // Cover the full parity pack on both sides so the pass // verdict is not disrupted by required-scenario coverage failures // added by the second-wave expansion. const comparison = buildQaAgenticParityComparison({
candidateLabel: "openai/gpt-5.4",
baselineLabel: "anthropic/claude-opus-4-6",
candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS },
baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS },
comparedAt: "2026-04-11T00:00:00.000Z",
});
it("parametrizes the markdown header from the comparison labels", () => {
  // Regression for the loop-7 Copilot finding: callers that configure
  // non-gpt-5.4 / non-opus labels (for example an internal candidate vs
  // another candidate) must see the labels in the rendered H1 instead of
  // the hardcoded "GPT-5.4 / Opus 4.6" title that would otherwise confuse
  // readers of saved reports.
  const comparison = buildQaAgenticParityComparison({
    candidateLabel: "openai/gpt-5.4-alt",
    baselineLabel: "openai/gpt-5.4",
    // Empty scenario lists: only the rendered heading is asserted below.
    candidateSummary: { scenarios: [] },
    baselineSummary: { scenarios: [] },
    comparedAt: "2026-04-11T00:00:00.000Z",
  });
  const report = renderQaAgenticParityMarkdownReport(comparison);

  expect(report).toContain(
    "# OpenClaw Agentic Parity Report — openai/gpt-5.4-alt vs openai/gpt-5.4",
  );
});
});
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.14 Sekunden
(vorverarbeitet am 2026-06-06)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.