{
  "adapters": [
    {
      "slug": "provider-api-harness",
      "title": "Provider API Harness",
      "owner": "Benchmark engineer",
      "input": "Suite task packet plus the hash of the immutable prompt packet.",
      "output": "One JSONL row per model run with model id, provider id, latency, token/cost fields, response artifact URI, and reviewer queue id.",
      "evidence": [
        "provider response id",
        "model version",
        "raw prompt packet",
        "raw answer artifact",
        "cost ledger"
      ],
      "failureMode": "A provider run is rejected when the model id, response id, cost, or prompt packet cannot be reproduced.",
      "artifactPath": "/reports/benchmark-harness-kit/provider-api-harness.md"
    },
    {
      "slug": "browser-agent-harness",
      "title": "Browser Agent Harness",
      "owner": "Agent evaluator",
      "input": "Browser task packet with start URL, target state, allowed tools, and blocked shortcuts.",
      "output": "Trace archive with DOM checkpoints, screenshot evidence, console warnings, timing, recovered errors, and final state proof.",
      "evidence": [
        "screenshot proof",
        "DOM checkpoint",
        "tool trace",
        "console warnings",
        "target-state assertion"
      ],
      "failureMode": "A browser run is rejected when the agent claims completion without state proof or screenshot/DOM evidence.",
      "artifactPath": "/reports/benchmark-harness-kit/browser-agent-harness.md"
    },
    {
      "slug": "coding-agent-harness",
      "title": "Coding Agent Harness",
      "owner": "Engineering reviewer",
      "input": "Issue-style repository task with baseline commit, acceptance criteria, test command, and forbidden unrelated changes.",
      "output": "Patch bundle with diff, terminal log, test output, browser proof when needed, and reviewer verdict.",
      "evidence": [
        "git diff",
        "test log",
        "lint/build log",
        "browser screenshot",
        "reviewer verdict"
      ],
      "failureMode": "A coding run is rejected when it changes unrelated files, skips required checks, or yields a patch that cannot be reproduced from the baseline commit.",
      "artifactPath": "/reports/benchmark-harness-kit/coding-agent-harness.md"
    },
    {
      "slug": "support-agent-harness",
      "title": "Support Agent Harness",
      "owner": "Policy reviewer",
      "input": "Support scenario packet with policy excerpt, customer message, escalation boundary, and gold resolution.",
      "output": "Conversation transcript with policy citations, escalation choice, tone review, and final customer outcome.",
      "evidence": [
        "policy citation",
        "conversation transcript",
        "handoff decision",
        "tone score",
        "gold outcome comparison"
      ],
      "failureMode": "A support run is rejected when it invents a policy, misses a required escalation, or cannot cite the policy boundary it used.",
      "artifactPath": "/reports/benchmark-harness-kit/support-agent-harness.md"
    }
  ],
  "qualityGates": [
    {
      "gate": "Immutable task packet",
      "proof": "The row references a task id, source packet, prompt packet hash, and expected-output rubric.",
      "blocks": "Prevents silently changing the task after a model has already run."
    },
    {
      "gate": "Exact system identity",
      "proof": "The row records provider, model id, model version, agent version, route, and settings.",
      "blocks": "Prevents comparing vague brand names instead of reproducible systems."
    },
    {
      "gate": "State and artifact proof",
      "proof": "The row links raw answer, tool trace, screenshot, diff, terminal log, or transcript as appropriate.",
      "blocks": "Prevents fluent completion claims from becoming benchmark evidence."
    },
    {
      "gate": "Human scoring reason",
      "proof": "The row includes reviewer, score, failure category, acceptance state, and reviewer note.",
      "blocks": "Prevents a number from appearing without an auditable reason."
    }
  ]
}
