{
  "version": "0.1.0",
  "generatedAt": "2026-05-16T04:18:23.663Z",
  "purpose": "Trace-derived Agentic Reliability Index for comparing coding, browser, and support agents by completion, state proof, recovery, tool/policy correctness, and cost-latency discipline.",
  "suiteSlugs": [
    "coding-agent-maintenance-suite",
    "browser-operations-suite",
    "support-agent-policy-suite",
    "ai-security-risk-suite"
  ],
  "suiteCount": 4,
  "traceCount": 16,
  "weights": [
    {
      "component": "Task completion",
      "weight": 30,
      "sourceField": "score",
      "rationale": "Measures whether the agent produced the accepted workflow result."
    },
    {
      "component": "Evidence and state verification",
      "weight": 25,
      "sourceField": "scoreCalculation evidence/proof labels",
      "rationale": "Rewards proof that the browser, codebase, support policy, or tool state actually reached the target."
    },
    {
      "component": "Recovery behavior",
      "weight": 20,
      "sourceField": "recovery",
      "rationale": "Separates agents that recover from validation errors, failed tools, and partial state from agents that simply stop."
    },
    {
      "component": "Tool and policy correctness",
      "weight": 15,
      "sourceField": "toolCalls and scoringFocus",
      "rationale": "Captures tool discipline, policy adherence, escalation quality, and repository hygiene."
    },
    {
      "component": "Cost and latency discipline",
      "weight": 10,
      "sourceField": "costIndex and latencyIndex",
      "rationale": "Prevents slow or expensive agents from ranking well unless quality justifies the operating cost."
    }
  ],
  "rows": [
    {
      "modelId": "frontier-reasoning",
      "model": "Frontier reasoning model",
      "provider": "Frontier API provider",
      "weightedScore": 72,
      "components": {
        "taskCompletion": 80,
        "evidenceStateVerification": 66,
        "recoveryBehavior": 76,
        "toolPolicyCorrectness": 69,
        "costLatencyDiscipline": 55
      },
      "traceCount": 16,
      "suiteCount": 4,
      "interpretation": "Useful on constrained workflows with reviewer oversight and fallback routing."
    },
    {
      "modelId": "fast-mid-tier",
      "model": "Fast mid-tier model",
      "provider": "Fast hosted API provider",
      "weightedScore": 65,
      "components": {
        "taskCompletion": 69,
        "evidenceStateVerification": 59,
        "recoveryBehavior": 62,
        "toolPolicyCorrectness": 58,
        "costLatencyDiscipline": 83
      },
      "traceCount": 16,
      "suiteCount": 4,
      "interpretation": "Useful on constrained workflows with reviewer oversight and fallback routing."
    },
    {
      "modelId": "open-weight-local",
      "model": "Open-weight local model",
      "provider": "Self-hosted/open-weight stack",
      "weightedScore": 47,
      "components": {
        "taskCompletion": 49,
        "evidenceStateVerification": 44,
        "recoveryBehavior": 44,
        "toolPolicyCorrectness": 41,
        "costLatencyDiscipline": 67
      },
      "traceCount": 16,
      "suiteCount": 4,
      "interpretation": "Research or routing role only until recovery and state proof improve."
    },
    {
      "modelId": "small-routing",
      "model": "Small routing model",
      "provider": "Low-cost routing endpoint",
      "weightedScore": 37,
      "components": {
        "taskCompletion": 32,
        "evidenceStateVerification": 32,
        "recoveryBehavior": 30,
        "toolPolicyCorrectness": 29,
        "costLatencyDiscipline": 94
      },
      "traceCount": 16,
      "suiteCount": 4,
      "interpretation": "Research or routing role only until recovery and state proof improve."
    }
  ],
  "topModel": {
    "modelId": "frontier-reasoning",
    "model": "Frontier reasoning model",
    "provider": "Frontier API provider",
    "weightedScore": 72,
    "components": {
      "taskCompletion": 80,
      "evidenceStateVerification": 66,
      "recoveryBehavior": 76,
      "toolPolicyCorrectness": 69,
      "costLatencyDiscipline": 55
    },
    "traceCount": 16,
    "suiteCount": 4,
    "interpretation": "Useful on constrained workflows with reviewer oversight and fallback routing."
  },
  "methodology": "Scores are generated from current benchmark task traces and suite rows. Replace synthetic rows with real provider or agent harness exports before treating the index as a public procurement ranking."
}
