{
  "slug": "agent-benchmark-explorer",
  "title": "Agent Benchmark Explorer",
  "stage": "Preview",
  "audience": "AI teams comparing autonomous workflows",
  "summary": "A structured benchmark surface for measuring whether agents can plan, use tools, recover from errors, and complete useful work rather than only answer prompts.",
  "href": "/studio/agent-benchmark-explorer",
  "problem": "Agent buyers need to know whether a system can finish work across tools, not whether the base model can write an impressive paragraph. This product turns messy agent demos into repeatable runs with pass/fail evidence.",
  "metrics": [
    "Task completion",
    "Tool-call quality",
    "Recovery rate",
    "Cost per resolved task"
  ],
  "outputs": [
    "Agent scorecard",
    "Trace review table",
    "Failure taxonomy",
    "Cost-per-resolved-task breakdown"
  ],
  "buyerQuestions": [
    "Can the agent recover after a bad tool call?",
    "Does it verify state before claiming completion?",
    "How much does each accepted workflow actually cost?",
    "Which failures should trigger human handoff?"
  ],
  "evidenceLinks": [
    [
      "Agentic Reliability Index",
      "/leaderboards#agentic-reliability-index"
    ],
    [
      "Browser Operations Suite",
      "/benchmarks/browser-operations-suite"
    ],
    [
      "Agent benchmarks article",
      "/articles/agent-benchmarks-that-survive-real-work"
    ]
  ],
  "demoState": "Preview dashboard with synthetic benchmark traces; real trace ingestion is the next build step.",
  "catalogRank": 1,
  "maturity": "Can be shown as a product demo today",
  "packetPath": "/reports/studio/agent-benchmark-explorer.md",
  "jsonPath": "/reports/studio/agent-benchmark-explorer.json",
  "previewImage": "/reports/studio/previews/agent-benchmark-explorer.png",
  "previewAlt": "Agent Benchmark Explorer live Studio preview screenshot",
  "demoReadinessScore": 76,
  "demoReadiness": "Research-demo ready with generated traces; needs reviewer-signed real run artifacts before it should drive a leaderboard or procurement decision.",
  "missingForLiveDemo": [
    "Reviewer-signed trace",
    "Real screenshot/media",
    "Provider or agent export"
  ],
  "handoffOwner": "Sanjay Prasad",
  "nextDemoArtifact": "/reports/studio/agent-benchmark-explorer.md"
}