{
  "version": "0.1.0",
  "generatedAt": "2026-05-16T00:00:00+05:30",
  "purpose": "Evidence readiness ledger for preventing synthetic benchmark scaffolds from being mistaken for production-grade leaderboard evidence.",
  "position": "Current benchmark pages are useful as scaffolds and buyer methodology examples. They should not be treated as final public leaderboard evidence until raw harness logs, provider response ids, screenshot/state proof, repeated seeds, and reviewer signoff are imported.",
  "gateCount": 7,
  "suiteCount": 5,
  "averageReadinessScore": 57,
  "publicPath": "/reports/benchmark-evidence-readiness/readiness.json",
  "markdownPath": "/reports/benchmark-evidence-readiness/readiness.md",
  "gates": [
    {
      "id": "task-design",
      "label": "Task design and split",
      "weight": 14,
      "proof": "Public/private split, task brief, expected output, scoring focus, leakage policy, and retirement rule exist for the suite."
    },
    {
      "id": "gold-answers",
      "label": "Gold answer and rubric",
      "weight": 16,
      "proof": "Every scored task has expected evidence, rubric components, and reviewer-facing scorecard."
    },
    {
      "id": "raw-run-log",
      "label": "Raw run log",
      "weight": 16,
      "proof": "Every model run has raw harness logs, prompt packet id, run seed, exact model version, tool calls, retries, and answer excerpt."
    },
    {
      "id": "provider-identity",
      "label": "Provider response identity",
      "weight": 12,
      "proof": "Every model run preserves provider, model identifier, model version, timestamp, and response id or equivalent audit handle."
    },
    {
      "id": "state-proof",
      "label": "Screenshot or state proof",
      "weight": 12,
      "proof": "Browser/app/document tasks preserve screenshot, DOM state, file diff, or comparable state proof for completion claims."
    },
    {
      "id": "repeatability",
      "label": "Repeat-run stability",
      "weight": 14,
      "proof": "Leaderboard-boundary results are repeated across multiple seeds and instability is reported before ranking."
    },
    {
      "id": "review-signoff",
      "label": "Reviewer signoff",
      "weight": 16,
      "proof": "A named reviewer signs the scorecard, failure reason, and buyer recommendation after inspecting trace artifacts."
    }
  ],
  "suites": [
    {
      "slug": "indian-enterprise-workflow-suite",
      "title": "Indian Enterprise Workflow Suite",
      "status": "Not leaderboard-ready",
      "readinessScore": 57,
      "publicTasks": 10,
      "holdoutTasks": 14,
      "traceCount": 4,
      "artifactBundles": 4,
      "modelRunRows": 16,
      "gates": [
        {
          "id": "task-design",
          "label": "Task design and split",
          "weight": 14,
          "proof": "Public/private split, task brief, expected output, scoring focus, leakage policy, and retirement rule exist for the suite.",
          "status": "Ready scaffold",
          "credit": 14,
          "evidence": "10 public / 14 holdout tasks plus control metadata.",
          "nextStep": "Replace synthetic split counts with the first real harness task manifest."
        },
        {
          "id": "gold-answers",
          "label": "Gold answer and rubric",
          "weight": 16,
          "proof": "Every scored task has expected evidence, rubric components, and reviewer-facing scorecard.",
          "status": "Ready scaffold",
          "credit": 16,
          "evidence": "4 trace packets include expected evidence and score components.",
          "nextStep": "Replace generated expected outputs with reviewer-approved gold packets from real task sources."
        },
        {
          "id": "raw-run-log",
          "label": "Raw run log",
          "weight": 16,
          "proof": "Every model run has raw harness logs, prompt packet id, run seed, exact model version, tool calls, retries, and answer excerpt.",
          "status": "Scaffold only",
          "credit": 7,
          "evidence": "Replay scaffold logs exist, but they are not real provider/harness exports.",
          "nextStep": "Import raw notebook or harness logs through the benchmark intake kit before leaderboard claims depend on these rows."
        },
        {
          "id": "provider-identity",
          "label": "Provider response identity",
          "weight": 12,
          "proof": "Every model run preserves provider, model identifier, model version, timestamp, and response id or equivalent audit handle.",
          "status": "Missing real evidence",
          "credit": 3,
          "evidence": "Rows contain generated modelVersion and provider class labels, not provider response ids.",
          "nextStep": "Capture exact provider model identifiers, timestamps, request ids, and response ids for every run."
        },
        {
          "id": "state-proof",
          "label": "Screenshot or state proof",
          "weight": 12,
          "proof": "Browser/app/document tasks preserve screenshot, DOM state, file diff, or comparable state proof for completion claims.",
          "status": "Missing real evidence",
          "credit": 2,
          "evidence": "Artifact bundles explicitly mark screenshot/state proof as pending.",
          "nextStep": "Attach screenshots, DOM snapshots, diffs, or document outputs to each artifact bundle."
        },
        {
          "id": "repeatability",
          "label": "Repeat-run stability",
          "weight": 14,
          "proof": "Leaderboard-boundary results are repeated across multiple seeds and instability is reported before ranking.",
          "status": "Seeded once",
          "credit": 5,
          "evidence": "Rows preserve generated seeds, but there are no repeated real runs across seeds.",
          "nextStep": "Repeat boundary results across at least three seeds and publish instability bands."
        },
        {
          "id": "review-signoff",
          "label": "Reviewer signoff",
          "weight": 16,
          "proof": "A named reviewer signs the scorecard, failure reason, and buyer recommendation after inspecting trace artifacts.",
          "status": "Ready scaffold",
          "credit": 10,
          "evidence": "Reviewer notes and failure reasons exist for generated trace rows, but named reviewer signoff has not been imported.",
          "nextStep": "Move reviewer signoff into the intake schema with named reviewer, date, and accepted/rejected decision."
        }
      ],
      "blockingGates": [
        {
          "id": "raw-run-log",
          "label": "Raw run log",
          "status": "Scaffold only",
          "nextStep": "Import raw notebook or harness logs through the benchmark intake kit before leaderboard claims depend on these rows."
        },
        {
          "id": "provider-identity",
          "label": "Provider response identity",
          "status": "Missing real evidence",
          "nextStep": "Capture exact provider model identifiers, timestamps, request ids, and response ids for every run."
        },
        {
          "id": "state-proof",
          "label": "Screenshot or state proof",
          "status": "Missing real evidence",
          "nextStep": "Attach screenshots, DOM snapshots, diffs, or document outputs to each artifact bundle."
        },
        {
          "id": "repeatability",
          "label": "Repeat-run stability",
          "status": "Seeded once",
          "nextStep": "Repeat boundary results across at least three seeds and publish instability bands."
        }
      ],
      "nextRealRunStep": "Use /reports/benchmark-intake/runbook.md and /reports/benchmark-intake/run-template.csv to import real model/provider outputs, provider response ids, screenshots/state proof, and repeated seeds."
    },
    {
      "slug": "coding-agent-maintenance-suite",
      "title": "Coding Agent Maintenance Suite",
      "status": "Not leaderboard-ready",
      "readinessScore": 57,
      "publicTasks": 8,
      "holdoutTasks": 12,
      "traceCount": 4,
      "artifactBundles": 4,
      "modelRunRows": 16,
      "gates": [
        {
          "id": "task-design",
          "label": "Task design and split",
          "weight": 14,
          "proof": "Public/private split, task brief, expected output, scoring focus, leakage policy, and retirement rule exist for the suite.",
          "status": "Ready scaffold",
          "credit": 14,
          "evidence": "8 public / 12 holdout tasks plus control metadata.",
          "nextStep": "Replace synthetic split counts with the first real harness task manifest."
        },
        {
          "id": "gold-answers",
          "label": "Gold answer and rubric",
          "weight": 16,
          "proof": "Every scored task has expected evidence, rubric components, and reviewer-facing scorecard.",
          "status": "Ready scaffold",
          "credit": 16,
          "evidence": "4 trace packets include expected evidence and score components.",
          "nextStep": "Replace generated expected outputs with reviewer-approved gold packets from real task sources."
        },
        {
          "id": "raw-run-log",
          "label": "Raw run log",
          "weight": 16,
          "proof": "Every model run has raw harness logs, prompt packet id, run seed, exact model version, tool calls, retries, and answer excerpt.",
          "status": "Scaffold only",
          "credit": 7,
          "evidence": "Replay scaffold logs exist, but they are not real provider/harness exports.",
          "nextStep": "Import raw notebook or harness logs through the benchmark intake kit before leaderboard claims depend on these rows."
        },
        {
          "id": "provider-identity",
          "label": "Provider response identity",
          "weight": 12,
          "proof": "Every model run preserves provider, model identifier, model version, timestamp, and response id or equivalent audit handle.",
          "status": "Missing real evidence",
          "credit": 3,
          "evidence": "Rows contain generated modelVersion and provider class labels, not provider response ids.",
          "nextStep": "Capture exact provider model identifiers, timestamps, request ids, and response ids for every run."
        },
        {
          "id": "state-proof",
          "label": "Screenshot or state proof",
          "weight": 12,
          "proof": "Browser/app/document tasks preserve screenshot, DOM state, file diff, or comparable state proof for completion claims.",
          "status": "Missing real evidence",
          "credit": 2,
          "evidence": "Artifact bundles explicitly mark screenshot/state proof as pending.",
          "nextStep": "Attach screenshots, DOM snapshots, diffs, or document outputs to each artifact bundle."
        },
        {
          "id": "repeatability",
          "label": "Repeat-run stability",
          "weight": 14,
          "proof": "Leaderboard-boundary results are repeated across multiple seeds and instability is reported before ranking.",
          "status": "Seeded once",
          "credit": 5,
          "evidence": "Rows preserve generated seeds, but there are no repeated real runs across seeds.",
          "nextStep": "Repeat boundary results across at least three seeds and publish instability bands."
        },
        {
          "id": "review-signoff",
          "label": "Reviewer signoff",
          "weight": 16,
          "proof": "A named reviewer signs the scorecard, failure reason, and buyer recommendation after inspecting trace artifacts.",
          "status": "Ready scaffold",
          "credit": 10,
          "evidence": "Reviewer notes and failure reasons exist for generated trace rows, but named reviewer signoff has not been imported.",
          "nextStep": "Move reviewer signoff into the intake schema with named reviewer, date, and accepted/rejected decision."
        }
      ],
      "blockingGates": [
        {
          "id": "raw-run-log",
          "label": "Raw run log",
          "status": "Scaffold only",
          "nextStep": "Import raw notebook or harness logs through the benchmark intake kit before leaderboard claims depend on these rows."
        },
        {
          "id": "provider-identity",
          "label": "Provider response identity",
          "status": "Missing real evidence",
          "nextStep": "Capture exact provider model identifiers, timestamps, request ids, and response ids for every run."
        },
        {
          "id": "state-proof",
          "label": "Screenshot or state proof",
          "status": "Missing real evidence",
          "nextStep": "Attach screenshots, DOM snapshots, diffs, or document outputs to each artifact bundle."
        },
        {
          "id": "repeatability",
          "label": "Repeat-run stability",
          "status": "Seeded once",
          "nextStep": "Repeat boundary results across at least three seeds and publish instability bands."
        }
      ],
      "nextRealRunStep": "Use /reports/benchmark-intake/runbook.md and /reports/benchmark-intake/run-template.csv to import real model/provider outputs, provider response ids, screenshots/state proof, and repeated seeds."
    },
    {
      "slug": "browser-operations-suite",
      "title": "Browser Operations Suite",
      "status": "Not leaderboard-ready",
      "readinessScore": 57,
      "publicTasks": 6,
      "holdoutTasks": 10,
      "traceCount": 4,
      "artifactBundles": 4,
      "modelRunRows": 16,
      "gates": [
        {
          "id": "task-design",
          "label": "Task design and split",
          "weight": 14,
          "proof": "Public/private split, task brief, expected output, scoring focus, leakage policy, and retirement rule exist for the suite.",
          "status": "Ready scaffold",
          "credit": 14,
          "evidence": "6 public / 10 holdout tasks plus control metadata.",
          "nextStep": "Replace synthetic split counts with the first real harness task manifest."
        },
        {
          "id": "gold-answers",
          "label": "Gold answer and rubric",
          "weight": 16,
          "proof": "Every scored task has expected evidence, rubric components, and reviewer-facing scorecard.",
          "status": "Ready scaffold",
          "credit": 16,
          "evidence": "4 trace packets include expected evidence and score components.",
          "nextStep": "Replace generated expected outputs with reviewer-approved gold packets from real task sources."
        },
        {
          "id": "raw-run-log",
          "label": "Raw run log",
          "weight": 16,
          "proof": "Every model run has raw harness logs, prompt packet id, run seed, exact model version, tool calls, retries, and answer excerpt.",
          "status": "Scaffold only",
          "credit": 7,
          "evidence": "Replay scaffold logs exist, but they are not real provider/harness exports.",
          "nextStep": "Import raw notebook or harness logs through the benchmark intake kit before leaderboard claims depend on these rows."
        },
        {
          "id": "provider-identity",
          "label": "Provider response identity",
          "weight": 12,
          "proof": "Every model run preserves provider, model identifier, model version, timestamp, and response id or equivalent audit handle.",
          "status": "Missing real evidence",
          "credit": 3,
          "evidence": "Rows contain generated modelVersion and provider class labels, not provider response ids.",
          "nextStep": "Capture exact provider model identifiers, timestamps, request ids, and response ids for every run."
        },
        {
          "id": "state-proof",
          "label": "Screenshot or state proof",
          "weight": 12,
          "proof": "Browser/app/document tasks preserve screenshot, DOM state, file diff, or comparable state proof for completion claims.",
          "status": "Missing real evidence",
          "credit": 2,
          "evidence": "Artifact bundles explicitly mark screenshot/state proof as pending.",
          "nextStep": "Attach screenshots, DOM snapshots, diffs, or document outputs to each artifact bundle."
        },
        {
          "id": "repeatability",
          "label": "Repeat-run stability",
          "weight": 14,
          "proof": "Leaderboard-boundary results are repeated across multiple seeds and instability is reported before ranking.",
          "status": "Seeded once",
          "credit": 5,
          "evidence": "Rows preserve generated seeds, but there are no repeated real runs across seeds.",
          "nextStep": "Repeat boundary results across at least three seeds and publish instability bands."
        },
        {
          "id": "review-signoff",
          "label": "Reviewer signoff",
          "weight": 16,
          "proof": "A named reviewer signs the scorecard, failure reason, and buyer recommendation after inspecting trace artifacts.",
          "status": "Ready scaffold",
          "credit": 10,
          "evidence": "Reviewer notes and failure reasons exist for generated trace rows, but named reviewer signoff has not been imported.",
          "nextStep": "Move reviewer signoff into the intake schema with named reviewer, date, and accepted/rejected decision."
        }
      ],
      "blockingGates": [
        {
          "id": "raw-run-log",
          "label": "Raw run log",
          "status": "Scaffold only",
          "nextStep": "Import raw notebook or harness logs through the benchmark intake kit before leaderboard claims depend on these rows."
        },
        {
          "id": "provider-identity",
          "label": "Provider response identity",
          "status": "Missing real evidence",
          "nextStep": "Capture exact provider model identifiers, timestamps, request ids, and response ids for every run."
        },
        {
          "id": "state-proof",
          "label": "Screenshot or state proof",
          "status": "Missing real evidence",
          "nextStep": "Attach screenshots, DOM snapshots, diffs, or document outputs to each artifact bundle."
        },
        {
          "id": "repeatability",
          "label": "Repeat-run stability",
          "status": "Seeded once",
          "nextStep": "Repeat boundary results across at least three seeds and publish instability bands."
        }
      ],
      "nextRealRunStep": "Use /reports/benchmark-intake/runbook.md and /reports/benchmark-intake/run-template.csv to import real model/provider outputs, provider response ids, screenshots/state proof, and repeated seeds."
    },
    {
      "slug": "support-agent-policy-suite",
      "title": "Support Agent Policy Suite",
      "status": "Not leaderboard-ready",
      "readinessScore": 57,
      "publicTasks": 8,
      "holdoutTasks": 12,
      "traceCount": 4,
      "artifactBundles": 4,
      "modelRunRows": 16,
      "gates": [
        {
          "id": "task-design",
          "label": "Task design and split",
          "weight": 14,
          "proof": "Public/private split, task brief, expected output, scoring focus, leakage policy, and retirement rule exist for the suite.",
          "status": "Ready scaffold",
          "credit": 14,
          "evidence": "8 public / 12 holdout tasks plus control metadata.",
          "nextStep": "Replace synthetic split counts with the first real harness task manifest."
        },
        {
          "id": "gold-answers",
          "label": "Gold answer and rubric",
          "weight": 16,
          "proof": "Every scored task has expected evidence, rubric components, and reviewer-facing scorecard.",
          "status": "Ready scaffold",
          "credit": 16,
          "evidence": "4 trace packets include expected evidence and score components.",
          "nextStep": "Replace generated expected outputs with reviewer-approved gold packets from real task sources."
        },
        {
          "id": "raw-run-log",
          "label": "Raw run log",
          "weight": 16,
          "proof": "Every model run has raw harness logs, prompt packet id, run seed, exact model version, tool calls, retries, and answer excerpt.",
          "status": "Scaffold only",
          "credit": 7,
          "evidence": "Replay scaffold logs exist, but they are not real provider/harness exports.",
          "nextStep": "Import raw notebook or harness logs through the benchmark intake kit before leaderboard claims depend on these rows."
        },
        {
          "id": "provider-identity",
          "label": "Provider response identity",
          "weight": 12,
          "proof": "Every model run preserves provider, model identifier, model version, timestamp, and response id or equivalent audit handle.",
          "status": "Missing real evidence",
          "credit": 3,
          "evidence": "Rows contain generated modelVersion and provider class labels, not provider response ids.",
          "nextStep": "Capture exact provider model identifiers, timestamps, request ids, and response ids for every run."
        },
        {
          "id": "state-proof",
          "label": "Screenshot or state proof",
          "weight": 12,
          "proof": "Browser/app/document tasks preserve screenshot, DOM state, file diff, or comparable state proof for completion claims.",
          "status": "Missing real evidence",
          "credit": 2,
          "evidence": "Artifact bundles explicitly mark screenshot/state proof as pending.",
          "nextStep": "Attach screenshots, DOM snapshots, diffs, or document outputs to each artifact bundle."
        },
        {
          "id": "repeatability",
          "label": "Repeat-run stability",
          "weight": 14,
          "proof": "Leaderboard-boundary results are repeated across multiple seeds and instability is reported before ranking.",
          "status": "Seeded once",
          "credit": 5,
          "evidence": "Rows preserve generated seeds, but there are no repeated real runs across seeds.",
          "nextStep": "Repeat boundary results across at least three seeds and publish instability bands."
        },
        {
          "id": "review-signoff",
          "label": "Reviewer signoff",
          "weight": 16,
          "proof": "A named reviewer signs the scorecard, failure reason, and buyer recommendation after inspecting trace artifacts.",
          "status": "Ready scaffold",
          "credit": 10,
          "evidence": "Reviewer notes and failure reasons exist for generated trace rows, but named reviewer signoff has not been imported.",
          "nextStep": "Move reviewer signoff into the intake schema with named reviewer, date, and accepted/rejected decision."
        }
      ],
      "blockingGates": [
        {
          "id": "raw-run-log",
          "label": "Raw run log",
          "status": "Scaffold only",
          "nextStep": "Import raw notebook or harness logs through the benchmark intake kit before leaderboard claims depend on these rows."
        },
        {
          "id": "provider-identity",
          "label": "Provider response identity",
          "status": "Missing real evidence",
          "nextStep": "Capture exact provider model identifiers, timestamps, request ids, and response ids for every run."
        },
        {
          "id": "state-proof",
          "label": "Screenshot or state proof",
          "status": "Missing real evidence",
          "nextStep": "Attach screenshots, DOM snapshots, diffs, or document outputs to each artifact bundle."
        },
        {
          "id": "repeatability",
          "label": "Repeat-run stability",
          "status": "Seeded once",
          "nextStep": "Repeat boundary results across at least three seeds and publish instability bands."
        }
      ],
      "nextRealRunStep": "Use /reports/benchmark-intake/runbook.md and /reports/benchmark-intake/run-template.csv to import real model/provider outputs, provider response ids, screenshots/state proof, and repeated seeds."
    },
    {
      "slug": "ai-security-risk-suite",
      "title": "AI Security & Risk Suite",
      "status": "Not leaderboard-ready",
      "readinessScore": 57,
      "publicTasks": 6,
      "holdoutTasks": 10,
      "traceCount": 4,
      "artifactBundles": 4,
      "modelRunRows": 16,
      "gates": [
        {
          "id": "task-design",
          "label": "Task design and split",
          "weight": 14,
          "proof": "Public/private split, task brief, expected output, scoring focus, leakage policy, and retirement rule exist for the suite.",
          "status": "Ready scaffold",
          "credit": 14,
          "evidence": "6 public / 10 holdout tasks plus control metadata.",
          "nextStep": "Replace synthetic split counts with the first real harness task manifest."
        },
        {
          "id": "gold-answers",
          "label": "Gold answer and rubric",
          "weight": 16,
          "proof": "Every scored task has expected evidence, rubric components, and reviewer-facing scorecard.",
          "status": "Ready scaffold",
          "credit": 16,
          "evidence": "4 trace packets include expected evidence and score components.",
          "nextStep": "Replace generated expected outputs with reviewer-approved gold packets from real task sources."
        },
        {
          "id": "raw-run-log",
          "label": "Raw run log",
          "weight": 16,
          "proof": "Every model run has raw harness logs, prompt packet id, run seed, exact model version, tool calls, retries, and answer excerpt.",
          "status": "Scaffold only",
          "credit": 7,
          "evidence": "Replay scaffold logs exist, but they are not real provider/harness exports.",
          "nextStep": "Import raw notebook or harness logs through the benchmark intake kit before leaderboard claims depend on these rows."
        },
        {
          "id": "provider-identity",
          "label": "Provider response identity",
          "weight": 12,
          "proof": "Every model run preserves provider, model identifier, model version, timestamp, and response id or equivalent audit handle.",
          "status": "Missing real evidence",
          "credit": 3,
          "evidence": "Rows contain generated modelVersion and provider class labels, not provider response ids.",
          "nextStep": "Capture exact provider model identifiers, timestamps, request ids, and response ids for every run."
        },
        {
          "id": "state-proof",
          "label": "Screenshot or state proof",
          "weight": 12,
          "proof": "Browser/app/document tasks preserve screenshot, DOM state, file diff, or comparable state proof for completion claims.",
          "status": "Missing real evidence",
          "credit": 2,
          "evidence": "Artifact bundles explicitly mark screenshot/state proof as pending.",
          "nextStep": "Attach screenshots, DOM snapshots, diffs, or document outputs to each artifact bundle."
        },
        {
          "id": "repeatability",
          "label": "Repeat-run stability",
          "weight": 14,
          "proof": "Leaderboard-boundary results are repeated across multiple seeds and instability is reported before ranking.",
          "status": "Seeded once",
          "credit": 5,
          "evidence": "Rows preserve generated seeds, but there are no repeated real runs across seeds.",
          "nextStep": "Repeat boundary results across at least three seeds and publish instability bands."
        },
        {
          "id": "review-signoff",
          "label": "Reviewer signoff",
          "weight": 16,
          "proof": "A named reviewer signs the scorecard, failure reason, and buyer recommendation after inspecting trace artifacts.",
          "status": "Ready scaffold",
          "credit": 10,
          "evidence": "Reviewer notes and failure reasons exist for generated trace rows, but named reviewer signoff has not been imported.",
          "nextStep": "Move reviewer signoff into the intake schema with named reviewer, date, and accepted/rejected decision."
        }
      ],
      "blockingGates": [
        {
          "id": "raw-run-log",
          "label": "Raw run log",
          "status": "Scaffold only",
          "nextStep": "Import raw notebook or harness logs through the benchmark intake kit before leaderboard claims depend on these rows."
        },
        {
          "id": "provider-identity",
          "label": "Provider response identity",
          "status": "Missing real evidence",
          "nextStep": "Capture exact provider model identifiers, timestamps, request ids, and response ids for every run."
        },
        {
          "id": "state-proof",
          "label": "Screenshot or state proof",
          "status": "Missing real evidence",
          "nextStep": "Attach screenshots, DOM snapshots, diffs, or document outputs to each artifact bundle."
        },
        {
          "id": "repeatability",
          "label": "Repeat-run stability",
          "status": "Seeded once",
          "nextStep": "Repeat boundary results across at least three seeds and publish instability bands."
        }
      ],
      "nextRealRunStep": "Use /reports/benchmark-intake/runbook.md and /reports/benchmark-intake/run-template.csv to import real model/provider outputs, provider response ids, screenshots/state proof, and repeated seeds."
    }
  ],
  "nextImportArtifacts": [
    "/reports/benchmark-intake/runbook.md",
    "/reports/benchmark-intake/run-template.csv",
    "/reports/benchmark-intake/run-schema.json",
    "/reports/benchmark-intake/reviewer-checklist.md"
  ]
}
