{
  "purpose": "Procurement matrix for deciding when an AI buyer should stay on managed APIs, test hosted open-weight inference, reserve dedicated endpoints, self-host on cloud GPUs, or buy hardware.",
  "decisionGates": [
    {
      "gate": "Demand shape",
      "buyerQuestion": "Is the workload bursty, steady, interactive, or offline?",
      "evidenceNeeded": "Hourly request histogram, p95 latency target, batchable share, concurrency peaks, and idle windows.",
      "failureIfSkipped": "Teams reserve expensive capacity for a workload that could have stayed serverless or batched."
    },
    {
      "gate": "Utilization proof",
      "buyerQuestion": "Can the buyer keep the accelerator busy enough to beat hosted inference?",
      "evidenceNeeded": "Accepted output tokens per second, GPU occupancy, queue depth, autoscaling floor, and peak-to-idle ratio.",
      "failureIfSkipped": "A self-hosted route looks cheap on token math but loses money during idle time and incidents."
    },
    {
      "gate": "Memory and context fit",
      "buyerQuestion": "Does the model, KV cache, batch size, and context window fit the planned hardware route?",
      "evidenceNeeded": "Model size, quantization plan, context length, batch size, KV-cache budget, and observed out-of-memory events.",
      "failureIfSkipped": "The selected GPU can run a demo but cannot hold production context, throughput, or concurrent sessions."
    },
    {
      "gate": "Operations maturity",
      "buyerQuestion": "Who owns serving, upgrades, observability, security patches, and incident response?",
      "evidenceNeeded": "Runbook owner, monitoring plan, rollback route, model upgrade policy, and security review.",
      "failureIfSkipped": "The hardware route becomes an unowned platform project instead of a cheaper inference path."
    },
    {
      "gate": "Fallback economics",
      "buyerQuestion": "What happens when the open or self-hosted route fails the task or saturates?",
      "evidenceNeeded": "Fallback model, routing threshold, retry policy, review minutes, and fallback cost per accepted output.",
      "failureIfSkipped": "Savings disappear because hard cases silently route through a premium model without being measured."
    }
  ],
  "routes": [
    {
      "route": "Managed API only",
      "stage": "Baseline",
      "useWhen": "The buyer needs speed and high-quality reasoning while demand is still uncertain and infrastructure ownership must stay minimal.",
      "minimumEvidence": "Private task pass rate, cost per accepted output, p95 latency, and retry/review fallout.",
      "avoidWhen": "The workload is steady, privacy-constrained, or large enough that dedicated capacity can be proven.",
      "owner": "Product and AI lead"
    },
    {
      "route": "Hosted open-weight serverless",
      "stage": "Optionality",
      "useWhen": "Teams want to test Mistral, Qwen, DeepSeek, Llama, or specialist models without buying capacity.",
      "minimumEvidence": "Model fit by task class, token price, cold-start behavior, cache policy, and acceptance rate.",
      "avoidWhen": "Tail latency and provider variability break the workflow or procurement needs a fixed data boundary.",
      "owner": "AI engineer"
    },
    {
      "route": "Dedicated hosted endpoint",
      "stage": "Production lane",
      "useWhen": "Traffic is predictable enough to reserve replicas, and latency/governance need tighter control.",
      "minimumEvidence": "Seven-day traffic shape, autoscaling floor, utilization, p95/p99 latency, and fallback route.",
      "avoidWhen": "Utilization is weak, model choice is still changing weekly, or incident ownership is unclear.",
      "owner": "Platform and operations"
    },
    {
      "route": "Cloud GPU self-hosting",
      "stage": "Control lane",
      "useWhen": "The buyer needs custom serving, data control, fine-tuned models, or high-utilization open-weight inference.",
      "minimumEvidence": "GPU-hour cost, serving stack benchmark, occupancy, queue depth, upgrade runbook, and security review.",
      "avoidWhen": "The team is still learning which model solves the workflow or cannot monitor serving reliably.",
      "owner": "Infrastructure owner"
    },
    {
      "route": "Owned hardware",
      "stage": "Strategic capacity",
      "useWhen": "Workload volume, data boundary, procurement horizon, and engineering maturity justify long-lived capacity.",
      "minimumEvidence": "Three-month demand forecast, depreciation model, power/cooling plan, support contract, and fallback API.",
      "avoidWhen": "Demand is speculative, model architecture is volatile, or utilization cannot be audited continuously.",
      "owner": "Executive sponsor and infrastructure"
    }
  ],
  "readinessScores": [
    {
      "dimension": "Demand forecast",
      "score": 82,
      "note": "Enough traffic evidence to compare hosted, dedicated, and self-hosted routes."
    },
    {
      "dimension": "Utilization model",
      "score": 68,
      "note": "Needs measured occupancy from real provider or GPU traces before buying capacity."
    },
    {
      "dimension": "Ops ownership",
      "score": 56,
      "note": "Serving, rollback, upgrade, and incident ownership must be explicit before self-hosting."
    },
    {
      "dimension": "Fallback design",
      "score": 74,
      "note": "Route hard cases back to a managed model and report blended cost per accepted output."
    },
    {
      "dimension": "Procurement clarity",
      "score": 61,
      "note": "Security, residency, vendor support, and budget horizon still need buyer-specific input."
    }
  ],
  "artifacts": {
    "markdown": "/reports/hardware-procurement-matrix.md",
    "json": "/reports/hardware-procurement-matrix.json"
  }
}
