{
  "version": "0.1.0",
  "generatedAt": "2026-05-16T01:15:55.365Z",
  "purpose": "Inference economics playbook for comparing managed APIs, hosted open-weight inference, dedicated endpoints, and self-hosted GPU stacks.",
  "routeCount": 4,
  "leverCount": 5,
  "sourceCount": 6,
  "traceFieldCount": 16,
  "traceMetricCount": 5,
  "hardwareGateCount": 5,
  "hardwareRouteCount": 5,
  "routes": [
    {
      "route": "Managed frontier API",
      "bestFor": "High-stakes reasoning, low ops burden, fastest path to production evidence.",
      "costDrivers": [
        "Input/output tokens",
        "reasoning overhead",
        "tool calls",
        "retries",
        "cache hit rate"
      ],
      "watchouts": "Nominal token price can be cheaper than self-hosting when quality reduces retries and review.",
      "buyerDecision": "Use as the baseline route for private evals and fallback routing."
    },
    {
      "route": "Hosted open-weight serverless",
      "bestFor": "Testing Mistral, Qwen, DeepSeek, Llama, and other open models without owning GPUs.",
      "costDrivers": [
        "Per-token price",
        "model host",
        "cold-start policy",
        "tail latency",
        "prompt-cache behavior"
      ],
      "watchouts": "Provider/model choice, batching, and replica-local cache behavior can dominate the final curve.",
      "buyerDecision": "Use when optionality matters and private infrastructure is not justified yet."
    },
    {
      "route": "Dedicated endpoint",
      "bestFor": "Predictable production traffic, data boundary control, and more consistent latency.",
      "costDrivers": [
        "Reserved GPU class",
        "utilization",
        "autoscaling floor",
        "batching",
        "peak/idle ratio"
      ],
      "watchouts": "A dedicated endpoint can be more expensive than serverless if utilization stays low.",
      "buyerDecision": "Use after public/serverless traces prove steady volume and clear latency targets."
    },
    {
      "route": "Self-hosted GPU stack",
      "bestFor": "Strict data control, high utilization, custom kernels, fine-tuned models, and deep optimization.",
      "costDrivers": [
        "GPU capex/rental",
        "utilization",
        "memory bandwidth",
        "KV cache",
        "engineering time",
        "operations"
      ],
      "watchouts": "The real cost includes upgrades, observability, security, model serving, and incident response.",
      "buyerDecision": "Use when workload volume, governance, and engineering maturity justify owning the serving layer."
    }
  ],
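  "routeBreakevenSketch": {
    "note": "Illustrative arithmetic for the dedicated-vs-serverless watchout; both prices are assumptions for the sketch, not vendor quotes.",
    "assumptions": {
      "dedicatedReplicaUsdPerHour": 3.5,
      "serverlessBlendedUsdPerMillionTokens": 0.9
    },
    "calculation": "Breakeven throughput = 3.50 / 0.90 = ~3.9M tokens/hour, roughly 1080 sustained tokens/sec.",
    "reading": "Below that sustained volume the dedicated replica costs more than serverless; above it, the reservation starts to pay for itself."
  },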
  "levers": [
    {
      "lever": "Batch eligibility",
      "question": "Can this workflow wait minutes or hours instead of responding live?",
      "evidence": "Queue delay tolerance, SLA class, batch endpoint availability, accepted-output score at batch settings.",
      "impact": "High for offline evals, invoice review, document extraction, CRM cleanup, and synthetic-data generation."
    },
    {
      "lever": "Prompt cache shape",
      "question": "Which policy, schema, instruction, or retrieval blocks repeat across requests?",
      "evidence": "Stable prefix design, cache hit rate, cache-read/write pricing, session affinity behavior.",
      "impact": "High for policy-heavy support, legal review, RAG, and repeated benchmark harness runs."
    },
    {
      "lever": "Throughput and utilization",
      "question": "How many accepted tokens per second does the system produce at target latency?",
      "evidence": "Tokens/sec/user, requests/sec, GPU occupancy, queue depth, p50/p95/p99 latency.",
      "impact": "Decides whether serverless, dedicated endpoints, or self-hosting is economically rational."
    },
    {
      "lever": "Queueing and tail latency",
      "question": "What happens at peak load, long context, and retry storms?",
      "evidence": "p95/p99 latency, timeout rate, retry count, queue wait, rate-limit behavior.",
      "impact": "Often more important than average latency for browser agents, support, and live workflows."
    },
    {
      "lever": "Quality-adjusted cost",
      "question": "What is the cost per accepted workflow after retries and human review?",
      "evidence": "Pass rate, review minutes, failure severity, fallback route, cost per accepted output.",
      "impact": "Prevents teams from picking a cheap model that creates expensive review fallout."
    }
  ],
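  "leverSketches": {
    "note": "Worked examples for the batch and cache levers; the discounts and prices are assumptions, since actual policies vary by provider and must be read from the pricing pages in sourceTrail.",
    "batchEligibility": {
      "assumption": "A 50% batch discount, a common but provider-specific policy.",
      "calculation": "A 10M-token nightly eval priced at 9.00 USD on the live lane drops to 4.50 USD batched."
    },
    "promptCache": {
      "assumption": "Input at 0.60 USD/1M tokens, cache reads at 0.15 USD/1M tokens, 0.55 hit rate.",
      "calculation": "Effective input price = 0.60 * (1 - 0.55) + 0.15 * 0.55 = ~0.35 USD/1M tokens, about a 41% reduction."
    }
  },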
  "sourceTrail": [
    {
      "label": "NVIDIA Blackwell MLPerf inference results",
      "url": "https://blogs.nvidia.com/blog/blackwell-mlperf-inference/",
      "signal": "Rack-scale inference performance and throughput can change materially with hardware generation and serving software."
    },
    {
      "label": "NVIDIA Blackwell Ultra MLPerf inference",
      "url": "https://blogs.nvidia.com/blog/mlperf-inference-blackwell-ultra/",
      "signal": "Reasoning inference throughput is an active benchmark surface, not a fixed property of a GPU alone."
    },
    {
      "label": "Together AI pricing",
      "url": "https://www.together.ai/pricing",
      "signal": "Hosted inference surfaces serverless, dedicated, and batch economics that should be compared by workload class."
    },
    {
      "label": "Together AI Batch Inference docs",
      "url": "https://docs.together.ai/docs/inference/batch",
      "signal": "Batch is appropriate for offline classification, evals, generation, and summarization, not interactive work."
    },
    {
      "label": "Fireworks pricing",
      "url": "https://fireworks.ai/pricing",
      "signal": "Serverless token pricing and on-demand deployment pricing create distinct economics for open-model routes."
    },
    {
      "label": "Fireworks serverless overview",
      "url": "https://docs.fireworks.ai/serverless/overview",
      "signal": "Prompt caching and affinity behavior can affect effective cost and latency for repeated prefixes."
    }
  ],
  "hardwareProcurementMatrix": {
    "purpose": "Procurement matrix for deciding when an AI buyer should stay on managed APIs, test hosted open-weight inference, reserve dedicated endpoints, self-host on cloud GPUs, or buy hardware.",
    "decisionGates": [
      {
        "gate": "Demand shape",
        "buyerQuestion": "Is the workload bursty, steady, interactive, or offline?",
        "evidenceNeeded": "Hourly request histogram, p95 latency target, batchable share, concurrency peaks, and idle windows.",
        "failureIfSkipped": "Teams reserve expensive capacity for a workload that could have stayed serverless or batched."
      },
      {
        "gate": "Utilization proof",
        "buyerQuestion": "Can the buyer keep the accelerator busy enough to beat hosted inference?",
        "evidenceNeeded": "Accepted output tokens per second, GPU occupancy, queue depth, autoscaling floor, and peak-to-idle ratio.",
        "failureIfSkipped": "A self-hosted route looks cheap on token math but loses money during idle time and incidents."
      },
      {
        "gate": "Memory and context fit",
        "buyerQuestion": "Does the model, KV cache, batch size, and context window fit the planned hardware route?",
        "evidenceNeeded": "Model size, quantization plan, context length, batch size, KV-cache budget, and observed out-of-memory events.",
        "failureIfSkipped": "The selected GPU can run a demo but cannot hold production context, throughput, or concurrent sessions."
      },
      {
        "gate": "Operations maturity",
        "buyerQuestion": "Who owns serving, upgrades, observability, security patches, and incident response?",
        "evidenceNeeded": "Runbook owner, monitoring plan, rollback route, model upgrade policy, and security review.",
        "failureIfSkipped": "The hardware route becomes an unowned platform project instead of a cheaper inference path."
      },
      {
        "gate": "Fallback economics",
        "buyerQuestion": "What happens when the open or self-hosted route fails the task or saturates?",
        "evidenceNeeded": "Fallback model, routing threshold, retry policy, review minutes, and fallback cost per accepted output.",
        "failureIfSkipped": "Savings disappear because hard cases silently route through a premium model without being measured."
      }
    ],
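    "kvCacheSizingSketch": {
      "note": "Supports the memory-and-context-fit gate. The dimensions are an illustrative grouped-query-attention configuration, not a specific model's published numbers.",
      "formula": "kvBytesPerToken = 2 (K and V) * layers * kvHeads * headDim * bytesPerValue",
      "assumptions": {
        "layers": 80,
        "kvHeads": 8,
        "headDim": 128,
        "bytesPerValue": 2
      },
      "calculation": "2 * 80 * 8 * 128 * 2 = 327,680 bytes (~320 KiB) per token; at 32k context and batch 8 that is ~80 GiB of KV cache on top of the model weights.",
      "reading": "A GPU that runs the demo at batch 1 and short context can still fail this gate at production batch size and context length."
    },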
    "routes": [
      {
        "route": "Managed API only",
        "stage": "Baseline",
        "useWhen": "The buyer needs speed, high-quality reasoning, uncertain demand, and minimal infrastructure ownership.",
        "minimumEvidence": "Private task pass rate, cost per accepted output, p95 latency, and retry/review fallout.",
        "avoidWhen": "The workload is steady, privacy-constrained, or large enough that dedicated capacity can be proven.",
        "owner": "Product and AI lead"
      },
      {
        "route": "Hosted open-weight serverless",
        "stage": "Optionality",
        "useWhen": "Teams want to test Mistral, Qwen, DeepSeek, Llama, or specialist models without buying capacity.",
        "minimumEvidence": "Model fit by task class, token price, cold-start behavior, cache policy, and acceptance rate.",
        "avoidWhen": "Tail latency and provider variability break the workflow or procurement needs a fixed data boundary.",
        "owner": "AI engineer"
      },
      {
        "route": "Dedicated hosted endpoint",
        "stage": "Production lane",
        "useWhen": "Traffic is predictable enough to reserve replicas, and latency/governance need tighter control.",
        "minimumEvidence": "Seven-day traffic shape, autoscaling floor, utilization, p95/p99 latency, and fallback route.",
        "avoidWhen": "Utilization is weak, model choice is still changing weekly, or incident ownership is unclear.",
        "owner": "Platform and operations"
      },
      {
        "route": "Cloud GPU self-hosting",
        "stage": "Control lane",
        "useWhen": "The buyer needs custom serving, data control, fine-tuned models, or high-utilization open-weight inference.",
        "minimumEvidence": "GPU-hour cost, serving stack benchmark, occupancy, queue depth, upgrade runbook, and security review.",
        "avoidWhen": "The team is still learning which model solves the workflow or cannot monitor serving reliably.",
        "owner": "Infrastructure owner"
      },
      {
        "route": "Owned hardware",
        "stage": "Strategic capacity",
        "useWhen": "Workload volume, data boundary, procurement horizon, and engineering maturity justify long-lived capacity.",
        "minimumEvidence": "Three-month demand forecast, depreciation model, power/cooling plan, support contract, and fallback API.",
        "avoidWhen": "Demand is speculative, model architecture is volatile, or utilization cannot be audited continuously.",
        "owner": "Executive sponsor and infrastructure"
      }
    ],
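    "ownershipCostSketch": {
      "note": "Supports the owned-hardware depreciation evidence. Every number is an assumption for illustration; staffing, cooling overhead, networking, and support contracts are deliberately excluded.",
      "assumptions": {
        "serverCostUsd": 250000,
        "straightLineMonths": 36,
        "powerKw": 10,
        "usdPerKwh": 0.15,
        "gpusPerServer": 8,
        "cloudRentalUsdPerGpuHour": 3.5,
        "hoursPerMonth": 720
      },
      "calculation": "Owned: 250,000 / 36 = ~6,944 USD/month depreciation + 10 * 720 * 0.15 = 1,080 USD/month power = ~8,024 USD/month. Cloud rental at full utilization: 8 * 3.50 * 720 = 20,160 USD/month.",
      "reading": "Ownership beats rental only when utilization stays high and the excluded operations costs are explicitly owned and budgeted; at low utilization the comparison inverts."
    },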
    "readinessScores": [
      {
        "dimension": "Demand forecast",
        "score": 82,
        "note": "Enough traffic evidence to compare hosted, dedicated, and self-hosted routes."
      },
      {
        "dimension": "Utilization model",
        "score": 68,
        "note": "Needs measured occupancy from real provider or GPU traces before buying capacity."
      },
      {
        "dimension": "Ops ownership",
        "score": 56,
        "note": "Serving, rollback, upgrade, and incident ownership must be explicit before self-hosting."
      },
      {
        "dimension": "Fallback design",
        "score": 74,
        "note": "Route hard cases back to a managed model and report blended cost per accepted output."
      },
      {
        "dimension": "Procurement clarity",
        "score": 61,
        "note": "Security, residency, vendor support, and budget horizon still need buyer-specific input."
      }
    ],
    "artifacts": {
      "markdown": "/reports/hardware-procurement-matrix.md",
      "json": "/reports/hardware-procurement-matrix.json"
    }
  },
  "traceKit": {
    "purpose": "Import contract for measured latency, throughput, cache, batch, cost, and acceptance traces across model/provider routes.",
    "fields": [
      {
        "field": "route",
        "description": "Managed frontier API, hosted open-weight serverless, dedicated endpoint, or self-hosted GPU stack."
      },
      {
        "field": "provider",
        "description": "Provider, deployment owner, or local stack name."
      },
      {
        "field": "model",
        "description": "Exact model identifier, quantization, or endpoint alias."
      },
      {
        "field": "workload",
        "description": "Task family such as support policy, browser extraction, coding agent, or invoice review."
      },
      {
        "field": "inputTokens",
        "description": "Observed input tokens for the run."
      },
      {
        "field": "outputTokens",
        "description": "Observed output tokens for the run."
      },
      {
        "field": "cacheHitRate",
        "description": "Observed cache hit rate from 0 to 1, or 0 if unavailable."
      },
      {
        "field": "batchEligible",
        "description": "true if the workload can run through a batch/offline lane."
      },
      {
        "field": "concurrency",
        "description": "Concurrent request count during the measurement."
      },
      {
        "field": "ttftMs",
        "description": "Time to first token in milliseconds."
      },
      {
        "field": "latencyP50Ms",
        "description": "p50 end-to-end latency in milliseconds."
      },
      {
        "field": "latencyP95Ms",
        "description": "p95 end-to-end latency in milliseconds."
      },
      {
        "field": "outputTokensPerSecond",
        "description": "Sustained output-token generation rate."
      },
      {
        "field": "acceptedOutputRate",
        "description": "Share of outputs accepted by the evaluator or reviewer."
      },
      {
        "field": "costUsd",
        "description": "Observed or modeled cost for this measurement row."
      },
      {
        "field": "reviewMinutes",
        "description": "Human review minutes per output after model generation."
      }
    ],
    "metrics": [
      {
        "metric": "Cost per accepted output",
        "formula": "costUsd / acceptedOutputRate + reviewer time cost",
        "why": "A cheap model is not cheap if it creates rejected outputs or review fallout."
      },
      {
        "metric": "p95 latency at concurrency",
        "formula": "latencyP95Ms grouped by workload and route",
        "why": "Interactive agents fail on slow tails, not average latency."
      },
      {
        "metric": "Cache-adjusted input burden",
        "formula": "inputTokens * (1 - cacheHitRate)",
        "why": "Repeated policy and schema prefixes should visibly bend the cost curve."
      },
      {
        "metric": "Batch displacement share",
        "formula": "batchEligible rows / all rows by workload",
        "why": "Offline work should not be priced like live chat if the product can wait."
      },
      {
        "metric": "Throughput headroom",
        "formula": "outputTokensPerSecond * concurrency at acceptedOutputRate",
        "why": "Dedicated or self-hosted routes only make sense when utilization can stay high."
      }
    ],
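    "workedExamples": {
      "note": "Arithmetic computed from the templateRows below; reviewerCostPerMinute is assumed at 1.00 USD, a buyer-supplied rate rather than a trace field.",
      "costPerAcceptedOutput": [
        "Managed frontier API: (0.215 + 3.5 * 1.00) / 0.84 = ~4.42 USD",
        "Hosted open-weight serverless: (0.046 + 5.0 * 1.00) / 0.72 = ~7.01 USD",
        "Dedicated endpoint: (0.032 + 2.8 * 1.00) / 0.78 = ~3.63 USD"
      ],
      "cacheAdjustedInputBurden": [
        "Managed frontier API: 6200 * (1 - 0.35) = 4030 tokens",
        "Hosted open-weight serverless: 4100 * (1 - 0.10) = 3690 tokens",
        "Dedicated endpoint: 2800 * (1 - 0.55) = 1260 tokens"
      ],
      "throughputHeadroom": [
        "Dedicated endpoint: 108 * 32 * 0.78 = ~2696 accepted tokens/sec"
      ],
      "reading": "The row with the lowest per-call cost (serverless at 0.046 USD) carries the highest cost per accepted output once review fallout is priced in, which is the failure mode the quality-adjusted-cost lever guards against."
    },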
    "templateRows": [
      {
        "route": "Managed frontier API",
        "provider": "OpenAI",
        "model": "frontier-reasoning-model-id",
        "workload": "coding-agent-maintenance",
        "inputTokens": 6200,
        "outputTokens": 1200,
        "cacheHitRate": 0.35,
        "batchEligible": false,
        "concurrency": 4,
        "ttftMs": 1900,
        "latencyP50Ms": 14800,
        "latencyP95Ms": 28200,
        "outputTokensPerSecond": 48,
        "acceptedOutputRate": 0.84,
        "costUsd": 0.215,
        "reviewMinutes": 3.5
      },
      {
        "route": "Hosted open-weight serverless",
        "provider": "Together AI",
        "model": "open-weight-model-id",
        "workload": "invoice-review",
        "inputTokens": 4100,
        "outputTokens": 720,
        "cacheHitRate": 0.10,
        "batchEligible": true,
        "concurrency": 16,
        "ttftMs": 850,
        "latencyP50Ms": 6200,
        "latencyP95Ms": 14900,
        "outputTokensPerSecond": 73,
        "acceptedOutputRate": 0.72,
        "costUsd": 0.046,
        "reviewMinutes": 5.0
      },
      {
        "route": "Dedicated endpoint",
        "provider": "Fireworks",
        "model": "dedicated-open-model-id",
        "workload": "support-policy",
        "inputTokens": 2800,
        "outputTokens": 520,
        "cacheHitRate": 0.55,
        "batchEligible": false,
        "concurrency": 32,
        "ttftMs": 420,
        "latencyP50Ms": 3100,
        "latencyP95Ms": 8800,
        "outputTokensPerSecond": 108,
        "acceptedOutputRate": 0.78,
        "costUsd": 0.032,
        "reviewMinutes": 2.8
      }
    ],
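    "csvImportSketch": {
      "note": "Assumed header row for trace-template.csv, derived from the fields list above; verify against the artifact before importing.",
      "header": "route,provider,model,workload,inputTokens,outputTokens,cacheHitRate,batchEligible,concurrency,ttftMs,latencyP50Ms,latencyP95Ms,outputTokensPerSecond,acceptedOutputRate,costUsd,reviewMinutes"
    },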
    "artifacts": {
      "runbook": "/reports/inference-trace-kit/runbook.md",
      "csvTemplate": "/reports/inference-trace-kit/trace-template.csv",
      "schema": "/reports/inference-trace-kit/trace-schema.json",
      "manifest": "/reports/inference-trace-kit/manifest.json"
    }
  },
  "recommendedMetric": "Cost per accepted output at target p95 latency, reported with cache hit rate, batch eligibility, retry rate, utilization, and fallback route."
}
