{
  "slug": "circuit-tracing",
  "title": "Circuit Tracing",
  "role": "Build a graph-like hypothesis for how features and components combine to produce behavior.",
  "operatorQuestion": "What path connects input features, intermediate features, and the final behavior the audit cares about?",
  "evidenceToCollect": [
    "Attribution graph",
    "Feature-to-feature edges",
    "Intervention checks",
    "Behavioral case study"
  ],
  "usefulOutput": "Circuit audit note with a graph, the behavior it explains, causal tests, and open limits.",
  "limits": "Circuit traces are targeted explanations, not complete model maps. They should explain a specific behavior under a defined task distribution.",
  "sourceTrail": [
    [
      "Circuit Tracing: Revealing Computational Graphs in Language Models",
      "https://transformer-circuits.pub/2025/attribution-graphs/methods.html"
    ],
    [
      "Tracing the Thoughts of a Large Language Model",
      "https://www.anthropic.com/research/tracing-thoughts-language-model"
    ],
    [
      "Anthropic Responsible Scaling Policy",
      "https://www.anthropic.com/responsible-scaling-policy"
    ]
  ],
  "readingTime": "8 min read",
  "explainerPath": "/articles/mechanistic/circuit-tracing",
  "reportPath": "/reports/mechanistic-series/circuit-tracing.md",
  "jsonPath": "/reports/mechanistic-series/circuit-tracing.json",
  "summary": "Circuit Tracing matters when a deployment owner needs more than a pass/fail eval. It gives the audit team a way to collect internal evidence, test a causal hypothesis, and state the limits before changing a production control.",
  "fieldGuide": [
    {
      "heading": "What the method is for",
      "body": "Build a graph-like hypothesis for how features and components combine to produce behavior."
    },
    {
      "heading": "The operator question",
      "body": "What path connects input features, intermediate features, and the final behavior the audit cares about?"
    },
    {
      "heading": "What the audit should produce",
      "body": "Circuit audit note with a graph, the behavior it explains, causal tests, and open limits."
    },
    {
      "heading": "Where the method's scope ends",
      "body": "Circuit traces are targeted explanations, not complete model maps. They should explain a specific behavior under a defined task distribution."
    }
  ],
  "visualRows": [
    {
      "label": "Attribution graph",
      "value": 100,
      "note": "Must be collected before the interpretation is trusted."
    },
    {
      "label": "Feature-to-feature edges",
      "value": 87,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    },
    {
      "label": "Intervention checks",
      "value": 74,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    },
    {
      "label": "Behavioral case study",
      "value": 61,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    }
  ],
  "auditTemplate": [
    [
      "Behavior under review",
      "One narrow failure, policy behavior, shortcut, or refusal pattern."
    ],
    [
      "Candidate mechanism",
      "The hypothesized path connecting input features, intermediate features, and the final behavior the audit cares about."
    ],
    [
      "Evidence packet",
      "Attribution graph; Feature-to-feature edges; Intervention checks; Behavioral case study"
    ],
    [
      "Decision boundary",
      "What can change in production if the causal claim survives review."
    ],
    [
      "Limit memo",
      "Circuit traces are targeted explanations, not complete model maps. They should explain a specific behavior under a defined task distribution."
    ]
  ]
}
