{
  "slug": "activation-patching",
  "title": "Activation Patching",
  "role": "Test whether a candidate internal component changes behavior when swapped or ablated.",
  "operatorQuestion": "Does changing the suspected feature, layer, head, or residual stream state change the model answer in the predicted direction?",
  "evidenceToCollect": [
    "Clean/corrupted prompt pair",
    "Patch target",
    "Metric before and after patch",
    "Random-feature baseline"
  ],
  "usefulOutput": "Causal test memo showing effect size, negative controls, and cases where the hypothesis fails.",
  "limits": "Patch results can be brittle, distributed mechanisms can be missed, and prompt-pair design can bias the finding.",
  "sourceTrail": [
    [
      "Transformer Circuits thread",
      "https://transformer-circuits.pub/"
    ],
    [
      "How Does Chain of Thought Think?",
      "https://ojs.aaai.org/index.php/AAAI/article/view/40281"
    ],
    [
      "Towards Automated Circuit Discovery for Mechanistic Interpretability",
      "https://arxiv.org/abs/2304.14997"
    ]
  ],
  "readingTime": "8 min read",
  "explainerPath": "/articles/mechanistic/activation-patching",
  "reportPath": "/reports/mechanistic-series/activation-patching.md",
  "jsonPath": "/reports/mechanistic-series/activation-patching.json",
  "summary": "Activation Patching matters when a deployment owner needs more than a pass/fail eval. It gives the audit team a way to collect internal evidence, test a causal hypothesis, and state the limits before changing a production control.",
  "fieldGuide": [
    {
      "heading": "What the method is for",
      "body": "Test whether a candidate internal component changes behavior when swapped or ablated."
    },
    {
      "heading": "The operator question",
      "body": "Does changing the suspected feature, layer, head, or residual stream state change the model answer in the predicted direction?"
    },
    {
      "heading": "What the audit should produce",
      "body": "Causal test memo showing effect size, negative controls, and cases where the hypothesis fails."
    },
    {
      "heading": "Where the method fails",
      "body": "Patch results can be brittle, distributed mechanisms can be missed, and prompt-pair design can bias the finding."
    }
  ],
  "visualRows": [
    {
      "label": "Clean/corrupted prompt pair",
      "value": 100,
      "note": "Must be collected before the interpretation is trusted."
    },
    {
      "label": "Patch target",
      "value": 87,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    },
    {
      "label": "Metric before and after patch",
      "value": 74,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    },
    {
      "label": "Random-feature baseline",
      "value": 61,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    }
  ],
  "auditTemplate": [
    [
      "Behavior under review",
      "One narrow failure, policy behavior, shortcut, or refusal pattern."
    ],
    [
      "Candidate mechanism",
      "The suspected feature, layer, head, or residual stream state, and the direction in which patching it is predicted to change the model answer."
    ],
    [
      "Evidence packet",
      "Clean/corrupted prompt pair; Patch target; Metric before and after patch; Random-feature baseline"
    ],
    [
      "Decision boundary",
      "What can change in production if the causal claim survives review."
    ],
    [
      "Limit memo",
      "Patch results can be brittle, distributed mechanisms can be missed, and prompt-pair design can bias the finding."
    ]
  ]
}