{
  "slug": "audit-limitations",
  "title": "Audit Limitations",
  "role": "Prevent interpretability findings from being oversold as proof of safety or correctness.",
  "operatorQuestion": "What would make this interpretation fail, and what decision is still unsupported?",
  "evidenceToCollect": [
    "Counterexamples",
    "Feature consistency checks",
    "Random-model or random-feature baselines",
    "Out-of-domain tests"
  ],
  "usefulOutput": "Limit memo that states what the audit proves, what it does not prove, and what control should remain in production.",
  "limits": "SAEs can surface patterns in unexpected places, including random or weakly related systems, so operational claims need baselines and decision boundaries.",
  "sourceTrail": [
    [
      "Sparse Autoencoders Can Interpret Randomly Initialized Transformers",
      "https://arxiv.org/abs/2501.17727"
    ],
    [
      "Position: Mechanistic Interpretability Should Prioritize Feature Consistency in SAEs",
      "https://huggingface.co/papers/2505.20254"
    ],
    [
      "Anthropic Responsible Scaling Policy",
      "https://www.anthropic.com/responsible-scaling-policy"
    ]
  ],
  "readingTime": "8 min read",
  "explainerPath": "/articles/mechanistic/audit-limitations",
  "reportPath": "/reports/mechanistic-series/audit-limitations.md",
  "jsonPath": "/reports/mechanistic-series/audit-limitations.json",
  "summary": "Audit Limitations matters when a deployment owner needs more than a pass/fail evaluation. It gives the audit team a way to collect internal evidence, test a causal hypothesis, and state the limits of that evidence before changing a production control.",
  "fieldGuide": [
    {
      "heading": "What the method is for",
      "body": "Prevent interpretability findings from being oversold as proof of safety or correctness."
    },
    {
      "heading": "The operator question",
      "body": "What would make this interpretation fail, and what decision is still unsupported?"
    },
    {
      "heading": "What the audit should produce",
      "body": "Limit memo that states what the audit proves, what it does not prove, and what control should remain in production."
    },
    {
      "heading": "Where the method fails",
      "body": "SAEs can surface patterns in unexpected places, including random or weakly related systems, so operational claims need baselines and decision boundaries."
    }
  ],
  "visualRows": [
    {
      "label": "Counterexamples",
      "value": 100,
      "note": "Must be collected before the interpretation is trusted."
    },
    {
      "label": "Feature consistency checks",
      "value": 87,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    },
    {
      "label": "Random-model or random-feature baselines",
      "value": 74,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    },
    {
      "label": "Out-of-domain tests",
      "value": 61,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    }
  ],
  "auditTemplate": [
    [
      "Behavior under review",
      "One narrow failure, policy behavior, shortcut, or refusal pattern."
    ],
    [
      "Candidate mechanism",
      "The interpretation being tested, stated so it can fail: what would make this interpretation fail, and what decision is still unsupported?"
    ],
    [
      "Evidence packet",
      "Counterexamples; Feature consistency checks; Random-model or random-feature baselines; Out-of-domain tests"
    ],
    [
      "Decision boundary",
      "What can change in production if the causal claim survives review."
    ],
    [
      "Limit memo",
      "SAEs can surface patterns in unexpected places, including random or weakly related systems, so operational claims need baselines and decision boundaries."
    ]
  ]
}