{
  "slug": "sparse-autoencoders",
  "title": "Sparse Autoencoders",
  "role": "Turn dense activations into sparse feature dictionaries that analysts can inspect.",
  "operatorQuestion": "Which recurring features activate during the failure, refusal, shortcut, or policy behavior?",
  "evidenceToCollect": [
    "Activation dataset",
    "SAE dictionary health",
    "Top activating examples",
    "Feature labels with counterexamples"
  ],
  "usefulOutput": "Feature dashboard with example prompts, activation histograms, analyst labels, and unresolved ambiguity.",
  "limits": "Feature labels are hypotheses. Reliability depends on dictionary quality, feature consistency, and causal follow-up.",
  "sourceTrail": [
    [
      "Towards Monosemanticity",
      "https://transformer-circuits.pub/2023/monosemantic-features/index.html"
    ],
    [
      "Scaling Monosemanticity",
      "https://transformer-circuits.pub/2024/scaling-monosemanticity/index.html"
    ],
    [
      "Feature consistency in SAEs",
      "https://huggingface.co/papers/2505.20254"
    ]
  ],
  "readingTime": "8 min read",
  "explainerPath": "/articles/mechanistic/sparse-autoencoders",
  "reportPath": "/reports/mechanistic-series/sparse-autoencoders.md",
  "jsonPath": "/reports/mechanistic-series/sparse-autoencoders.json",
  "summary": "Sparse autoencoders matter when a deployment owner needs more than a pass/fail eval. They give the audit team a way to collect internal evidence, test a causal hypothesis, and state the limits before changing a production control.",
  "fieldGuide": [
    {
      "heading": "What the method is for",
      "body": "Turn dense activations into sparse feature dictionaries that analysts can inspect."
    },
    {
      "heading": "The operator question",
      "body": "Which recurring features activate during the failure, refusal, shortcut, or policy behavior?"
    },
    {
      "heading": "What the audit should produce",
      "body": "Feature dashboard with example prompts, activation histograms, analyst labels, and unresolved ambiguity."
    },
    {
      "heading": "Where the method fails",
      "body": "Feature labels are hypotheses. Reliability depends on dictionary quality, feature consistency, and causal follow-up."
    }
  ],
  "visualRows": [
    {
      "label": "Activation dataset",
      "value": 100,
      "note": "Must be collected before the interpretation is trusted."
    },
    {
      "label": "SAE dictionary health",
      "value": 87,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    },
    {
      "label": "Top activating examples",
      "value": 74,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    },
    {
      "label": "Feature labels with counterexamples",
      "value": 61,
      "note": "Use as supporting evidence and keep unresolved ambiguity visible."
    }
  ],
  "auditTemplate": [
    [
      "Behavior under review",
      "One narrow failure, policy behavior, shortcut, or refusal pattern."
    ],
    [
      "Candidate mechanism",
      "Which recurring features activate during the failure, refusal, shortcut, or policy behavior?"
    ],
    [
      "Evidence packet",
      "Activation dataset; SAE dictionary health; Top activating examples; Feature labels with counterexamples"
    ],
    [
      "Decision boundary",
      "What can change in production if the causal claim survives review."
    ],
    [
      "Limit memo",
      "Feature labels are hypotheses. Reliability depends on dictionary quality, feature consistency, and causal follow-up."
    ]
  ]
}
