2 changes: 2 additions & 0 deletions config.default.toml
@@ -16,10 +16,12 @@ dtypes = [
 ]
 
 # Device map to pass to Accelerate when loading the model.
+# Use "cpu" for CPU-only systems without GPU support.
 device_map = "auto"
 
 # Quantization method to use when loading the model.
 # Options: "none" (no quantization), "bnb_4bit" (4-bit quantization using bitsandbytes).
+# Note: 4-bit quantization requires bitsandbytes, which needs CUDA support.

Owner commented:

That contradicts the bnb docs: https://huggingface.co/docs/transformers/en/quantization/bitsandbytes

    bitsandbytes is supported on NVIDIA GPUs for CUDA versions 11.8 - 13.0, Intel XPU, Intel Gaudi (HPU), and CPU.

There are several mentions on that page of CPU-only usage.

Contributor replied:

Note that ROCm-via-HIP is also (partially) supported: https://github.com/bitsandbytes-foundation/bitsandbytes#legend

But yeah it seems like CPUs should be supported as well, except for "8-bit Optimizers" (whatever that means).

quantization = "none"

# Memory limits to impose. 0 is usually your first graphics card.
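
The two new comment lines together describe the CPU-only path. A minimal sketch of such a configuration, using only keys that appear in this file (values are illustrative, not the shipped defaults):

    # CPU-only setup: no GPU device placement, no bitsandbytes quantization.
    device_map = "cpu"
    quantization = "none"
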
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -23,7 +23,8 @@ classifiers = [
 ]
 dependencies = [
     "accelerate>=1.10.0",
-    "bitsandbytes>=0.45.0",
+    # bitsandbytes requires CUDA or Apple Silicon. Exclude Intel Macs where it won't work.
+    "bitsandbytes>=0.45.0; (platform_system != 'Darwin') or (platform_machine == 'arm64')",
     "datasets>=4.0.0",
     "hf-transfer>=0.1.9",
     "huggingface-hub>=0.34.4",
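
The new environment marker can be sanity-checked with the packaging library, which implements the same PEP 508 evaluation pip applies at install time. A quick sketch; the environment dicts here are illustrative:

    from packaging.markers import Marker

    marker = Marker("(platform_system != 'Darwin') or (platform_machine == 'arm64')")

    # Intel Mac: marker is False, so pip skips bitsandbytes.
    print(marker.evaluate({"platform_system": "Darwin", "platform_machine": "x86_64"}))

    # Apple Silicon Mac: marker is True, so bitsandbytes is installed.
    print(marker.evaluate({"platform_system": "Darwin", "platform_machine": "arm64"}))

    # Linux: True regardless of architecture.
    print(marker.evaluate({"platform_system": "Linux", "platform_machine": "x86_64"}))
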
16 changes: 15 additions & 1 deletion src/heretic/model.py
@@ -6,8 +6,16 @@
 from dataclasses import dataclass
 from typing import Any, cast
 
-import bitsandbytes as bnb
 import torch
+
+# bitsandbytes is optional - only available on systems with CUDA support.
+try:
+    import bitsandbytes as bnb
+
+    HAS_BITSANDBYTES = True
+except ImportError:
+    bnb = None  # type: ignore[assignment]
+    HAS_BITSANDBYTES = False
 import torch.nn.functional as F
 from peft import LoraConfig, PeftModel, get_peft_model
 from peft.tuners.lora.layer import Linear
@@ -181,6 +189,12 @@ def _get_quantization_config(self, dtype: str) -> BitsAndBytesConfig | None:
         BitsAndBytesConfig or None
         """
         if self.settings.quantization == QuantizationMethod.BNB_4BIT:
+            if not HAS_BITSANDBYTES:
+                raise RuntimeError(
+                    "4-bit quantization requires bitsandbytes, which is not available. "
+                    "Install it with 'pip install bitsandbytes' (requires CUDA) "
+                    "or set quantization = 'none' in your config."
+                )
             # BitsAndBytesConfig expects a torch.dtype, not a string.
             if dtype == "auto":
                 compute_dtype = torch.bfloat16
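
Taken together, the two hunks follow the standard optional-dependency pattern: attempt the import once at module scope, record the outcome in a flag, and raise an actionable error only on the code path that actually needs the package. A stripped-down, standalone sketch of the same pattern (function name and return value are illustrative, not from this codebase):

    try:
        import bitsandbytes as bnb

        HAS_BITSANDBYTES = True
    except ImportError:
        bnb = None  # type: ignore[assignment]
        HAS_BITSANDBYTES = False

    def build_quant_config(method: str):
        # Only the 4-bit path needs bitsandbytes; "none" works on any platform.
        if method == "bnb_4bit":
            if not HAS_BITSANDBYTES:
                raise RuntimeError("bitsandbytes is not installed; set quantization = 'none'")
            return {"load_in_4bit": True}
        return None

The error is deliberately deferred: users who keep quantization = "none" never pay for the missing dependency, while users who request bnb_4bit get a message telling them exactly how to fix it.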