2 changes: 2 additions & 0 deletions config.default.toml
@@ -16,10 +16,12 @@ dtypes = [
 ]
 
 # Device map to pass to Accelerate when loading the model.
+# Use "cpu" for CPU-only systems without GPU support.
 device_map = "auto"
 
 # Quantization method to use when loading the model.
 # Options: "none" (no quantization), "bnb_4bit" (4-bit quantization using bitsandbytes).
+# Note: 4-bit quantization requires bitsandbytes, which needs CUDA support.

Owner commented:

That contradicts the bnb docs: https://huggingface.co/docs/transformers/en/quantization/bitsandbytes

    bitsandbytes is supported on NVIDIA GPUs for CUDA versions 11.8 - 13.0, Intel XPU, Intel Gaudi (HPU), and CPU.

There are several mentions on that page of CPU-only usage.

Contributor replied:

Note that ROCm-via-HIP is also (partially) supported: https://github.com/bitsandbytes-foundation/bitsandbytes#legend

But yeah it seems like CPUs should be supported as well, except for "8-bit Optimizers" (whatever that means).

quantization = "none"

# Memory limits to impose. 0 is usually your first graphics card.
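
The two new comment lines together describe the CPU-only path. A minimal sketch of such a configuration, using only keys that appear in this file (values are illustrative, not the shipped defaults):

    # CPU-only setup: no GPU device placement, no bitsandbytes quantization.
    device_map = "cpu"
    quantization = "none"
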
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -23,7 +23,8 @@ classifiers = [
 ]
 dependencies = [
     "accelerate>=1.10.0",
-    "bitsandbytes>=0.45.0",
+    # bitsandbytes requires CUDA or Apple Silicon. Exclude Intel Macs where it won't work.
+    "bitsandbytes>=0.45.0; (platform_system != 'Darwin') or (platform_machine == 'arm64')",
     "datasets>=4.0.0",
     "hf-transfer>=0.1.9",
     "huggingface-hub>=0.34.4",
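
The new environment marker can be sanity-checked with the packaging library, which implements the same PEP 508 evaluation pip applies at install time. A quick sketch; the environment dicts here are illustrative:

    from packaging.markers import Marker

    marker = Marker("(platform_system != 'Darwin') or (platform_machine == 'arm64')")

    # Intel Mac: marker is False, so pip skips bitsandbytes.
    print(marker.evaluate({"platform_system": "Darwin", "platform_machine": "x86_64"}))

    # Apple Silicon Mac: marker is True, so bitsandbytes is installed.
    print(marker.evaluate({"platform_system": "Darwin", "platform_machine": "arm64"}))

    # Linux: True regardless of architecture.
    print(marker.evaluate({"platform_system": "Linux", "platform_machine": "x86_64"}))
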
16 changes: 15 additions & 1 deletion src/heretic/model.py
@@ -6,8 +6,16 @@
 from dataclasses import dataclass
 from typing import Any, cast
 
-import bitsandbytes as bnb
 import torch
+
+# bitsandbytes is optional - only available on systems with CUDA support.
+try:
+    import bitsandbytes as bnb
+
+    HAS_BITSANDBYTES = True
+except ImportError:
+    bnb = None  # type: ignore[assignment]
+    HAS_BITSANDBYTES = False
 import torch.nn.functional as F
 from peft import LoraConfig, PeftModel, get_peft_model
 from peft.tuners.lora.layer import Linear
@@ -181,6 +189,12 @@ def _get_quantization_config(self, dtype: str) -> BitsAndBytesConfig | None:
         BitsAndBytesConfig or None
         """
         if self.settings.quantization == QuantizationMethod.BNB_4BIT:
+            if not HAS_BITSANDBYTES:
+                raise RuntimeError(
+                    "4-bit quantization requires bitsandbytes, which is not available. "
+                    "Install it with 'pip install bitsandbytes' (requires CUDA) "
+                    "or set quantization = 'none' in your config."
+                )
             # BitsAndBytesConfig expects a torch.dtype, not a string.
             if dtype == "auto":
                 compute_dtype = torch.bfloat16
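
Taken together, the two hunks follow the standard optional-dependency pattern: attempt the import once at module scope, record the outcome in a flag, and raise an actionable error only on the code path that actually needs the package. A stripped-down, standalone sketch of the same pattern (function name and return value are illustrative, not from this codebase):

    try:
        import bitsandbytes as bnb

        HAS_BITSANDBYTES = True
    except ImportError:
        bnb = None  # type: ignore[assignment]
        HAS_BITSANDBYTES = False

    def build_quant_config(method: str):
        # Only the 4-bit path needs bitsandbytes; "none" works on any platform.
        if method == "bnb_4bit":
            if not HAS_BITSANDBYTES:
                raise RuntimeError("bitsandbytes is not installed; set quantization = 'none'")
            return {"load_in_4bit": True}
        return None

The error is deliberately deferred: users who keep quantization = "none" never pay for the missing dependency, while users who request bnb_4bit get a message telling them exactly how to fix it.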