convert-hf-to-powerinfer-gguf.py (60 additions, 0 deletions)
@@ -185,6 +185,8 @@ def from_model_architecture(model_architecture):
return FalconModel
if model_architecture == "LlamaForCausalLM":
return LlamaModel
if model_architecture == "OPTForCausalLM":
return OptModel

raise NotImplementedError(f'Architecture "{model_architecture}" not supported!')

@@ -218,6 +220,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
return gguf.MODEL_ARCH.FALCON
if arch == "RWForCausalLM" or arch == "LlamaForCausalLM":
return gguf.MODEL_ARCH.LLAMA
if arch == "OPTForCausalLM":
return gguf.MODEL_ARCH.OPT

raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -513,7 +517,63 @@ def write_tensors(self):

self.gguf_writer.add_tensor(new_name, data)

class OptModel(Model):
def set_gguf_parameters(self, params: PredictorParams):
self.gguf_writer.add_name("opt")
self.gguf_writer.add_context_length(2050) # max_position_embeddings (2048) + OPT's learned position-embedding offset of 2; not in config.json
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
self.gguf_writer.add_feed_forward_length(self.hparams["ffn_dim"])
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
# self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
self.gguf_writer.add_file_type(self.ftype)

if params.sparse_threshold is not None:
self.gguf_writer.add_sparse_threshold(params.sparse_threshold)

def write_tensors(self):
for name, data_torch in self.get_tensors():
old_dtype = data_torch.dtype

# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)

data = data_torch.squeeze().numpy()

# map tensor names
new_name = self._translate_tensor_key(name)
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()

# We transpose the weight matrices of the FFN down layers so that PowerInfer's Axpy
# operation can consume them directly, without transposing at runtime (see the sketch
# after this file's diff).
if "ffn_down" in new_name:
new_name = new_name.replace("ffn_down", "ffn_down_t")
data = data.T

n_dims = len(data.shape)
data_dtype = data.dtype

# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if (
self.ftype == 1
and data_dtype == np.float32
and name.endswith(".weight")
and n_dims == 2
):
data = data.astype(np.float16)

print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

self.gguf_writer.add_tensor(new_name, data)

@dataclass
class PredictorParams:
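Note on the ffn_down_t transposition above: with a sparsity predictor, only a small
subset of the FFN's intermediate neurons is active for a given token, so the down
projection can be computed as one Axpy (scaled-vector accumulation) per active neuron
instead of a dense matmul, and storing the matrix transposed makes each neuron's
contribution a contiguous row. Below is a minimal NumPy sketch of that idea, not
PowerInfer's actual kernel; the sizes match OPT-1.3B (hidden_size 2048, ffn_dim 8192)
and the 256-neuron active set is made up for illustration.

    import numpy as np

    n_embd, n_ff = 2048, 8192                 # OPT-1.3B: hidden_size, ffn_dim
    rng = np.random.default_rng(0)

    # Transposed storage, matching ffn_down_t: shape (n_ff, n_embd)
    ffn_down_t = rng.standard_normal((n_ff, n_embd), dtype=np.float32)

    # Sparse intermediate activations: only a few neurons fire
    h = np.zeros(n_ff, dtype=np.float32)
    active = rng.choice(n_ff, size=256, replace=False)
    h[active] = rng.standard_normal(active.size).astype(np.float32)

    # Axpy-style accumulation over active neurons only: y += h[i] * row_i
    y = np.zeros(n_embd, dtype=np.float32)
    for i in active:
        y += h[i] * ffn_down_t[i]

    # Matches the dense down-projection W_down @ h
    assert np.allclose(y, ffn_down_t.T @ h, atol=1e-4)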
gguf-py/gguf/constants.py (16 additions, 1 deletion)
@@ -90,6 +90,7 @@ class MODEL_ARCH(IntEnum):
GPT2 = auto()
GPTJ = auto()
GPTNEOX = auto()
OPT = auto()
MPT = auto()
STARCODER = auto()
PERSIMMON = auto()
@@ -135,6 +136,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.OPT: "opt",
MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.STARCODER: "starcoder",
MODEL_ARCH.PERSIMMON: "persimmon",
@@ -356,7 +358,20 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.GPT2: [
# TODO
],
# TODO
MODEL_ARCH.OPT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
}

# tensors that will not be serialized
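For reference, under the standard name templates in gguf-py (token_embd, position_embd,
output_norm, output, blk.{bid}.*), the OPT entry above corresponds to the serialized
tensor names sketched below. The list is written out by hand for illustration rather
than read from the library, so treat the exact templates as an assumption; bias tensors
use the same stems with a .bias suffix.

    # Hypothetical name templates implied by the OPT tensor list above
    opt_tensor_templates = [
        "token_embd.weight",             # TOKEN_EMBD
        "position_embd.weight",          # POS_EMBD
        "output_norm.weight",            # OUTPUT_NORM
        "output.weight",                 # OUTPUT
        "blk.{bid}.attn_norm.weight",    # ATTN_NORM
        "blk.{bid}.attn_q.weight",       # ATTN_Q
        "blk.{bid}.attn_k.weight",       # ATTN_K
        "blk.{bid}.attn_v.weight",       # ATTN_V
        "blk.{bid}.attn_output.weight",  # ATTN_OUT
        "blk.{bid}.ffn_norm.weight",     # FFN_NORM
        "blk.{bid}.ffn_down.weight",     # FFN_DOWN (written as ffn_down_t by the converter)
        "blk.{bid}.ffn_up.weight",       # FFN_UP
    ]

    for name in opt_tensor_templates:
        print(name.format(bid=0))        # names for the first decoder block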
gguf-py/gguf/tensor_mapping.py (11 additions, 0 deletions)
@@ -11,6 +11,7 @@ class TensorNameMap:
MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 gpt-j mpt refact
"decoder.embed_tokens", # opt
"transformer.word_embeddings", # falcon
"word_embeddings", # bloom
"model.embed_tokens", # llama-hf
@@ -33,6 +34,7 @@ class TensorNameMap:
MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2
"embeddings.position_embeddings", # bert
"decoder.embed_positions", # opt
),

# Output
@@ -47,6 +49,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon
"decoder.final_layer_norm", # opt
"model.norm", # llama-hf baichuan
"norm", # llama-pth
"embeddings.LayerNorm", # bert
@@ -66,6 +69,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
"decoder.layers.{bid}.self_attn_layer_norm", # opt
"transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom
@@ -98,6 +102,7 @@ class TensorNameMap:
"layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j
"decoder.layers.{bid}.self_attn.q_proj", # opt
),

# Attention key
@@ -106,6 +111,7 @@ class TensorNameMap:
"layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j
"decoder.layers.{bid}.self_attn.k_proj", # opt
),

# Attention value
@@ -114,12 +120,14 @@ class TensorNameMap:
"layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j
"decoder.layers.{bid}.self_attn.v_proj", # opt
),

# Attention output
MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", # gptneox
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
"decoder.layers.{bid}.self_attn.out_proj", # opt
"transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom
@@ -140,6 +148,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2 refact
"decoder.layers.{bid}.final_layer_norm", # opt
"h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf
@@ -153,6 +162,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
"transformer.h.{bid}.mlp.c_fc", # gpt2
"decoder.layers.{bid}.fc1", # opt
"transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom
@@ -173,6 +183,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
"decoder.layers.{bid}.fc2", # opt
"transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom
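Taken together with the constants above, these mappings let the converter resolve
Hugging Face OPT checkpoint names to GGUF names. A small usage sketch, assuming the
upstream gguf-py helpers get_tensor_name_map and TensorNameMap.get_name(key,
try_suffixes=...) are available in this fork and that the patch above is applied:

    import gguf

    # 24 decoder layers, e.g. OPT-1.3B
    tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.OPT, 24)

    for hf_name in (
        "decoder.embed_tokens.weight",
        "decoder.layers.0.self_attn.q_proj.weight",
        "decoder.layers.0.fc2.weight",
    ):
        stem = tmap.get_name(hf_name, try_suffixes=(".weight", ".bias"))
        print(f"{hf_name} -> {stem}.weight")

With the table above, decoder.layers.0.fc2.weight should resolve to the ffn_down stem
(blk.0.ffn_down under the standard templates), which write_tensors then rewrites to
ffn_down_t and stores transposed, as shown in the converter diff.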