convert-hf-to-powerinfer-gguf.py (60 additions, 0 deletions)
@@ -185,6 +185,8 @@ def from_model_architecture(model_architecture):
return FalconModel
if model_architecture == "LlamaForCausalLM":
return LlamaModel
if model_architecture == "OPTForCausalLM":
return OptModel

raise NotImplementedError(f'Architecture "{model_architecture}" not supported!')

@@ -218,6 +220,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
return gguf.MODEL_ARCH.FALCON
if arch == "RWForCausalLM" or arch == "LlamaForCausalLM":
return gguf.MODEL_ARCH.LLAMA
if arch == "OPTForCausalLM":
return gguf.MODEL_ARCH.OPT

raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -513,7 +517,63 @@ def write_tensors(self):

self.gguf_writer.add_tensor(new_name, data)

class OptModel(Model):
def set_gguf_parameters(self, params: PredictorParams):
self.gguf_writer.add_name("opt")
self.gguf_writer.add_context_length(2050) # max_position_embeddings (2048) + OPT's learned position-embedding offset of 2; not in config.json
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
self.gguf_writer.add_feed_forward_length(self.hparams["ffn_dim"])
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
# self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
self.gguf_writer.add_file_type(self.ftype)

if params.sparse_threshold is not None:
self.gguf_writer.add_sparse_threshold(params.sparse_threshold)

def write_tensors(self):
for name, data_torch in self.get_tensors():
old_dtype = data_torch.dtype

# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)

data = data_torch.squeeze().numpy()

# map tensor names
new_name = self._translate_tensor_key(name)
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()

# We transpose the weight matrices of the FFN down layers so that PowerInfer's Axpy
# operation can consume them directly, without transposing at runtime (see the sketch
# after this file's diff).
if "ffn_down" in new_name:
new_name = new_name.replace("ffn_down", "ffn_down_t")
data = data.T

n_dims = len(data.shape)
data_dtype = data.dtype

# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if (
self.ftype == 1
and data_dtype == np.float32
and name.endswith(".weight")
and n_dims == 2
):
data = data.astype(np.float16)

print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

self.gguf_writer.add_tensor(new_name, data)

@dataclass
class PredictorParams:
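Note on the ffn_down_t transposition above: with a sparsity predictor, only a small
subset of the FFN's intermediate neurons is active for a given token, so the down
projection can be computed as one Axpy (scaled-vector accumulation) per active neuron
instead of a dense matmul, and storing the matrix transposed makes each neuron's
contribution a contiguous row. Below is a minimal NumPy sketch of that idea, not
PowerInfer's actual kernel; the sizes match OPT-1.3B (hidden_size 2048, ffn_dim 8192)
and the 256-neuron active set is made up for illustration.

    import numpy as np

    n_embd, n_ff = 2048, 8192                 # OPT-1.3B: hidden_size, ffn_dim
    rng = np.random.default_rng(0)

    # Transposed storage, matching ffn_down_t: shape (n_ff, n_embd)
    ffn_down_t = rng.standard_normal((n_ff, n_embd), dtype=np.float32)

    # Sparse intermediate activations: only a few neurons fire
    h = np.zeros(n_ff, dtype=np.float32)
    active = rng.choice(n_ff, size=256, replace=False)
    h[active] = rng.standard_normal(active.size).astype(np.float32)

    # Axpy-style accumulation over active neurons only: y += h[i] * row_i
    y = np.zeros(n_embd, dtype=np.float32)
    for i in active:
        y += h[i] * ffn_down_t[i]

    # Matches the dense down-projection W_down @ h
    assert np.allclose(y, ffn_down_t.T @ h, atol=1e-4)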
gguf-py/gguf/constants.py (16 additions, 1 deletion)
@@ -90,6 +90,7 @@ class MODEL_ARCH(IntEnum):
GPT2 = auto()
GPTJ = auto()
GPTNEOX = auto()
OPT = auto()
MPT = auto()
STARCODER = auto()
PERSIMMON = auto()
@@ -135,6 +136,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.OPT: "opt",
MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.STARCODER: "starcoder",
MODEL_ARCH.PERSIMMON: "persimmon",
@@ -356,7 +358,20 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.GPT2: [
# TODO
],
# TODO
MODEL_ARCH.OPT: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.POS_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
}

# tensors that will not be serialized
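For reference, under the standard name templates in gguf-py (token_embd, position_embd,
output_norm, output, blk.{bid}.*), the OPT entry above corresponds to the serialized
tensor names sketched below. The list is written out by hand for illustration rather
than read from the library, so treat the exact templates as an assumption; bias tensors
use the same stems with a .bias suffix.

    # Hypothetical name templates implied by the OPT tensor list above
    opt_tensor_templates = [
        "token_embd.weight",             # TOKEN_EMBD
        "position_embd.weight",          # POS_EMBD
        "output_norm.weight",            # OUTPUT_NORM
        "output.weight",                 # OUTPUT
        "blk.{bid}.attn_norm.weight",    # ATTN_NORM
        "blk.{bid}.attn_q.weight",       # ATTN_Q
        "blk.{bid}.attn_k.weight",       # ATTN_K
        "blk.{bid}.attn_v.weight",       # ATTN_V
        "blk.{bid}.attn_output.weight",  # ATTN_OUT
        "blk.{bid}.ffn_norm.weight",     # FFN_NORM
        "blk.{bid}.ffn_down.weight",     # FFN_DOWN (written as ffn_down_t by the converter)
        "blk.{bid}.ffn_up.weight",       # FFN_UP
    ]

    for name in opt_tensor_templates:
        print(name.format(bid=0))        # names for the first decoder block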
gguf-py/gguf/tensor_mapping.py (11 additions, 0 deletions)
@@ -11,6 +11,7 @@ class TensorNameMap:
MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 gpt-j mpt refact
"decoder.embed_tokens", # opt
"transformer.word_embeddings", # falcon
"word_embeddings", # bloom
"model.embed_tokens", # llama-hf
@@ -33,6 +34,7 @@ class TensorNameMap:
MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2
"embeddings.position_embeddings", # bert
"decoder.embed_positions", # opt
),

# Output
@@ -47,6 +49,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 gpt-j falcon
"decoder.final_layer_norm", # opt
"model.norm", # llama-hf baichuan
"norm", # llama-pth
"embeddings.LayerNorm", # bert
@@ -66,6 +69,7 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
"decoder.layers.{bid}.self_attn_layer_norm", # opt
"transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom
@@ -98,6 +102,7 @@ class TensorNameMap:
"layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j
"decoder.layers.{bid}.self_attn.q_proj", # opt
),

# Attention key
@@ -106,6 +111,7 @@ class TensorNameMap:
"layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j
"decoder.layers.{bid}.self_attn.k_proj", # opt
),

# Attention value
@@ -114,12 +120,14 @@ class TensorNameMap:
"layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j
"decoder.layers.{bid}.self_attn.v_proj", # opt
),

# Attention output
MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", # gptneox
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
"decoder.layers.{bid}.self_attn.out_proj", # opt
"transformer.blocks.{bid}.attn.out_proj", # mpt
"transformer.h.{bid}.self_attention.dense", # falcon
"h.{bid}.self_attention.dense", # bloom
@@ -140,6 +148,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2 refact
"decoder.layers.{bid}.final_layer_norm", # opt
"h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf
@@ -153,6 +162,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
"transformer.h.{bid}.mlp.c_fc", # gpt2
"decoder.layers.{bid}.fc1", # opt
"transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom
@@ -173,6 +183,7 @@ class TensorNameMap:
MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
"decoder.layers.{bid}.fc2", # opt
"transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom
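Taken together with the constants above, these mappings let the converter resolve
Hugging Face OPT checkpoint names to GGUF names. A small usage sketch, assuming the
upstream gguf-py helpers get_tensor_name_map and TensorNameMap.get_name(key,
try_suffixes=...) are available in this fork and that the patch above is applied:

    import gguf

    # 24 decoder layers, e.g. OPT-1.3B
    tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.OPT, 24)

    for hf_name in (
        "decoder.embed_tokens.weight",
        "decoder.layers.0.self_attn.q_proj.weight",
        "decoder.layers.0.fc2.weight",
    ):
        stem = tmap.get_name(hf_name, try_suffixes=(".weight", ".bias"))
        print(f"{hf_name} -> {stem}.weight")

With the table above, decoder.layers.0.fc2.weight should resolve to the ffn_down stem
(blk.0.ffn_down under the standard templates), which write_tensors then rewrites to
ffn_down_t and stores transposed, as shown in the converter diff.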