From 4e595b250ae55d0fc1bbb56c4b42cc183806fc0f Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 22 Jan 2026 19:24:37 +0100 Subject: [PATCH 1/6] server: do not log certain endpoints (avoid log spam) (#19028) --- tools/server/server-http.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 5d67e5722d1..00897eeea5b 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -28,14 +28,20 @@ server_http_context::server_http_context() server_http_context::~server_http_context() = default; static void log_server_request(const httplib::Request & req, const httplib::Response & res) { - // skip GH copilot requests when using default port - if (req.path == "/v1/health") { + // skip logging requests that are regularly sent, to avoid log spam + if (req.path == "/health" + || req.path == "/v1/health" + || req.path == "/models" + || req.path == "/v1/models" + || req.path == "/props" + || req.path == "/metrics" + ) { return; } // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch - SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); + SRV_INF("done request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); SRV_DBG("request: %s\n", req.body.c_str()); SRV_DBG("response: %s\n", res.body.c_str()); From 9c96465f99e47a3a568c50969ff5c6b672ab2714 Mon Sep 17 00:00:00 2001 From: lhez Date: Thu, 22 Jan 2026 10:29:25 -0800 Subject: [PATCH 2/6] opencl: enable the general fp mm for non-cont input and as a fallback for specialized kqv kernel for adreno (#18970) * opencl: add `copy_to_contiguous` and utilize mm kernels * opencl: only copy to cont for f32 and f16 tensors * opencl: use cont mm for fallback when dst is large * opencl: use nb local to copy-to-cont * opencl: use local offset as well --- ggml/src/ggml-opencl/ggml-opencl.cpp | 179 +++++++++++++++++++++++++-- 1 file changed, 166 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index efdebe2bbaa..27b2761ef1e 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -398,6 +398,7 @@ struct ggml_backend_opencl_context { int adreno_wave_size; cl_bool non_uniform_workgroups; + size_t image_max_buffer_size; cl_context context; cl_command_queue queue; @@ -407,6 +408,10 @@ struct ggml_backend_opencl_context { ggml_cl_buffer prealloc_scales_trans; ggml_cl_buffer prealloc_act_trans; + // prealloc buffers for src0 and src1 + ggml_cl_buffer prealloc_src0; + ggml_cl_buffer prealloc_src1; + cl_program program_add; cl_program program_add_id; cl_program program_clamp; @@ -2658,6 +2663,9 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL); GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024); + clGetDeviceInfo(device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(size_t), &backend_ctx->image_max_buffer_size, NULL); + GGML_LOG_INFO("ggml_opencl: device max image buffer size (pixels): %lu\n", backend_ctx->image_max_buffer_size); + clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &backend_ctx->max_workgroup_size, NULL); GGML_LOG_INFO("ggml_opencl: device max workgroup 
size: %lu\n", backend_ctx->max_workgroup_size); @@ -4711,6 +4719,81 @@ static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct gg (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); } +// Copy a noncontiguous tensor to contiguous tensor. ne[] remains the same but +// nb[] is recalculated such that tensor is contiguous. +static void ggml_cl_copy_to_contiguous(ggml_backend_t backend, const ggml_tensor * src, cl_mem dst, + cl_ulong &nb0, cl_ulong &nb1, cl_ulong &nb2, cl_ulong &nb3) { + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + const int tensor_type_size = ggml_type_size(src->type); + + const int ne00 = src->ne[0]; + const int ne01 = src->ne[1]; + const int ne02 = src->ne[2]; + const int ne03 = src->ne[3]; + + const cl_ulong nb00 = src->nb[0]; + const cl_ulong nb01 = src->nb[1]; + const cl_ulong nb02 = src->nb[2]; + const cl_ulong nb03 = src->nb[3]; + + const int ne0 = src->ne[0]; + const int ne1 = src->ne[1]; + const int ne2 = src->ne[2]; + const int ne3 = src->ne[3]; + + nb0 = tensor_type_size; + nb1 = tensor_type_size*ne00; + nb2 = tensor_type_size*ne00*ne01; + nb3 = tensor_type_size*ne00*ne01*ne02; + + ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *)src->extra; + + cl_ulong offset0 = extra->offset + src->view_offs; + cl_ulong offsetd = 0; + + cl_kernel kernel; + + switch (src->type) { + case GGML_TYPE_F32: + kernel = backend_ctx->kernel_cpy_f32_f32; + break; + case GGML_TYPE_F16: + kernel = backend_ctx->kernel_cpy_f16_f16; + break; + default: + GGML_ASSERT(false && "not implemented"); + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne0)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne1)); + CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne2)); + CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne3)); + CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb0)); + CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1)); + CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2)); + CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3)); + + const int nth = MIN(64, ne00); + + size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; + size_t local_work_size[] = {(size_t)nth, 1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, src); +} + static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { UNUSED(backend); UNUSED(src0); @@ -7724,9 +7807,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co cl_context context = backend_ctx->context; if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){ - if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) { + if (ne01 >= 64 && ne1 >= 
32 && ne00 >= 16 && (ne12 % ne02) == 0 && + // dst is wrapped with image1d_buffer, the size limit applies, also src0 + (ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4 <= backend_ctx->image_max_buffer_size)) { // For KQ if (ggml_is_permuted(src0) && ggml_is_permuted(src1) && + ((nb01 * ne01 / 4)/4 <= backend_ctx->image_max_buffer_size) && nb00 <= nb02 && nb02 <= nb01 && nb01 <= nb03 && @@ -7737,7 +7823,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co return; } // For KQV - if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { + if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && + ((nb02 * ne02 / 4)/4 <= backend_ctx->image_max_buffer_size)) { ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst); return; } @@ -8043,9 +8130,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co // GEMM using local memory // Current BK = 16, so ne00 % 16 == 0 - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - src1t == GGML_TYPE_F32 && + if (src1t == GGML_TYPE_F32 && ne00 % 16 == 0 && ne11 > 1) { switch(src0t) { @@ -8057,10 +8142,42 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co int batch_stride_b = ne10*ne11; int batch_stride_d = ne0*ne1; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + cl_mem mem_src0 = extra0->data_device; + cl_mem mem_src1 = extra1->data_device; + + cl_ulong nb00_cont = nb00; + cl_ulong nb01_cont = nb01; + cl_ulong nb02_cont = nb02; + cl_ulong nb03_cont = nb03; + + cl_ulong nb10_cont = nb10; + cl_ulong nb11_cont = nb11; + cl_ulong nb12_cont = nb12; + cl_ulong nb13_cont = nb13; + + cl_ulong offset0_cont = offset0; + cl_ulong offset1_cont = offset1; + + if (!ggml_is_contiguous(src0)) { + backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0)); + ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer, + nb00_cont, nb01_cont, nb02_cont, nb03_cont); + mem_src0 = backend_ctx->prealloc_src0.buffer; + offset0_cont = 0; + } + + if (!ggml_is_contiguous(src1)) { + backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1)); + ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer, + nb10_cont, nb11_cont, nb12_cont, nb13_cont); + mem_src1 = backend_ctx->prealloc_src1.buffer; + offset1_cont = 0; + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); @@ -8092,10 +8209,42 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co int batch_stride_b = ne10*ne11; int batch_stride_d = ne0*ne1; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); + cl_mem mem_src0 = extra0->data_device; + 
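Both GEMM paths below fall back to ggml_cl_copy_to_contiguous (added earlier in this patch) when an input is non-contiguous: the tensor keeps its ne[] extents while nb[] is recomputed as packed byte strides, and the kernel then reads the preallocated contiguous copy at offset 0. A minimal sketch of that stride recalculation, written in Python for brevity; `contiguous_strides` is an illustrative helper, not a ggml API:

```python
# Mirrors the nb0..nb3 recalculation in ggml_cl_copy_to_contiguous:
# keep the element counts ne[] and derive byte strides for a packed layout.
def contiguous_strides(ne: list[int], type_size: int) -> list[int]:
    nb = [type_size, 0, 0, 0]          # nb[0]: adjacent elements
    for i in range(1, 4):
        nb[i] = nb[i - 1] * ne[i - 1]  # each dim strides over the dims below it
    return nb

# Example: an F32 tensor (4-byte elements) with ne = [64, 32, 4, 1]
# gets nb = [4, 256, 8192, 32768], i.e. a fully packed layout.
print(contiguous_strides([64, 32, 4, 1], 4))
```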
cl_mem mem_src1 = extra1->data_device; + + cl_ulong nb00_cont = nb00; + cl_ulong nb01_cont = nb01; + cl_ulong nb02_cont = nb02; + cl_ulong nb03_cont = nb03; + + cl_ulong nb10_cont = nb10; + cl_ulong nb11_cont = nb11; + cl_ulong nb12_cont = nb12; + cl_ulong nb13_cont = nb13; + + cl_ulong offset0_cont = offset0; + cl_ulong offset1_cont = offset1; + + if (!ggml_is_contiguous(src0)) { + backend_ctx->prealloc_src0.allocate(backend_ctx->context, ggml_nbytes(src0)); + ggml_cl_copy_to_contiguous(backend, src0, backend_ctx->prealloc_src0.buffer, + nb00_cont, nb01_cont, nb02_cont, nb03_cont); + mem_src0 = backend_ctx->prealloc_src0.buffer; + offset0_cont = 0; + } + + if (!ggml_is_contiguous(src1)) { + backend_ctx->prealloc_src1.allocate(backend_ctx->context, ggml_nbytes(src1)); + ggml_cl_copy_to_contiguous(backend, src1, backend_ctx->prealloc_src1.buffer, + nb10_cont, nb11_cont, nb12_cont, nb13_cont); + mem_src1 = backend_ctx->prealloc_src1.buffer; + offset1_cont = 0; + } + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_src0)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0_cont)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_src1)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1_cont)); CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); @@ -8123,6 +8272,10 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co if (ne11 < 32) { break; } + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) { + break; + } + kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm; nth0 = 128; // calculated as (BM*BN)/(TM*TN) From e34d6d03b25d9e8d07f3bd0190b27d0d01a7e416 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 23 Jan 2026 02:58:07 +0800 Subject: [PATCH 3/6] convert_hf_to_gguf.py: refactor modify_tensors to call super (#18866) --- convert_hf_to_gguf.py | 952 +++++++++++++++++------------------------- 1 file changed, 381 insertions(+), 571 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3fdfc5bf565..8cc4963fb22 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -514,8 +514,7 @@ def set_gguf_parameters(self): raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - + del bid # unused return [(self.map_tensor_name(name), data_torch)] def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: @@ -1981,13 +1980,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - tensors: list[tuple[str, Tensor]] = [] - if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): # Map bloom-style qkv_linear to gpt-style qkv_linear # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa @@ -2014,9 +2009,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ) logger.info("re-format 
attention.linear_qkv.bias") - tensors.append((self.map_tensor_name(name), data_torch)) - - return tensors + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("BloomForCausalLM", "BloomModel") @@ -2036,15 +2029,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) name = re.sub(r'transformer\.', '', name) - tensors: list[tuple[str, Tensor]] = [] - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): # Map bloom-style qkv_linear to gpt-style qkv_linear # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa @@ -2071,9 +2060,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ) logger.info("re-format attention.linear_qkv.bias") - tensors.append((self.map_tensor_name(name), data_torch)) - - return tensors + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("MPTForCausalLM") @@ -2108,15 +2095,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_max_alibi_bias(0.0) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if "scales" in name: new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) new_name = new_name.replace("scales", "act.scales") else: new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) @ModelBase.register("OrionForCausalLM") @@ -2170,11 +2155,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) - tensors: list[tuple[str, Tensor]] = [] - if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": logger.info(f"Unpacking and permuting layer {bid}") - tensors = [ + yield from [ (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), self._reverse_hf_part(data_torch, 2)), ] else: - tensors = [(self.map_tensor_name(name), data_torch)] - - return tensors + yield from super().modify_tensors(data_torch, name, bid) def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -2266,8 +2247,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) if name.endswith(("q_proj.weight")): data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) if name.endswith("k_proj.weight"): data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) - return
[(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -2314,8 +2293,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # QKV tensor transform # The original query_key_value tensor contains n_head_kv "kv groups", # each consisting of n_head/n_head_kv query weights followed by one key @@ -2337,7 +2314,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("GPTBigCodeForCausalLM") @@ -2399,22 +2376,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_head_kv = 1 head_dim = self.hparams["n_embd"] // n_head - tensors: list[tuple[str, Tensor]] = [] - if bid is not None: if name == f"transformer.h.{bid}.attn.kv.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) - elif name == f"transformer.h.{bid}.attn.q.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) - elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) - tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) - - if len(tensors) == 0: - tensors.append((self.map_tensor_name(name), data_torch)) + yield from super().modify_tensors(data_torch[:n_head_kv * head_dim], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(data_torch[n_head_kv * head_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) + return + if name == f"transformer.h.{bid}.attn.q.weight": + yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + return + if name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": + yield from super().modify_tensors(data_torch[:ff_dim], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) + yield from super().modify_tensors(data_torch[ff_dim:], self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) + return - return tensors + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") @@ -2461,7 +2436,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if len(self._q_norms[bid]) >= n_head: - return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") + yield from self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") + return else: - return [] + return if name.find("k_layernorm.norms") != -1: assert bid is not None @@ -2474,9 +2449,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if len(self._k_norms[bid]) >= n_kv_head: - return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") + yield from self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from
super().modify_tensors(data_torch, name, bid) def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): datas: list[Tensor] = [] @@ -2488,9 +2463,8 @@ def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_ data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" - new_name = self.map_tensor_name(merged_name) - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, merged_name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -2616,7 +2590,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ) if is_multimodal_tensor: - return [] # skip vision tensors + return # skip vision tensors elif self.hf_arch == "LlamaModel": name = "model." + name elif name.startswith("model.text_model"): @@ -2642,8 +2616,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -2657,14 +2629,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): @@ -2755,8 +2725,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["gate_proj", "up_proj", "down_proj"]: datas: list[Tensor] = [] @@ -2768,17 +2736,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) + yield from super().modify_tensors(data_torch, merged_name, bid) - return tensors + return else: - return [] + return if name.endswith(".expert_bias"): name = name.replace(".expert_bias", ".expert_bias.bias") - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register( @@ -2835,7 +2802,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_spatial_merge_size(self.global_config["spatial_merge_size"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused n_head = ( self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"]) ) @@ -2856,7 +2822,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format: data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return 
[(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) + return embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight" if self.img_break_tok_id > 0 and embed_key in name: @@ -2864,9 +2831,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # for pixtral model, we need to extract the [IMG_BREAK] token embedding img_break_embd = data_torch[self.img_break_tok_id] name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] - return [(self.map_tensor_name(name), img_break_embd)] + yield from super().modify_tensors(img_break_embd, name, bid) - return [] # skip other tensors + return # skip other tensors @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") @@ -2897,13 +2864,12 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name if is_vision_tensor: - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) - return [] # skip other tensors + return # skip other tensors @ModelBase.register( @@ -2942,18 +2908,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name_gate = name.replace("gate_up_proj", "gate_proj.weight") dim_half = data_torch.shape[-1] // 2 gate_proj_weight, up_proj_weight = data_torch.transpose(-1, -2).split(dim_half, dim=-2) - return [ - (self.map_tensor_name(name_gate), gate_proj_weight), - (self.map_tensor_name(name_up), up_proj_weight) - ] + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + return if name.endswith("down_proj"): name += ".weight" data_torch = data_torch.transpose(-1, -2) if "multi_modal_projector" in name or "vision_model" in name: - return [] - return super().modify_tensors(data_torch, name, bid) + return + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Llama4ForConditionalGeneration") @@ -2967,16 +2932,15 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_use_gelu(True) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused if "multi_modal_projector" in name or "vision_model" in name: # process vision tensors if "positional_embedding_vlm" in name and ".weight" not in name: name += ".weight" if "multi_modal_projector.linear_1" in name: # despite the name with number postfix, this is a single fully connected layer - return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch)] - return [(self.map_tensor_name(name), data_torch)] - return [] + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_MMPROJ_FC] + '.weight', data_torch) + else: + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register( @@ -3008,9 +2972,9 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name = name.replace("language_model.", "") if "multi_modal_projector" in name or "vision_tower" in name: - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("DeciLMForCausalLM") @@ -3149,7 
+3113,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = DeciModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): @@ -3223,7 +3187,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # transform weight into 1/0/-1 (in fp32) data_torch = self.weight_quant(data_torch) - yield (new_name, data_torch) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM") @@ -3279,11 +3243,11 @@ def set_gguf_parameters(self): _cur_expert = "" def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - tensors: list[tuple[str, Tensor]] = [] + deferred: list[tuple[Tensor, str, int | None]] = [] is_expert = ".moe." in name or ".block_sparse_moe.experts." in name if not is_expert: - tensors.append((self.map_tensor_name(name), data_torch)) + deferred.append((data_torch, name, bid)) # process the experts separately if is_expert or self._cur_expert: @@ -3298,11 +3262,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name in self._experts[bid]: self._cur_expert = name self._experts[bid][name].append(data_torch) - return [] + return elif is_expert: self._cur_expert = name self._experts[bid][name] = [data_torch] - return [] + return else: self._cur_expert = "" @@ -3324,11 +3288,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight" - new_name = self.map_tensor_name(merged_name) - - yield (new_name, data_torch) + yield from super().modify_tensors(data_torch, merged_name, bid) - yield from tensors + for t in deferred: + yield from super().modify_tensors(*t) @ModelBase.register("DbrxForCausalLM") @@ -3360,8 +3323,6 @@ def set_gguf_parameters(self): logger.info(f"gguf: file type = {self.ftype}") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_expert = self.hparams["ffn_config"]["moe_num_experts"] n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] n_embd = self.hparams["d_model"] @@ -3392,7 +3353,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid # unused @@ -3437,8 +3398,6 @@ def set_vocab(self): self._set_vocab_sentencepiece() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -3448,7 +3407,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith(("k_proj.weight")): data_torch 
= LlamaModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("MiniCPM3ForCausalLM") @@ -3558,7 +3517,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter or name.startswith("vision_model") or name.startswith("audio_tower") \ or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"): # skip vision and audio tensors - return [] + return yield from super().modify_tensors(data_torch, name, bid) @@ -3755,23 +3714,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter total_k_dim = num_kv_heads * head_dim total_v_dim = num_kv_heads * head_dim q_proj_weight, k_proj_weight, v_proj_weight = data_torch.split([total_q_dim, total_k_dim, total_v_dim], dim=0) - return [ - (self.map_tensor_name(name_q), q_proj_weight), - (self.map_tensor_name(name_k), k_proj_weight), - (self.map_tensor_name(name_v), v_proj_weight) - ] + yield from super().modify_tensors(q_proj_weight, name_q, bid) + yield from super().modify_tensors(k_proj_weight, name_k, bid) + yield from super().modify_tensors(v_proj_weight, name_v, bid) # split the up_gate_proj into gate and up # up_gate_proj shape: [2 * intermediate_size, hidden_size] - if "up_gate_proj" in name: + elif "up_gate_proj" in name: name_up = name.replace("up_gate_proj.weight", "up_proj.weight") name_gate = name.replace("up_gate_proj.weight", "gate_proj.weight") dim_half = data_torch.shape[0] // 2 gate_proj_weight, up_proj_weight = data_torch.split(dim_half, dim=0) - return [ - (self.map_tensor_name(name_gate), gate_proj_weight), - (self.map_tensor_name(name_up), up_proj_weight) - ] - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Ernie4_5_MoeForCausalLM") @@ -3804,20 +3760,20 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2) match = re.match(r"model.mtp_block.(\d+)", name) if match: - return [] + return # skip all other MTP tensors for now match = re.match(r"model.mtp_emb_norm.(\d+)", name) if match: - return [] + return match = re.match(r"model.mtp_hidden_norm.(\d+)", name) if match: - return [] + return match = re.match(r"model.mtp_linear_proj.(\d+)", name) if match: - return [] + return # process the experts separately if name.find("mlp.experts") != -1: @@ -3830,8 +3786,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["gate_proj", "up_proj", "down_proj"]: datas: list[Tensor] = [] @@ -3843,13 +3797,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - - return tensors - else: - return [] - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, merged_name, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) def 
prepare_tensors(self): super().prepare_tensors() @@ -3880,14 +3830,13 @@ def set_vocab(self): self._set_vocab_gpt2() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused if name.startswith("thinker."): name = name.replace("thinker.", "") if name.startswith("visual") or name.startswith("audio") or \ name.startswith("talker") or name.startswith("token2wav"): # skip multimodal tensors - return [] - return [(self.map_tensor_name(name), data_torch)] + return + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") @@ -3936,7 +3885,6 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused if name.startswith("visual."): # process visual tensors # split QKV tensors if needed @@ -3950,23 +3898,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter wq = data_torch[:c] wk = data_torch[c: c * 2] wv = data_torch[c * 2:] - return [ - (self.map_tensor_name(name.replace("qkv", "q")), wq), - (self.map_tensor_name(name.replace("qkv", "k")), wk), - (self.map_tensor_name(name.replace("qkv", "v")), wv), - ] + yield from super().modify_tensors(wq, name.replace("qkv", "q"), bid) + yield from super().modify_tensors(wk, name.replace("qkv", "k"), bid) + yield from super().modify_tensors(wv, name.replace("qkv", "v"), bid) elif 'patch_embed.proj.weight' in name: # split Conv3D into Conv2Ds c1, c2, kt, kh, kw = data_torch.shape del c1, c2, kh, kw # unused assert kt == 2, "Current implmentation only support temporal_patch_size of 2" - return [ - (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]), - (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]), - ] + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight" , data_torch[:, :, 0, ...]) + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) else: - return [(self.map_tensor_name(name), data_torch)] - return [] # skip other tensors + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen2_5OmniModel") @@ -4022,10 +3965,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if "audio_bos_eos_token" in name: # this tensor is left unused in transformers code # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809 - return [] - return [(self.map_tensor_name(name), data_torch)] - - return super().modify_tensors(data_torch, name, bid) + return + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("InternVisionModel") @@ -4072,7 +4013,6 @@ def _mapping_interns1_name(self, name): return name def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector'] # deal with intern-s1 special case name = self._mapping_interns1_name(name) @@ -4094,13 +4034,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter wq = data_torch[:c] wk = data_torch[c: c * 2] wv = 
data_torch[c * 2:] - return [ - (self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq), - (self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk), - (self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv), - ] - return [(self.map_tensor_name(name), data_torch)] - return [] # skip other tensors + yield from super().modify_tensors(wq, name.replace("attn.qkv", "self_attn.q_proj"), bid) + yield from super().modify_tensors(wk, name.replace("attn.qkv", "self_attn.k_proj"), bid) + yield from super().modify_tensors(wv, name.replace("attn.qkv", "self_attn.v_proj"), bid) + else: + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("WavTokenizerDec") @@ -4108,18 +4046,16 @@ class WavTokenizerDecModel(TextModel): model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if \ name.endswith("codebook.cluster_size") or \ name.endswith("codebook.embed_avg") or \ name.endswith("codebook.inited"): logger.debug(f"Skipping {name!r}") - return [] + return logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def set_vocab(self): self._set_vocab_none() @@ -4174,7 +4110,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # Need PyTorch: (128, 2048, 768) [reversed of GGML] # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768) permuted = data_torch.permute(0, 2, 1).contiguous() - return [(self.map_tensor_name(mapped), permuted)] + yield from super().modify_tensors(permuted, mapped, bid) + return if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"): if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0: @@ -4192,14 +4129,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter mapped_up = f"{base}.up_proj.weight" perm_gate = gate.permute(0, 2, 1).contiguous() perm_up = up.permute(0, 2, 1).contiguous() - return [ - (self.map_tensor_name(mapped_gate), perm_gate), - (self.map_tensor_name(mapped_up), perm_up), - ] + yield from super().modify_tensors(perm_gate, mapped_gate, bid) + yield from super().modify_tensors(perm_up, mapped_up, bid) + return if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"): # skip visual tensors - return [] + return if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -4210,8 +4146,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -4225,14 +4159,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def 
prepare_tensors(self): super().prepare_tensors() @@ -4312,7 +4244,7 @@ def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if "model.vision_" in name: # skip multimodal tensors - return [] + return if self.is_rerank: is_tied_head = self.is_tied_embeddings and "embed_tokens" in name @@ -4322,13 +4254,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight", self._get_cls_out_tensor(data_torch), ) + yield cls_out_head if is_tied_head: - embed = (self.map_tensor_name(name), data_torch) - return [cls_out_head, embed] - if is_real_head: - return [cls_out_head] + yield from super().modify_tensors(data_torch, name, bid) + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen3MoeForCausalLM") @@ -4366,7 +4297,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("mtp"): - return [] # ignore MTP layers for now + return # ignore MTP layers for now if name.endswith(".A_log"): data_torch = -torch.exp(data_torch) elif name.endswith(".dt_bias"): @@ -4468,7 +4399,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter assert self.hparams_vision is not None # Skip text model tensors - they go in the text model file if name.startswith("model.language_model.") or name.startswith("lm_head."): - return [] + return if name.startswith("model.visual."): name = name.replace("model.visual.", "visual.", 1) @@ -4493,7 +4424,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter raise ValueError(f"Unexpected deepstack tensor: {name}") new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}") - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) + return if name.startswith("visual.merger."): suffix = name.split(".", 2)[2] @@ -4513,7 +4445,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}") else: raise ValueError(f"Unexpected merger tensor: {name}") - return [(new_name, data_torch)] + yield (new_name, data_torch) + return if name == "visual.patch_embed.proj.weight": # split Conv3D into Conv2Ds along temporal dimension @@ -4521,20 +4454,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter del c1, c2 if kt != 2: raise ValueError("Current implementation only supports temporal_patch_size of 2") - return [ - (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]), - (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]), - ] + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]) + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]) + return if name == "visual.patch_embed.proj.bias": # Include the bias - it's used by the C++ code - return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch)] + yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch) + return if name.startswith("visual."): - return 
[(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) + return # Fall back to parent class for other tensors - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration") @@ -4557,8 +4491,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("model.visual."): name = name.replace("model.visual.", "visual.") if name.startswith("visual.merger."): - return [(self.map_tensor_name(name), data_torch)] + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen3VLForConditionalGeneration") @@ -4576,9 +4511,9 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Skip vision tensors - they go in the mmproj file if name.startswith("model.visual."): - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen3VLMoeForConditionalGeneration") @@ -4594,9 +4529,9 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Skip vision tensors - they go in the mmproj file if name.startswith("model.visual."): - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("GPT2LMHeadModel") @@ -4613,22 +4548,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - tensors: list[tuple[str, Tensor]] = [] - # we don't need these if name.endswith((".attn.bias", ".attn.masked_bias")): - return tensors + return if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): data_torch = data_torch.transpose(1, 0) new_name = self.map_tensor_name(name) - tensors.append((new_name, data_torch)) - - return tensors + yield from super().modify_tensors(data_torch, new_name, bid) @ModelBase.register("PhiForCausalLM") @@ -4852,8 +4782,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -4867,14 +4795,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -4920,8 +4846,6 @@ def shuffle_attn_output_weight(self, data_torch): return data_torch def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None)
-> Iterable[tuple[str, Tensor]]: - del bid # unused - new_name = self.map_tensor_name(name) # shuffle for broadcasting of gqa in ggml_mul_mat @@ -4930,7 +4854,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif new_name.endswith("attn_output.weight"): data_torch = self.shuffle_attn_output_weight(data_torch) - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM") @@ -4991,8 +4915,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if name.endswith(".A_log"): data_torch = -torch.exp(data_torch) elif name.endswith(".dt_bias"): @@ -5021,9 +4943,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.endswith(".norm.weight"): data_torch += 1.0 - new_name = self.map_tensor_name(name) - - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Plamo3ForCausalLM", "PLaMo3ForCausalLM") @@ -5072,7 +4992,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif name.endswith(".norm.weight"): data_torch = data_torch + 1.0 - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("CodeShellForCausalLM") @@ -5234,7 +5154,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name.replace("language_model.", "") # InternVL if name.startswith("mlp") or name.startswith("vision_model"): # skip visual tensors - return [] + return if bid is not None and f"model.layers.{bid}.attention.wqkv" in name: qkv = data_torch @@ -5247,13 +5167,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter k = LlamaModel.permute(k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads) v = v.reshape((-1, v.shape[-1])) - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), - ] + yield from super().modify_tensors(q, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + yield from super().modify_tensors(k, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(v, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) else: - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("InternLM3ForCausalLM") @@ -5305,12 +5223,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name.replace("language_model.", "") # InternVL if name.startswith("mlp") or name.startswith("vision_model"): # skip visual tensors - return [] + return if name.endswith(("q_proj.weight", "q_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel", "BertForSequenceClassification") @@ -5365,8 +5283,6 @@ def phantom(tok, toktype): special_vocab.add_to_gguf(self.gguf_writer) def modify_tensors(self, 
data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if name.startswith("bert."): name = name[5:] @@ -5378,13 +5294,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # we are only using BERT for embeddings so we don't need the pooling layer if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): - return [] # we don't need these + return # we don't need these if name.startswith("cls.predictions"): - return [] + return if name.startswith("cls.seq_relationship"): - return [] + return if self.cls_out_labels: # For BertForSequenceClassification (direct projection layer) @@ -5394,7 +5310,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "classifier.bias": name = "classifier.out_proj.bias" - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def _xlmroberta_tokenizer_init(self) -> None: # we need the pad_token_id to know how to chop down position_embd matrix @@ -5549,9 +5465,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # These layers act as MLM head, so we don't need them if name.startswith("vocab_"): - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("RobertaModel", "RobertaForSequenceClassification") @@ -5594,7 +5510,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self._position_offset is not None: data_torch = data_torch[self._position_offset:,:] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("NomicBertModel") @@ -5647,7 +5563,7 @@ def set_vocab(self) -> None: def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]: # If the tensor is an experts bias tensor, skip it by returning an empty list. if "mlp.experts.bias" in name: - return [] # Explicitly return an empty list. + return # Explicitly return. 
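The bare `return` above is the heart of this refactor: every modify_tensors is now a generator, so skipping a tensor is an early return that yields nothing, and the default name mapping is delegated to the base class with `yield from super().modify_tensors(...)`. A minimal sketch of the pattern, with illustrative class names rather than the converter's real hierarchy:

```python
from typing import Any, Iterable

class Base:
    def map_tensor_name(self, name: str) -> str:
        return "mapped." + name  # stand-in for the real HF -> GGUF name mapping

    def modify_tensors(self, data: Any, name: str, bid: int | None) -> Iterable[tuple[str, Any]]:
        yield (self.map_tensor_name(name), data)

class Child(Base):
    def modify_tensors(self, data: Any, name: str, bid: int | None) -> Iterable[tuple[str, Any]]:
        if name.endswith(".attn.bias"):  # skip: a bare return ends the generator
            return
        yield from super().modify_tensors(data, name, bid)

print(list(Child().modify_tensors(0, "h.0.attn.bias", 0)))    # [] -- skipped
print(list(Child().modify_tensors(0, "h.0.attn.weight", 0)))  # [('mapped.h.0.attn.weight', 0)]
```

One consequence visible throughout this patch: inside such a generator, `return some_list` would silently discard the tensors, so skips must be bare `return`s and all emission goes through `yield` or `yield from`.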
if "mlp.experts.mlp.w1" in name: data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"]) @@ -5658,7 +5574,7 @@ def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) - data_torch = data_torch.transpose(1, 2) name += ".weight" - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -5698,12 +5614,12 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch, name, bid): if name.startswith("decoder."): - return [] + return if name.startswith("model."): name = name[6:] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") @@ -5760,7 +5676,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"): if name.startswith("pooler.dense"): - return [] + return num_loras = data_torch.size(0) assert num_loras == len(self._lora_names) @@ -5776,9 +5692,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b") lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32) - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -5837,19 +5753,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return [] + return # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma2ForCausalLM") @@ -5883,19 +5797,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # lm_head is not used in llama.cpp, while autoawq will include this tensor in model # To prevent errors, skip loading lm_head.weight. 
if name == "lm_head.weight": logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") - return [] + return # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") @@ -5930,14 +5842,12 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if "language_model." in name: name = name.replace("language_model.", "") elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ or name.startswith("multimodal_projector.") or name.startswith("vision_model."): - return [] # skip vision tensors + return # skip vision tensors # remove OOV (out-of-vocabulary) rows in token_embd if "embed_tokens.weight" in name: @@ -5953,7 +5863,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("norm.weight"): data_torch = data_torch + self.norm_shift - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma3TextModel") @@ -6059,10 +5969,8 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if "vision_model.head." 
in name: - return [] # skip redundant tensors for tinygemma3 + return # skip redundant tensors for tinygemma3 if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ or name.startswith("multimodal_projector.") or name.startswith("vision_model."): @@ -6076,9 +5984,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.info(f"Correcting norm value for '{name}'") data_torch = data_torch + 1 - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) - return [] # skip other tensors + return # skip other tensors class ConformerAudioModel(MmprojModel): @@ -6103,7 +6011,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._batch_norm_tensors[bid][name] = data_torch if len(self._batch_norm_tensors[bid]) < 5: - return [] + return weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"] bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"] @@ -6113,10 +6021,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter a = weight / torch.sqrt(running_var + eps) b = bias - running_mean * a - return [ - (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a), - (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b), - ] + yield from super().modify_tensors(a, f"conformer.layers.{bid}.conv.batch_norm.weight", bid) + yield from super().modify_tensors(b, f"conformer.layers.{bid}.conv.batch_norm.bias", bid) + return # reshape conv weights if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"): @@ -6128,7 +6035,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter assert data_torch.shape[2] == 1 data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1]) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Gemma3nForConditionalGeneration") @@ -6227,14 +6134,14 @@ def custom_map(self, name: str) -> str: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if (ConformerAudioModel.is_audio_tensor(name)): name = name.replace("model.audio_tower.conformer.", "conformer.layers.") - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) # Gemma3n uses # - model.embed_vision.* for projection layers # - model.vision_tower.* for vision encoder # Skip non-vision tensors if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")): - return [] + return if name.startswith("model.vision_tower.timm_model.blocks."): # Double-indexed block tensors through custom logic @@ -6246,7 +6153,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"): data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1] - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) @ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration") @@ -6324,7 +6231,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # TODO: implement self.prediction_coefs.weight.clamp_(...) if "language_model." 
not in name: - return [] # skip non-language model tensors + return # skip non-language model tensors # Pad token embeddings for vision/audio special tokens (262144-262399) if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name: @@ -6346,7 +6253,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # Continue with normal processing name = name.replace("language_model.", "") - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) + return if "altup_unembed_projections" in name: data_torch = data_torch.to(device="cpu") @@ -6362,9 +6270,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter raise ValueError(f"Unknown name: {name}") out = self._stack_matrices(self._altup_unembd) if out is not None: - return [(self.map_tensor_name("model.altup_unembed_projections.weight"), out)] + yield from super().modify_tensors(out, "model.altup_unembed_projections.weight", bid) + return else: - return [] + return if "altup_projections" in name: data_torch = data_torch.to(device="cpu") @@ -6378,11 +6287,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter raise ValueError(f"Unknown name: {name}") out = self._stack_matrices(self._altup_proj) if out is not None: - return [(self.map_tensor_name("model.altup_projections.weight"), out)] + yield from super().modify_tensors(out, "model.altup_projections.weight", bid) + return else: - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Starcoder2ForCausalLM") @@ -6765,11 +6675,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self._tok_embd is not None and new_name == output_name: if torch.equal(self._tok_embd, data_torch): logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") - return [] + return elif new_name == tok_embd_name: self._tok_embd = data_torch - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) @ModelBase.register("Mamba2ForCausalLM") @@ -7025,8 +6935,6 @@ def set_gguf_parameters(self): # Same as super class, but permuting q_proj, k_proj # Copied from: LlamaModel def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -7035,7 +6943,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("k_proj.weight"): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("SeedOssForCausalLM") @@ -7091,8 +6999,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -7106,14 +7012,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from 
super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) # Copied from: Qwen2MoeModel def prepare_tensors(self): @@ -7336,8 +7240,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -7351,14 +7253,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -7425,8 +7325,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -7440,14 +7338,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -7583,9 +7479,9 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # skip vision tensors and remove "language_model." 
for Kimi-VL if "vision_tower" in name or "multi_modal_projector" in name: - return [] + return if name.startswith("siglip2.") or name.startswith("merger."): - return [] + return if name.startswith("language_model."): name = name.replace("language_model.", "") @@ -7593,7 +7489,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self.hparams.get("tie_word_embeddings", False): if name == "lm_head.weight" or name == "model.lm_head.weight": logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)") - return [] + return # rename e_score_correction_bias tensors if name.endswith("e_score_correction_bias"): @@ -7603,7 +7499,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter block_count = self.hparams["num_hidden_layers"] match = re.match(r"model.layers.(\d+)", name) if match and int(match.group(1)) >= block_count: - return [] + return # process the experts separately if name.find("mlp.experts") != -1: @@ -7616,8 +7512,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -7631,12 +7525,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed if name.endswith("kv_b_proj.weight"): @@ -7653,12 +7545,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) k_b = k_b.transpose(1, 2) - return [ - (self.map_tensor_name(name_kb), k_b), - (self.map_tensor_name(name_vb), v_b) - ] + yield from super().modify_tensors(k_b, name_kb, bid) + yield from super().modify_tensors(v_b, name_vb, bid) + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -7700,9 +7591,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): # not enough expert weights to merge if len(expert_cache) < n_experts * len(expert_weights): - return [] + return - tensors: list[tuple[str, Tensor]] = [] for w_name in expert_weights: datas: list[Tensor] = [] @@ -7714,12 +7604,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) + yield from super().modify_tensors(data_torch, new_name, bid) del self._experts_cache[bid] - return tensors + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("MiMoV2FlashForCausalLM") @@ -7761,7 +7651,7 @@ def modify_tensors(self, data_torch, name, bid): # TODO: mimo v2 does not indicate the number of next-token-prediction layers, therefore we cannot do the same way as GLM4_MOE if "model.mtp." 
in name: - return [] + return # process the experts separately if name.find("mlp.experts") != -1: @@ -7774,8 +7664,6 @@ def modify_tensors(self, data_torch, name, bid): self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["gate_proj", "up_proj", "down_proj"]: datas: list[Tensor] = [] @@ -7787,13 +7675,12 @@ def modify_tensors(self, data_torch, name, bid): data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] - return [(self.map_tensor_name(name), data_torch)] + return + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -7837,8 +7724,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "lm_head.weight": if self.hparams.get("tie_word_embeddings", False): logger.info("Skipping tied output layer 'lm_head.weight'") - return [] - return [(self.map_tensor_name(name), data_torch)] + return + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Dots1ForCausalLM") @@ -7860,8 +7747,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): if name.endswith("e_score_correction_bias"): name = name.replace("e_score_correction_bias", "e_score_correction.bias") if "shared_experts" in name: - return [(self.map_tensor_name(name), data_torch)] - return super().modify_tensors(data_torch, name, bid) + yield from ModelBase.modify_tensors(self, data_torch, name, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("PLMForCausalLM") @@ -7880,9 +7768,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_value_length(hparams["v_head_dim"]) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - return [(self.map_tensor_name(name), data_torch)] - def prepare_tensors(self): super().prepare_tensors() @@ -8013,8 +7898,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored # in the safetensors files. 
We use the first tensor from these three as the token embeddings for both encoder @@ -8025,9 +7908,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self.shared_token_embeddings_found = True else: logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("T5EncoderModel") @@ -8149,8 +8032,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder @@ -8161,9 +8042,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self.shared_token_embeddings_found = True else: logger.debug(f"Skipping shared tensor {name!r} in safetensors so that convert can end normally.") - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("JAISLMHeadModel") @@ -8211,13 +8092,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - tensors: list[tuple[str, Tensor]] = [] - # we don't need these if name.endswith((".attn.bias")): - return tensors + return if name.endswith(("relative_pe.slopes")): # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) @@ -8228,7 +8105,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter first_val = float(data_torch[0].item()) self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) - return tensors + return if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): data_torch = data_torch.transpose(1, 0) @@ -8236,13 +8113,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = self.map_tensor_name(name) if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): - tensors.append((new_name, data_torch * self.embeddings_scale)) + yield from super().modify_tensors(data_torch * self.embeddings_scale, new_name, bid) elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - tensors.append((new_name, data_torch * self.width_scale)) + yield from super().modify_tensors(data_torch * self.width_scale, new_name, bid) else: - tensors.append((new_name, data_torch)) - - return tensors + yield from super().modify_tensors(data_torch, new_name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -8308,7 +8183,7 @@ def normal_to_neox(weights: Tensor, n_head: int, n_head_kv: int, head_dim: int, def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): # ignore visual part of Glm4v - return [] + return elif name.startswith("model.language_model."): name = name.replace("language_model.", "") # for Glm4v if self.use_mrope: @@ -8321,7 +8196,7 @@ def modify_tensors(self, data_torch: 
Tensor, name: str, bid: int | None) -> Iter data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_kv_head, head_dim, self.partial_rotary_factor) - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") @@ -8396,13 +8271,14 @@ def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): # ignore visual part - return [] + return elif name.startswith("model.language_model."): name = name.replace("language_model.", "") # for multimodal variants # Handle main token embedding (but not layer-specific NextN embeddings) if name == "model.embed_tokens.weight" and ".layers." not in name: - return [(self.map_tensor_name("token_embd.weight"), data_torch)] + yield from super().modify_tensors(data_torch, "token_embd.weight", bid) + return # Handle routed experts if name.find("mlp.experts") != -1: @@ -8415,8 +8291,6 @@ def modify_tensors( self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -8430,18 +8304,15 @@ def modify_tensors( merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return if name.endswith("e_score_correction_bias"): name = name.replace("e_score_correction_bias", "e_score_correction.bias") - new_name = self.map_tensor_name(name) - - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -8624,13 +8495,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_freq_base(rope_freq) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if name.endswith(".rotary_pos_emb.inv_freq") or name.startswith("model.vision."): - return [] + return name = name.removeprefix("transformer.") - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("NemotronForCausalLM") @@ -8671,7 +8540,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith("norm.weight"): data_torch = data_torch + 1 - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("ExaoneForCausalLM") @@ -8827,11 +8696,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = remapper[_n.stem] + _n.suffix # set shared weights for all NextN/MTP layers - tensors = [] for bid in range(self.hparams['num_hidden_layers'], self.block_count): - new_name = new_name.format(bid=bid) - tensors.append((self.map_tensor_name(new_name), data_torch)) - return tensors + yield from super().modify_tensors(data_torch, new_name.format(bid=bid), bid) + return if name.endswith("e_score_correction_bias"): name = name.replace("e_score_correction_bias", "e_score_correction.bias") @@ -8846,8 +8713,6 @@ 
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -8863,12 +8728,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, new_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -8938,10 +8803,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ffn_dim = self.hparams["intermediate_size"] assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size" gate, up = data_torch.split(ffn_dim, dim=-2) - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up), - ] + yield from super().modify_tensors(gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), bid) + yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), bid) has_experts = bool(self.hparams.get('num_local_experts')) @@ -8950,21 +8813,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size" gate, up = data_torch.split(ffn_dim, dim=-2) if has_experts: - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate), - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up), - ] - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate), - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up), - ] + yield from super().modify_tensors(gate,self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), bid) + yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), bid) + return + yield from super().modify_tensors(gate, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), bid) + yield from super().modify_tensors(up, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), bid) + return if not has_experts and name.endswith("shared_mlp.output_linear.weight"): - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch) - ] + yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), bid) + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("GraniteMoeHybridForCausalLM", "BambaForCausalLM") @@ -9058,7 +8918,7 @@ def modify_tensors( return Mamba2Model.modify_tensors(self, data_torch, name, bid) elif bid in self._attn_layers: return GraniteMoeModel.modify_tensors(self, data_torch, name, bid) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def set_gguf_parameters(self): """This method merges params from both parents and some that are @@ -9190,34 +9050,34 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self.is_moe and bid is not None: 
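# Standalone sketch of the expert-merge pattern used in this and the other
# MoE hunks (shapes here are hypothetical): per-expert 2D projection weights
# are buffered until all experts have arrived, then stacked into one 3D
# tensor so the GGUF stores a single tensor per projection rather than one
# tensor per expert.
import torch

n_experts, n_ff, n_embd = 4, 8, 16
buffered = {
    f"model.layers.0.mlp.experts.{e}.down_proj.weight": torch.randn(n_embd, n_ff)
    for e in range(n_experts)
}
datas = [buffered[f"model.layers.0.mlp.experts.{e}.down_proj.weight"] for e in range(n_experts)]
merged = torch.stack(datas, dim=0)  # -> [n_experts, n_embd, n_ff]
assert merged.shape == (n_experts, n_embd, n_ff)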
if name.endswith("mixer.gate.e_score_correction_bias"): new_name = name.replace("e_score_correction_bias", "e_score_correction.bias") - mapped_name = self.map_tensor_name(new_name) - return [(mapped_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) + return if name.endswith("mixer.dt_bias"): new_name = name.replace("dt_bias", "dt.bias") - mapped_name = self.map_tensor_name(new_name) - return [(mapped_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) + return if name.endswith("mixer.conv1d.weight"): squeezed_data = data_torch.squeeze() - mapped_name = self.map_tensor_name(name) - return [(mapped_name, squeezed_data)] + yield from super().modify_tensors(squeezed_data, name, bid) + return if name.endswith("mixer.A_log"): transformed_data = -torch.exp(data_torch) reshaped_data = transformed_data.squeeze().reshape(-1, 1) - mapped_name = self.map_tensor_name(name) - return [(mapped_name, reshaped_data)] + yield from super().modify_tensors(reshaped_data, name, bid) + return if name.endswith("mixer.D"): reshaped_data = data_torch.squeeze().reshape(-1, 1) - mapped_name = self.map_tensor_name(name) - return [(mapped_name, reshaped_data)] + yield from super().modify_tensors(reshaped_data, name, bid) + return if name.endswith("mixer.norm.weight"): reshaped_data = data_torch.reshape(self.n_group, -1) - mapped_name = self.map_tensor_name(name) - return [(mapped_name, reshaped_data)] + yield from super().modify_tensors(reshaped_data, name, bid) + return if name.find("mixer.experts") != -1: n_experts = self.hparams["n_routed_experts"] @@ -9230,7 +9090,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if len(self._experts[bid]) >= n_experts * 2: # merge the experts into a single tensor - tensors: list[tuple[str, Tensor]] = [] for w_name in ["down_proj", "up_proj"]: datas: list[Tensor] = [] @@ -9241,14 +9100,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -9307,21 +9165,19 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) if name.endswith("attention.dense.weight"): - return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)] + yield from super().modify_tensors(data_torch, self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), bid) + return elif name.endswith("query_key_value.weight"): q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2) - return [ - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)), - (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v) - ] + yield from super().modify_tensors(BailingMoeModel.permute(q, n_head, n_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), bid) + yield from super().modify_tensors(BailingMoeModel.permute(k, n_head, 
n_kv_head), self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), bid) + yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid) + return elif name.find("mlp.experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None - tensors: list[tuple[str, Tensor]] = [] - if self._experts is None: self._experts = [{} for _ in range(self.block_count)] @@ -9343,9 +9199,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) + yield from super().modify_tensors(data_torch, new_name, bid) - return tensors + return new_name = self.map_tensor_name(name) @@ -9353,7 +9209,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = data_torch.float() data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7 - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, new_name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -9404,8 +9260,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter n_experts = self.hparams["num_experts"] assert bid is not None - tensors: list[tuple[str, Tensor]] = [] - if self._experts is None: self._experts = [{} for _ in range(self.block_count)] @@ -9425,16 +9279,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return if name.endswith(".expert_bias"): name = name.replace(".expert_bias", ".expert_bias.bias") - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -9470,7 +9321,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.endswith(".expert_bias"): # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303 - return [] + return # process the experts separately if name.find("chunk_experts") != -1: @@ -9483,8 +9334,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._chunk_experts[bid][name] = data_torch if len(self._chunk_experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -9498,12 +9347,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return elif name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -9514,8 +9361,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", 
"up_proj"]: datas: list[Tensor] = [] @@ -9529,14 +9374,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -9570,7 +9413,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # ignore image tokenizer for now # TODO: remove this once image support is implemented for Chameleon if name.startswith("model.vqmodel"): - return [] + return n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -9585,7 +9428,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.endswith(("k_norm.weight", "k_norm.bias")): data_torch = ChameleonModel._reverse_hf_permute(data_torch, n_kv_head, hidden_dim) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 @staticmethod @@ -9630,11 +9473,9 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if name.startswith("model.") or name.startswith("lm_head."): # skip language model tensors - return [] + return if name.startswith("audio_encoder.whisper."): name = name.replace("audio_encoder.whisper.","audio_tower.") @@ -9642,7 +9483,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name = name.replace("audio_encoder.", "audio_encoder.adapting.") if name.startswith("audio_encoder.audio_bos_eos_token."): - return [(self.map_tensor_name("model.vision.boi"), data_torch[0]), (self.map_tensor_name("model.vision.eoi"), data_torch[1])] + yield from super().modify_tensors(data_torch[0], "model.vision.boi", bid) + yield from super().modify_tensors(data_torch[1], "model.vision.eoi", bid) + return if name.startswith("audio_encoder.adapting."): name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.") @@ -9653,13 +9496,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if ".2." in name: name = name.replace(".2.", ".linear_2.") if ".proj." 
in name: - return [] + return if "conv1.bias" in name or "conv2.bias" in name: # transpose conv1 and conv2 bias data_torch = data_torch.unsqueeze(-1) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("Qwen2AudioForConditionalGeneration") @@ -9686,11 +9529,9 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if name.startswith("language_model."): # skip language model tensors - return [] + return # prevent clash naming with vision tensors if name.startswith("multi_modal_projector"): @@ -9700,7 +9541,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # transpose conv1 and conv2 bias data_torch = data_torch.unsqueeze(-1) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("UltravoxModel") @@ -9944,7 +9785,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "lm_head.weight": if self.hparams.get("tie_word_embeddings", False): logger.info("Skipping tied output layer 'lm_head.weight'") - return [] + return if name.find("mlp.experts") != -1: n_experts = self.hparams["num_experts"] @@ -9957,7 +9798,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if len(self._experts[bid]) >= n_experts * 3: # merge the experts into a single 3d tensor - tensors: list[tuple[str, Tensor]] = [] for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -9968,14 +9808,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -10020,8 +9859,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -10035,14 +9872,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) # Copied from: Qwen2MoeModel def prepare_tensors(self): @@ -10141,9 +9976,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "lm_head.weight": if self.hparams.get("tie_word_embeddings", False): logger.info("Skipping tied output layer 'lm_head.weight'") - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from 
super().modify_tensors(data_torch, name, bid) @ModelBase.register("SmolLM3ForCausalLM") @@ -10223,8 +10058,6 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: return [] def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if "sinks" in name: name += ".weight" @@ -10238,7 +10071,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = data_torch.transpose(-1, -2) else: # otherwise, it should already be repacked to ggml MXFP4 format - return [] + return # split the gate_up into gate and up if "gate_up_proj" in name: @@ -10246,25 +10079,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter name_up = name.replace("gate_up_proj_bias", "up_proj.bias") name_gate = name.replace("gate_up_proj_bias", "gate_proj.bias") gate_proj_bias, up_proj_bias = data_torch[..., ::2], data_torch[..., 1::2] - return [ - (self.map_tensor_name(name_gate), gate_proj_bias), - (self.map_tensor_name(name_up), up_proj_bias) - ] + yield from super().modify_tensors(gate_proj_bias, name_gate, bid) + yield from super().modify_tensors(up_proj_bias, name_up, bid) elif "_blocks" not in name and "_scales" not in name: logger.warning(f"{name} is not in MXFP4, performance may be degraded") name_up = name.replace("gate_up_proj", "up_proj.weight") name_gate = name.replace("gate_up_proj", "gate_proj.weight") data_torch = data_torch.transpose(-1, -2) gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :] - return [ - (self.map_tensor_name(name_gate), gate_proj_weight), - (self.map_tensor_name(name_up), up_proj_weight) - ] - else: - # otherwise, it should already be repacked to ggml MXFP4 format - return [] - - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(gate_proj_weight, name_gate, bid) + yield from super().modify_tensors(up_proj_weight, name_up, bid) + else: + yield from super().modify_tensors(data_torch, name, bid) def set_vocab(self): self._set_vocab_gpt2() @@ -10312,7 +10138,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name): # skip multimodal tensors - return [] + return name = name.replace("language_model.", "") # vision name = name.replace("lfm.", "model.") # audio @@ -10321,7 +10147,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if 'conv.conv' in name: data_torch = data_torch.squeeze(1) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def _is_vision_tensor(self, name: str) -> bool: return "vision_tower" in name or "multi_modal_projector" in name @@ -10336,7 +10162,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if not name.startswith(self.dense_tensor_name): name = "model." 
+ name - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: # dense tensor is stored in a separate safetensors file @@ -10391,9 +10217,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # not enough expert weights to merge if len(expert_cache) < n_experts * len(expert_weights): - return [] + return - tensors: list[tuple[str, Tensor]] = [] for w_name in expert_weights: datas: list[Tensor] = [] @@ -10404,13 +10229,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = torch.stack(datas, dim=0) merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) + + yield from super().modify_tensors(data_torch, merged_name, bid) del self._experts_cache[bid] - return tensors + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -10436,7 +10261,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name if is_vision_tensor: @@ -10447,9 +10271,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if "patch_embedding.weight" in name: data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2) - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) + return - return [] # skip other tensors + return # skip other tensors @ModelBase.register("Lfm2AudioForConditionalGeneration") @@ -10474,17 +10299,17 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch, name, bid): # skip language model tensors if name.startswith("lfm."): - return [] + return # for training only if any(p in name for p in ["audio_loss_weight"]): - return [] + return # for audio output if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]): - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("SmallThinkerForCausalLM") @@ -10529,8 +10354,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter self._experts[bid][name] = data_torch if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - # merge the experts into a single 3d tensor for w_name in ["down", "gate", "up"]: datas: list[Tensor] = [] @@ -10544,14 +10367,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors + yield from super().modify_tensors(data_torch, merged_name, bid) + return else: - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) def prepare_tensors(self): super().prepare_tensors() @@ -10584,12 +10405,12 @@ def set_gguf_parameters(self): def modify_tensors(self, 
data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # these layers act as MLM head, so we don't need them if name.startswith("decoder."): - return [] + return if name.startswith("model."): name = name[6:] - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("ApertusForCausalLM") @@ -10609,24 +10430,24 @@ def modify_tensors(self, data_torch, name, bid): self._alpha_n[bid] = data_torch.to("cpu").float().item() if (len(self._alpha_n) == n_layers): self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)]) - return [] + return if name.endswith(".act_fn.alpha_p"): self._alpha_p[bid] = data_torch.to("cpu").float().item() if (len(self._alpha_p) == n_layers): self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)]) - return [] + return if name.endswith(".act_fn.beta"): self._beta[bid] = data_torch.to("cpu").float().item() if (len(self._beta) == n_layers): self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)]) - return [] + return if name.endswith(".act_fn.eps"): self._eps[bid] = data_torch.to("cpu").float().item() if (len(self._eps) == n_layers): self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)]) - return [] + return - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) class MistralModel(LlamaModel): @@ -10789,7 +10610,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name: - return [] + return # rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic if name.endswith(".qscale_act"): @@ -10805,7 +10626,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name = name.replace(".w3.", ".up_proj.") name = "model." 
+ name - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) class PixtralModel(LlavaVisionModel): @@ -10850,7 +10671,7 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name = name.replace("model.vision_encoder.", "vision_tower.") name = name.replace("model.vision_projection.", "multi_modal_projector.") - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("KimiVLForConditionalGeneration") @@ -10870,24 +10691,20 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5)) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name if is_vision_tensor: if "pos_emb.weight" in name: data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2]) - elif "wqkv" in name: + + if "wqkv" in name: split_dim = 0 if "weight" in name else -1 wq, wk, wv = data_torch.chunk(3, dim=split_dim) - return [ - (self.map_tensor_name(name.replace("wqkv", "wq")), wq), - (self.map_tensor_name(name.replace("wqkv", "wk")), wk), - (self.map_tensor_name(name.replace("wqkv", "wv")), wv) - ] - - return [(self.map_tensor_name(name), data_torch)] - - return [] # skip other tensors + yield from super().modify_tensors(wq, name.replace("wqkv", "wq"), bid) + yield from super().modify_tensors(wk, name.replace("wqkv", "wk"), bid) + yield from super().modify_tensors(wv, name.replace("wqkv", "wv"), bid) + else: + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("CogVLMForCausalLM") @@ -10899,12 +10716,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - if not name.startswith("model.vision."): - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("CogVLMForCausalLM") @@ -10912,13 +10727,11 @@ class CogVLMModel(LlamaModel): model_arch = gguf.MODEL_ARCH.COGVLM def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # block vision tensors if name.startswith("model.vision."): - return [] + return - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("JanusForConditionalGeneration") @@ -10936,14 +10749,14 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter 'model.generation_head.', ) if name.startswith(skip_prefixes): - return [] + return if name.startswith('model.language_model.'): name = name.replace('model.language_model.', 'model.') elif name.startswith('language_model.'): name = name.replace('language_model.', '') - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) @ModelBase.register("JanusForConditionalGeneration") @@ -10996,11 +10809,9 @@ def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[s return [(tensor_name, data_torch)] def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, 
Tensor]]: - del bid # unused - # Skip language model tensors as they will be handled by `JanusProModel` if name.startswith(('model.language_model.', 'language_model.')): - return [] + return # Skip generation-related components skip_generation_prefixes = ( @@ -11014,17 +10825,19 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter 'generation_head.', ) if name.startswith(skip_generation_prefixes): - return [] + return # Handle aligner tensors if name.startswith(('model.aligner.', 'aligner.')): - return list(self._map_aligner_tensor(data_torch, name)) + yield from self._map_aligner_tensor(data_torch, name) + return # Handle vision tensors if name.startswith(('model.vision_model.', 'vision_model.')): - return [(self.map_tensor_name(name), data_torch)] + yield from super().modify_tensors(data_torch, name, bid) + return - return [] + return @ModelBase.register("YoutuVLForConditionalGeneration") @@ -11063,21 +10876,18 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_wa_layer_indexes(layers=fullatt_block_indexes) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - # Skip language model tensors skip_prefixes = ('lm_head.', 'model.layers.', 'model.embed_tokens.', 'model.norm.') if name.startswith(skip_prefixes): - return [] + return # Try to map the tensor using TensorNameMap (handles vision encoder and projector) try: - new_name = self.map_tensor_name(name) - return [(new_name, data_torch)] + yield from super().modify_tensors(data_torch, name, bid) except ValueError: # If mapping fails, log warning and skip logger.warning(f"Cannot map tensor: {name}") - return [] + return @ModelBase.register("SolarOpenForCausalLM") From e2baf02162382a14c9f4fc15d7681a715256453c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 22 Jan 2026 20:39:25 +0100 Subject: [PATCH 4/6] CUDA: fix alignment check for FA (#19023) --- ggml/src/ggml-cuda/fattn.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 80c3bfbc271..87f07a2f938 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -46,7 +46,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con // are put into the template specialization without GQA optimizations. 
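// Assumed rationale for the guard added below (not stated in the patch
// itself): quantized K/V tensors store block-packed data rather than rows
// of half/float elements, so the per-dimension stride-alignment check in
// this loop is not meaningful for them; ggml_is_quantized(t->type)
// therefore short-circuits with `continue`, the same way a null tensor
// is skipped.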
bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0; for (const ggml_tensor * t : {Q, K, V, mask}) { - if (t == nullptr) { + if (t == nullptr || ggml_is_quantized(t->type)) { continue; } for (size_t i = 1; i < GGML_MAX_DIMS; ++i) { @@ -236,7 +236,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const // The kernel versions without this optimization are also used for ALiBi, if there is no mask, or if the KV cache is not padded, bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0; for (const ggml_tensor * t : {Q, K, V, mask}) { - if (t == nullptr) { + if (t == nullptr || ggml_is_quantized(t->type)) { continue; } for (size_t i = 1; i < GGML_MAX_DIMS; ++i) { From a5eaa1d6a3732bc0f460b02b61c95680bba5a012 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 Jan 2026 22:09:01 +0200 Subject: [PATCH 5/6] mla : make the V tensor a view of K (#18986) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * mla : pass V as a view of K to the FA op * cuda : adjust mla logic to new layout * kv-cache : fix rope shift * tests : remove comment * cuda : fix reusable_cutoff Co-authored-by: Johannes Gäßler --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/fattn-common.cuh | 7 +++++-- ggml/src/ggml-cuda/fattn-mma-f16.cuh | 7 ++++--- src/llama-graph.cpp | 5 +++++ src/llama-kv-cache.cpp | 8 ++++++-- src/models/deepseek2.cpp | 9 ++++----- src/models/minicpm3.cpp | 1 + src/models/plm.cpp | 1 + tests/test-backend-ops.cpp | 14 +++++++++++++- 8 files changed, 39 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 8468ba8488d..a781fb91f5b 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -778,12 +778,15 @@ void launch_fattn( ) { constexpr int ncols = ncols1 * ncols2; - const bool is_mla = DV == 512; // TODO better parameterization - const ggml_tensor * Q = dst->src[0]; const ggml_tensor * K = dst->src[1]; const ggml_tensor * V = dst->src[2]; + // TODO: make this more generic by removing the notion of "MLA". + // for example "is V a view of K?" so we can skip loading it. + // V strides should be driven by V itself and avoid assumption of the data layout + const bool is_mla = V->op == GGML_OP_VIEW && V->src[0] == K; + GGML_ASSERT(V || is_mla); const ggml_tensor * mask = dst->src[3]; diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index 8cca89c2bfa..203569e3459 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -794,7 +794,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter( // For MLA K and V have the same data. // Therefore, iterate over V in reverse and re-use the data if possible. static_assert(!mla || nstages <= 1, "combination of MLA and multi-stage loading not implemented"); - constexpr int reusable_cutoff = mla ? (DKQ - 1) - (DKQ - 1) % (2*nbatch_K2) - (DKQ - DV) : DV; + // constexpr int reusable_cutoff = mla ? 
(DV - 1) - (DV - 1) % (2*nbatch_K2) : DV; + constexpr int reusable_cutoff = DV; // TODO implement properly #if defined(AMD_WMMA_AVAILABLE) && !defined(LDMATRIX_TRANS_AVAILABLE) T_A_VKQ A_identity; make_identity_mat(A_identity); @@ -1552,7 +1553,7 @@ static __global__ void flash_attn_ext_f16( (const half *) (mask + nb33*(sequence % ne33)); float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2); - const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); + const half2 * V_h2 = mla ? K_h2 : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr; const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f; @@ -1596,7 +1597,7 @@ static __global__ void flash_attn_ext_f16( (const half *) (mask + nb33*(sequence % ne33)); float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2); - const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); + const half2 * V_h2 = mla ? K_h2 : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr; const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 57485c534ee..5ebd0cf8aac 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1565,6 +1565,11 @@ ggml_tensor * llm_graph_context::build_attn_mha( v = ggml_transpose(ctx0, v); } + // TODO: update llama_kv_cache to not store V cache in the MLA case and automatically return a view of K + if (v_mla) { + v = ggml_view_4d(ctx0, k, v->ne[0], v->ne[1], v->ne[2], v->ne[3], k->nb[1], k->nb[2], k->nb[3], 0); + } + // this can happen when KV cache is not used (e.g. an embedding model with non-causal attn) if (k->type == GGML_TYPE_F32) { k = ggml_cast(ctx0, k, GGML_TYPE_F16); diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index fd9f97d52e8..a7327c49874 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1594,6 +1594,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_v = hparams.n_embd_head_v; + const auto & n_rot = hparams.n_rot; + + const auto n_embd_nope = hparams.n_lora_kv > 0 ? 
n_embd_head_k - n_rot : 0; + auto inp = std::make_unique(this); inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream); @@ -1614,10 +1618,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co ggml_tensor * k = ggml_view_3d(ctx, layer.k, - n_embd_head_k, n_head_kv, get_size()*n_stream, + n_rot, n_head_kv, get_size()*n_stream, ggml_row_size(layer.k->type, n_embd_head_k), ggml_row_size(layer.k->type, n_embd_k_gqa), - 0); + ggml_row_size(layer.k->type, n_embd_nope)); ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index ca63a62ad1b..c404c1946d0 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -124,14 +124,14 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} // note: rope must go first for in-place context shifting in build_rope_shift() - ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0); + ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0); cb(Qcur, "Qcur", il); kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); cb(kv_cmpr, "kv_cmpr_reshape", il); // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} - ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0); + ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0); cb(Kcur, "Kcur", il); // {kv_lora_rank, 1, n_tokens} @@ -169,11 +169,10 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr Vcur = ggml_cont(ctx0, Vcur); cb(Vcur, "Vcur_cont", il); - // note: rope must go first for in-place context shifting in build_rope_shift() - ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0); + ggml_tensor * Qcur = ggml_concat(ctx0, q_nope, q_pe, 0); cb(Qcur, "Qcur", il); - ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0); + ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); cb(Kcur, "Kcur", il); if (inp_attn_scale) { diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp index f374a9fd030..297cc34ba58 100644 --- a/src/models/minicpm3.cpp +++ b/src/models/minicpm3.cpp @@ -9,6 +9,7 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; ggml_tensor * cur; diff --git a/src/models/plm.cpp b/src/models/plm.cpp index 481cbba6907..612a487c564 100644 --- a/src/models/plm.cpp +++ b/src/models/plm.cpp @@ -5,6 +5,7 @@ llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; ggml_tensor * cur; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 9f61c6483de..146d05f53bc 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6122,7 +6122,19 @@ struct test_flash_attn_ext : public test_case { ggml_tensor * k = create_permuted(type_KV, hsk_padded, kv, nh, nr23[1], true); // the K tensor is usually a view of the K cache ggml_set_name(k, "k"); - ggml_tensor * v = create_permuted(type_KV, hsv_padded, kv, nh, nr23[1], true); // the V tensor is usually a view of the V cache + 
ggml_tensor * v = nullptr; + if (hsk_padded == 576 && hsv_padded == 512) { + // TODO: this branch should become a separate test case parameter instead of hardcoding this for these head shapes + + // in this branch, the V cache is sub-view of the K cache. this is used by some MLA-based models + // for more info: + // - https://github.com/ggml-org/llama.cpp/pull/13435 + // - https://github.com/ggml-org/llama.cpp/pull/18953#issuecomment-3774948392 + // - https://github.com/ggml-org/llama.cpp/pull/18986 + v = ggml_view_4d(ctx, k, hsv_padded, kv, nh, nr23[1], k->nb[1], k->nb[2], k->nb[3], 0); + } else { + v = create_permuted(type_KV, hsv_padded, kv, nh, nr23[1], true); // the V tensor is usually a view of the V cache + } ggml_set_name(v, "v"); ggml_tensor * m = nullptr; From 51fa458a92d6a3f305f8fd76fc8f702e3e87ddb5 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 22 Jan 2026 21:30:06 +0100 Subject: [PATCH 6/6] server : support preserving reasoning_content in assistant message (#18994) * support reasoning_content input * report template caps to webui * add docs * rm commented code --- common/chat-parser.cpp | 4 +- common/chat.cpp | 173 +++++++++++++------------------- common/chat.h | 25 +++-- common/jinja/caps.cpp | 53 +++++++++- common/jinja/caps.h | 6 +- tests/test-chat.cpp | 16 +-- tools/server/README.md | 8 ++ tools/server/server-context.cpp | 2 + tools/server/server-context.h | 1 + tools/server/server-task.cpp | 6 +- 10 files changed, 164 insertions(+), 130 deletions(-) diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp index c2d1e30f35e..29819e48d3b 100644 --- a/common/chat-parser.cpp +++ b/common/chat-parser.cpp @@ -1630,7 +1630,7 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co } auto msg = builder.result(); if (!is_partial) { - LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str()); + LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str()); } return msg; } @@ -1663,7 +1663,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std mapper.from_ast(ctx.ast, result); } if (!is_partial) { - LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str()); + LOG_DBG("Parsed message: %s\n", common_chat_msgs_to_json_oaicompat({msg}).at(0).dump().c_str()); } return msg; } diff --git a/common/chat.cpp b/common/chat.cpp index b29544dac01..6853f4ad47a 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -7,9 +7,6 @@ #include "log.h" #include "regex-partial.h" -// #include -// #include - #include "jinja/parser.h" #include "jinja/value.h" #include "jinja/runtime.h" @@ -56,39 +53,73 @@ static bool has_content_or_tool_calls(const common_chat_msg & msg) { return !msg.content.empty() || !msg.tool_calls.empty(); } -template <> -json common_chat_msg::to_json_oaicompat() const -{ - json message { - {"role", "assistant"}, +json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const { + if (!content.empty() && !content_parts.empty()) { + throw std::runtime_error("Cannot specify both content and content_parts"); + } + json jmsg { + {"role", role}, }; + if (!content.empty()) { + jmsg["content"] = content; + } else if (!content_parts.empty()) { + if (concat_typed_text) { + std::string text; + for (const auto & part : content_parts) { + if (part.type != "text") { + LOG_WRN("Ignoring content part type: %s\n", part.type.c_str()); + continue; + } + if (!text.empty()) { + text += '\n'; + } + text += part.text; + } + 
jmsg["content"] = text; + } else { + auto & parts = jmsg["content"] = json::array(); + for (const auto & part : content_parts) { + parts.push_back({ + {"type", part.type}, + {"text", part.text}, + }); + } + } + } else { + jmsg["content"] = ""; + } if (!reasoning_content.empty()) { - message["reasoning_content"] = reasoning_content; + jmsg["reasoning_content"] = reasoning_content; } - if (content.empty() && !tool_calls.empty()) { - message["content"] = json(); - } else { - message["content"] = content; + if (!tool_name.empty()) { + jmsg["name"] = tool_name; + } + if (!tool_call_id.empty()) { + jmsg["tool_call_id"] = tool_call_id; } if (!tool_calls.empty()) { - auto arr = json::array(); - for (const auto & tc : tool_calls) { - arr.push_back({ + jmsg["tool_calls"] = json::array(); + auto & jtool_calls = jmsg["tool_calls"]; + for (const auto & tool_call : tool_calls) { + json tc { {"type", "function"}, {"function", { - {"name", tc.name}, - {"arguments", tc.arguments}, + {"name", tool_call.name}, + {"arguments", tool_call.arguments}, }}, - {"id", tc.id}, - // // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). - // // We only generate a random id for the ones that don't generate one by themselves - // // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) - // {"id", tc.id.empty() ? gen_tool_call_id() : tc.id}, - }); + }; + if (!tool_call.id.empty()) { + tc["id"] = tool_call.id; + } + // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo). + // We only generate a random id for the ones that don't generate one by themselves + // (they also won't get to see it as their template likely doesn't use it, so it's all for the client) + // {"id", tc.id.empty() ? 
gen_tool_call_id() : tc.id}, + jtool_calls.push_back(tc); } - message["tool_calls"] = arr; } - return message; + + return jmsg; } std::vector common_chat_msg_diff::compute_diffs(const common_chat_msg & msg_prv, const common_chat_msg & msg_new) { @@ -256,7 +287,6 @@ bool common_chat_templates_support_enable_thinking(const common_chat_templates * return rendered_no_thinking.prompt != rendered_with_thinking.prompt; } -template <> std::vector common_chat_msgs_parse_oaicompat(const json & messages) { std::vector msgs; @@ -350,80 +380,15 @@ std::vector common_chat_msgs_parse_oaicompat(const json & messa return msgs; } -template <> json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text) { json messages = json::array(); for (const auto & msg : msgs) { - if (!msg.content.empty() && !msg.content_parts.empty()) { - throw std::runtime_error("Cannot specify both content and content_parts"); - } - json jmsg { - {"role", msg.role}, - }; - if (!msg.content.empty()) { - jmsg["content"] = msg.content; - } else if (!msg.content_parts.empty()) { - if (concat_typed_text) { - std::string text; - for (const auto & part : msg.content_parts) { - if (part.type != "text") { - LOG_WRN("Ignoring content part type: %s\n", part.type.c_str()); - continue; - } - if (!text.empty()) { - text += '\n'; - } - text += part.text; - } - jmsg["content"] = text; - } else { - auto & parts = jmsg["content"] = json::array(); - for (const auto & part : msg.content_parts) { - parts.push_back({ - {"type", part.type}, - {"text", part.text}, - }); - } - } - } else { - jmsg["content"] = ""; - } - if (!msg.reasoning_content.empty()) { - jmsg["reasoning_content"] = msg.reasoning_content; - } - if (!msg.tool_name.empty()) { - jmsg["name"] = msg.tool_name; - } - if (!msg.tool_call_id.empty()) { - jmsg["tool_call_id"] = msg.tool_call_id; - } - if (!msg.tool_calls.empty()) { - auto & tool_calls = jmsg["tool_calls"] = json::array(); - for (const auto & tool_call : msg.tool_calls) { - json tc { - {"type", "function"}, - {"function", { - {"name", tool_call.name}, - {"arguments", tool_call.arguments}, - }}, - }; - if (!tool_call.id.empty()) { - tc["id"] = tool_call.id; - } - tool_calls.push_back(tc); - } - } + json jmsg = msg.to_json_oaicompat(concat_typed_text); messages.push_back(jmsg); } return messages; } -template <> -std::vector common_chat_msgs_parse_oaicompat(const std::string & messages) { - return common_chat_msgs_parse_oaicompat(json::parse(messages)); -} - -template <> std::vector common_chat_tools_parse_oaicompat(const json & tools) { std::vector result; @@ -459,12 +424,6 @@ std::vector common_chat_tools_parse_oaicompat(const json & too return result; } -template <> -std::vector common_chat_tools_parse_oaicompat(const std::string & tools) { - return common_chat_tools_parse_oaicompat(json::parse(tools)); -} - -template <> json common_chat_tools_to_json_oaicompat(const std::vector & tools) { if (tools.empty()) { return json(); @@ -484,7 +443,7 @@ json common_chat_tools_to_json_oaicompat(const std::vector & t return result; } -template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) { +json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) { json delta = json::object(); if (!diff.reasoning_content_delta.empty()) { delta["reasoning_content"] = diff.reasoning_content_delta; @@ -2867,13 +2826,13 @@ static common_chat_params common_chat_templates_apply_jinja( const struct common_chat_templates_inputs & inputs) { templates_params params; - params.tools = 
common_chat_tools_to_json_oaicompat(inputs.tools); + params.tools = common_chat_tools_to_json_oaicompat(inputs.tools); const auto & tmpl = params.tools.is_array() && tmpls->template_tool_use ? *tmpls->template_tool_use : *tmpls->template_default; const auto & src = tmpl.source(); const auto & caps = tmpl.original_caps(); - params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); + params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content); params.add_generation_prompt = inputs.add_generation_prompt; params.tool_choice = inputs.tool_choice; params.reasoning_format = inputs.reasoning_format; @@ -2943,6 +2902,10 @@ static common_chat_params common_chat_templates_apply_jinja( src.find("") != std::string::npos && params.json_schema.is_null()) { workaround::func_args_not_string(params.messages); + if (!params.extra_context.contains("clear_thinking")) { + // by default, do not clear reasoning_content (added since GLM-4.7) + params.extra_context["clear_thinking"] = false; + } return common_chat_params_init_glm_4_5(tmpl, params); } @@ -3174,3 +3137,9 @@ common_chat_params common_chat_templates_apply( ? common_chat_templates_apply_jinja(tmpls, inputs) : common_chat_templates_apply_legacy(tmpls, inputs); } + +std::map common_chat_templates_get_caps(const common_chat_templates * chat_templates) { + GGML_ASSERT(chat_templates != nullptr); + GGML_ASSERT(chat_templates->template_default != nullptr); + return chat_templates->template_default->caps.to_map(); +} diff --git a/common/chat.h b/common/chat.h index ac19348ece7..24aa4aab5cd 100644 --- a/common/chat.h +++ b/common/chat.h @@ -10,6 +10,8 @@ #include #include +#include + struct common_chat_templates; struct common_chat_tool_call { @@ -26,6 +28,11 @@ struct common_chat_msg_content_part { std::string type; std::string text; + // TODO @ngxson : no known chat templates support reasoning_content in content parts yet + // this can be useful for models with interleaved thinking (like Kimi-K2) + // if you see any templates explicitly support this, please ping me + // std::string reasoning_content; + bool operator==(const common_chat_msg_content_part & other) const { return type == other.type && text == other.text; } @@ -40,7 +47,7 @@ struct common_chat_msg { std::string tool_name; std::string tool_call_id; - template T to_json_oaicompat() const; + nlohmann::ordered_json to_json_oaicompat(bool concat_typed_text = false) const; bool empty() const { return content.empty() && content_parts.empty() && tool_calls.empty() && reasoning_content.empty() && tool_name.empty() && tool_call_id.empty(); @@ -232,13 +239,13 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates); // Parses a JSON array of messages in OpenAI's chat completion API format. 
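+// A minimal usage sketch (assuming a valid OAI-compatible JSON array); callers
+// that previously passed a JSON string now parse it first:
+//
+//   auto msgs  = common_chat_msgs_parse_oaicompat(
+//       nlohmann::ordered_json::parse(R"([{"role": "user", "content": "hi"}])"));
+//   auto jmsgs = common_chat_msgs_to_json_oaicompat(msgs);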
-// T can be std::string containing JSON or nlohmann::ordered_json -template std::vector common_chat_msgs_parse_oaicompat(const T & messages); -template T common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text = false); +std::vector common_chat_msgs_parse_oaicompat(const nlohmann::ordered_json & messages); +nlohmann::ordered_json common_chat_msgs_to_json_oaicompat(const std::vector & msgs, bool concat_typed_text = false); + +std::vector common_chat_tools_parse_oaicompat(const nlohmann::ordered_json & tools); +nlohmann::ordered_json common_chat_tools_to_json_oaicompat(const std::vector & tools); -// Parses a JSON array of tools in OpenAI's chat completion tool call API format. -// T can be std::string containing JSON or nlohmann::ordered_json -template std::vector common_chat_tools_parse_oaicompat(const T & tools); -template T common_chat_tools_to_json_oaicompat(const std::vector & tools); +nlohmann::ordered_json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff); -template T common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff); +// get template caps, useful for reporting to server /props endpoint +std::map common_chat_templates_get_caps(const common_chat_templates * chat_templates); diff --git a/common/jinja/caps.cpp b/common/jinja/caps.cpp index 61deccd1f5e..f27490f1fb7 100644 --- a/common/jinja/caps.cpp +++ b/common/jinja/caps.cpp @@ -61,14 +61,23 @@ static void caps_print_stats(value & v, const std::string & path) { ops.c_str()); } +std::map caps::to_map() const { + return { + {"requires_typed_content", requires_typed_content}, + {"supports_tools", supports_tools}, + {"supports_tool_calls", supports_tool_calls}, + {"supports_parallel_tool_calls", supports_parallel_tool_calls}, + {"supports_system_role", supports_system_role}, + {"supports_preserve_reasoning", supports_preserve_reasoning}, + }; +} + std::string caps::to_string() const { std::ostringstream ss; ss << "Caps(\n"; - ss << " requires_typed_content=" << requires_typed_content << "\n"; - ss << " supports_tools=" << supports_tools << "\n"; - ss << " supports_tool_calls=" << supports_tool_calls << "\n"; - ss << " supports_parallel_tool_calls=" << supports_parallel_tool_calls << "\n"; - ss << " supports_system_role=" << supports_system_role << "\n"; + for (const auto & [key, value] : to_map()) { + ss << " " << key << "=" << (value ? 
"true" : "false") << "\n"; + } ss << ")"; return ss.str(); } @@ -229,6 +238,40 @@ caps caps_get(jinja::program & prog) { } ); + // case: preserve reasoning content in chat history + caps_try_execute( + prog, + [&]() { + // messages + return json::array({ + { + {"role", "user"}, + {"content", "User message"} + }, + { + {"role", "assistant"}, + {"content", "Assistant message"}, + {"reasoning_content", "Reasoning content"} + }, + { + {"role", "user"}, + {"content", "User message"} + }, + }); + }, + [&]() { + // tools + return json::array(); + }, + [&](bool, value & messages, value &) { + auto & content = messages->at(1)->at("reasoning_content"); + caps_print_stats(content, "messages[1].reasoning_content"); + if (content->stats.used) { + result.supports_preserve_reasoning = true; + } + } + ); + JJ_DEBUG("%s\n", result.to_string().c_str()); return result; diff --git a/common/jinja/caps.h b/common/jinja/caps.h index deb2df180f0..77df117baa1 100644 --- a/common/jinja/caps.h +++ b/common/jinja/caps.h @@ -3,6 +3,7 @@ #include "runtime.h" #include +#include namespace jinja { @@ -11,14 +12,17 @@ struct caps { bool supports_tool_calls = true; bool supports_system_role = true; bool supports_parallel_tool_calls = true; + bool supports_preserve_reasoning = false; // support assistant message with reasoning_content bool requires_typed_content = false; // default: use string content + // for reporting on server + std::map to_map() const; + // for debugging std::string to_string() const; }; caps caps_get(jinja::program & prog); -void debug_print_caps(const caps & c); } // namespace jinja diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 6820acf6792..de7075e6e5d 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -462,9 +462,9 @@ static void test_parser_with_streaming(const common_chat_msg & expected, const s for (size_t i = 1; i <= raw_message.size(); ++i) { auto curr_msg = parse_msg(std::string(utf8_truncate_safe_view(std::string_view(raw_message).substr(0, i)))); if (curr_msg == simple_assist_msg("")) continue; - LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat({curr_msg}).dump().c_str()); + LOG_INF("Streaming msg: %s\n", common_chat_msgs_to_json_oaicompat({curr_msg}).dump().c_str()); for (auto diff: common_chat_msg_diff::compute_diffs(last_msg, curr_msg)) { - LOG_INF("Streaming diff: %s\n", common_chat_msg_diff_to_json_oaicompat(diff).dump().c_str()); + LOG_INF("Streaming diff: %s\n", common_chat_msg_diff_to_json_oaicompat(diff).dump().c_str()); if (!diff.reasoning_content_delta.empty()) { merged.reasoning_content += diff.reasoning_content_delta; } @@ -480,7 +480,7 @@ static void test_parser_with_streaming(const common_chat_msg & expected, const s merged.tool_calls.back().arguments += diff.tool_call_delta.arguments; } } - LOG_INF("Streaming merged: %s\n", common_chat_msgs_to_json_oaicompat({merged}).dump().c_str()); + LOG_INF("Streaming merged: %s\n", common_chat_msgs_to_json_oaicompat({merged}).dump().c_str()); } assert_msg_equals(curr_msg, merged, true); last_msg = curr_msg; @@ -622,7 +622,7 @@ static void test_msgs_oaicompat_json_conversion() { message_assist_call_code_interpreter, }; for (const auto & msg : msgs) { - auto oai_json = common_chat_msgs_to_json_oaicompat({msg}); + auto oai_json = common_chat_msgs_to_json_oaicompat({msg}); auto msgs2 = common_chat_msgs_parse_oaicompat(oai_json); assert_equals((size_t) 1, msgs2.size()); auto msg2 = msgs2[0]; @@ -646,7 +646,7 @@ static void test_msgs_oaicompat_json_conversion() { " }\n" "]" ), - 
common_chat_msgs_to_json_oaicompat({message_user_parts}).dump(2)); + common_chat_msgs_to_json_oaicompat({message_user_parts}).dump(2)); assert_equals( std::string( @@ -666,7 +666,7 @@ static void test_msgs_oaicompat_json_conversion() { " }\n" "]" ), - common_chat_msgs_to_json_oaicompat({message_assist_call_python}).dump(2)); + common_chat_msgs_to_json_oaicompat({message_assist_call_python}).dump(2)); auto res = common_chat_msgs_parse_oaicompat(json::parse("[{\"role\": \"assistant\", \"tool_calls\": []}]")); assert_equals(1, res.size()); @@ -693,7 +693,7 @@ static void test_tools_oaicompat_json_conversion() { }; for (const auto & tool : tools) { - auto oai_json = common_chat_tools_to_json_oaicompat({tool}); + auto oai_json = common_chat_tools_to_json_oaicompat({tool}); auto tools2 = common_chat_tools_parse_oaicompat(oai_json); assert_equals((size_t) 1, tools2.size()); auto tool2 = tools2[0]; @@ -726,7 +726,7 @@ static void test_tools_oaicompat_json_conversion() { " }\n" "]" ), - common_chat_tools_to_json_oaicompat({special_function_tool}).dump(2)); + common_chat_tools_to_json_oaicompat({special_function_tool}).dump(2)); { auto tools_no_params = common_chat_tools_parse_oaicompat(json::parse( diff --git a/tools/server/README.md b/tools/server/README.md index 191391a8824..f113f9cb758 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -781,6 +781,7 @@ By default, it is read-only. To make POST request to change global properties, y "total_slots": 1, "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", "chat_template": "...", + "chat_template_caps": {}, "modalities": { "vision": false }, @@ -793,6 +794,7 @@ By default, it is read-only. To make POST request to change global properties, y - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) - `model_path` - the path to model file (same with `-m` argument) - `chat_template` - the model's original Jinja2 prompt template +- `chat_template_caps` - capabilities of the chat template (see `common/jinja/caps.h` for more info) - `modalities` - the list of supported modalities - `is_sleeping` - sleeping status, see [Sleeping on idle](#sleeping-on-idle) @@ -1267,6 +1269,12 @@ This provides information on the performance of the server. It also allows calcu The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n` +*Reasoning support* + +The server supports parsing and returning reasoning via the `reasoning_content` field, similar to Deepseek API. + +Reasoning input (preserve reasoning in history) is also supported by some specific templates. For more details, please refer to [PR#18994](https://github.com/ggml-org/llama.cpp/pull/18994). 
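+
+For example, a client can send the reasoning it received back with the assistant message, so that templates which support it can re-insert the thinking into the prompt. An illustrative request body (the message shape mirrors the capability probe used for `supports_preserve_reasoning`):
+
+```json
+{
+  "messages": [
+    {"role": "user", "content": "What is the capital of France?"},
+    {
+      "role": "assistant",
+      "content": "Paris.",
+      "reasoning_content": "A simple geography question; answer directly."
+    },
+    {"role": "user", "content": "And of Italy?"}
+  ]
+}
+```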
+ ### POST `/v1/responses`: OpenAI-compatible Responses API *Options:* diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 9a828e1eff3..73cb4c75b3e 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2903,6 +2903,7 @@ server_context_meta server_context::get_meta() const { /* pooling_type */ llama_pooling_type(impl->ctx), /* chat_params */ impl->chat_params, + /* chat_template_caps */ common_chat_templates_get_caps(impl->chat_params.tmpls.get()), /* bos_token_str */ bos_token_str, /* eos_token_str */ eos_token_str, @@ -3410,6 +3411,7 @@ void server_routes::init_routes() { { "webui", params.webui }, { "webui_settings", meta->json_webui_settings }, { "chat_template", tmpl_default }, + { "chat_template_caps", meta->chat_template_caps }, { "bos_token", meta->bos_token_str }, { "eos_token", meta->eos_token_str }, { "build_info", meta->build_info }, diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 3e5e870fc56..c0b5d373ff9 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -22,6 +22,7 @@ struct server_context_meta { // chat params server_chat_params & chat_params; + std::map chat_template_caps; // tokens std::string bos_token_str; diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index eeaf5d2f6af..799e341d373 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -700,7 +700,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat() { json choice { {"finish_reason", finish_reason}, {"index", index}, - {"message", msg.to_json_oaicompat()}, + {"message", msg.to_json_oaicompat()}, }; if (!stream && probs_output.size() > 0) { @@ -750,7 +750,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() { json { {"finish_reason", nullptr}, {"index", 0}, - {"delta", common_chat_msg_diff_to_json_oaicompat(diff)}, + {"delta", common_chat_msg_diff_to_json_oaicompat(diff)}, }, })}, {"created", t}, @@ -1383,7 +1383,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() { } for (const auto & diff : oaicompat_msg_diffs) { - add_delta(common_chat_msg_diff_to_json_oaicompat(diff)); + add_delta(common_chat_msg_diff_to_json_oaicompat(diff)); } if (!deltas.empty()) {