From 7fcf1ef45d37f7af07f23407e1979be679532959 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 6 Feb 2026 09:25:11 +0200 Subject: [PATCH 01/21] metal : skip loading all-zero mask (#19337) * metal : skip loading all-zero mask * cont : minor --- ggml/src/ggml-metal/ggml-metal.metal | 63 +++++++++++++++++----------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index e54cdab39dd..612a42a1ea8 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -5285,6 +5285,7 @@ constant int32_t FC_flash_attn_ext_blk_ncpsg [[function_constant(FC_FLASH_ATTN_E // scan the blocks of the mask that are not masked // 0 - masked (i.e. full of -INF, skip) // 1 - not masked (i.e. at least one element of the mask is not -INF) +// 2 - all zero kernel void kernel_flash_attn_ext_blk( constant ggml_metal_kargs_flash_attn_ext_blk & args, device const char * mask, @@ -5306,27 +5307,29 @@ kernel void kernel_flash_attn_ext_blk( device const half * mask_src = (device const half *) (mask + (i1*Q)*args.nb31 + i2*args.nb32 + i3*args.nb33) + i0*C + tiisg; - // fast route - if (res == 0) { - if (simd_max(*mask_src) > -MAXHALF/2) { - res = 1; - } - } - // detailed check of the elements of the block if ((C > NW || Q > 1) && res == 0) { - half m = -MAXHALF; + half mmin = MAXHALF; + half mmax = -MAXHALF; FOR_UNROLL (short j = 0; j < Q; ++j) { FOR_UNROLL (short ii = 0; ii < C/NW; ++ii) { - m = max(m, mask_src[ii*NW]); + mmin = min(mmin, mask_src[ii*NW]); + mmax = max(mmax, mask_src[ii*NW]); } mask_src += args.nb31/2; } - if (simd_max(m) > -MAXHALF/2) { - res = 1; + mmin = simd_min(mmin); + mmax = simd_max(mmax); + + if (mmax > -MAXHALF) { + if (mmin == 0.0 && mmax == 0.0) { + res = 2; + } else { + res = 1; + } } } @@ -5568,9 +5571,13 @@ void kernel_flash_attn_ext_impl( ic = 0; } + char blk_cur = 1; + // read the mask into shared mem if (FC_flash_attn_ext_has_mask) { - if (blk[ic0] == 0) { + blk_cur = blk[ic0]; + + if (blk_cur == 0) { FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { pm2[jj] += NW; } @@ -5578,16 +5585,22 @@ void kernel_flash_attn_ext_impl( continue; } - FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { - const short j = jj*NSG + sgitg; + if (blk_cur == 1) { + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + const short j = jj*NSG + sgitg; - if (FC_flash_attn_ext_bc_mask) { - sm2[j*SH + tiisg] = (iq1 + j) < args.ne31 ? pm2[jj][tiisg] : half2(-MAXHALF, -MAXHALF); - } else { - sm2[j*SH + tiisg] = pm2[jj][tiisg]; - } + if (FC_flash_attn_ext_bc_mask) { + sm2[j*SH + tiisg] = (iq1 + j) < args.ne31 ? 
pm2[jj][tiisg] : half2(-MAXHALF, -MAXHALF); + } else { + sm2[j*SH + tiisg] = pm2[jj][tiisg]; + } - pm2[jj] += NW; + pm2[jj] += NW; + } + } else if (blk_cur == 2) { + FOR_UNROLL (short jj = 0; jj < NQ; ++jj) { + pm2[jj] += NW; + } } #if 0 @@ -5752,10 +5765,12 @@ void kernel_flash_attn_ext_impl( } // mqk = mqk + slope*mask - if (FC_flash_attn_ext_has_bias) { - s2 += s2_t(sm2[j*SH + tiisg])*slope; - } else { - s2 += s2_t(sm2[j*SH + tiisg]); + if (blk_cur != 2) { + if (FC_flash_attn_ext_has_bias) { + s2 += s2_t(sm2[j*SH + tiisg])*slope; + } else { + s2 += s2_t(sm2[j*SH + tiisg]); + } } M[jj] = simd_max(max(M[jj], max(s2[0], s2[1]))); From f9bd518a6bac615e1060dcc44f3f302f9e7ae0e8 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Fri, 6 Feb 2026 01:49:58 -0600 Subject: [PATCH 02/21] vulkan: make FA mask/softcap enables spec constants (#19309) * vulkan: make FA mask/softcap enables spec constants * don't specialize for sinks * bump timeout a little bit --- .github/workflows/build.yml | 2 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 56 ++++++++++--------- .../vulkan-shaders/flash_attn.comp | 6 +- .../vulkan-shaders/flash_attn_base.glsl | 7 ++- .../vulkan-shaders/flash_attn_cm1.comp | 6 +- .../vulkan-shaders/flash_attn_cm2.comp | 6 +- 6 files changed, 45 insertions(+), 38 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8ce679bd9ab..51a3dc76e9e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -468,7 +468,7 @@ jobs: export GGML_VK_VISIBLE_DEVICES=0 export GGML_VK_DISABLE_F16=1 # This is using llvmpipe and runs slower than other backends - ctest -L main --verbose --timeout 4200 + ctest -L main --verbose --timeout 4800 ubuntu-24-cmake-webgpu: runs-on: ubuntu-24.04 diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 4357da24d42..72097ffd0ff 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -402,19 +402,19 @@ enum FaCodePath { }; struct vk_fa_pipeline_state { - vk_fa_pipeline_state(uint32_t HSK, uint32_t HSV, bool small_rows, bool small_cache, FaCodePath path, bool aligned, bool f32acc, bool use_mask_opt) - : HSK(HSK), HSV(HSV), small_rows(small_rows), small_cache(small_cache), path(path), aligned(aligned), f32acc(f32acc), use_mask_opt(use_mask_opt) {} + vk_fa_pipeline_state(uint32_t HSK, uint32_t HSV, bool small_rows, bool small_cache, FaCodePath path, bool aligned, bool f32acc, uint32_t flags) + : HSK(HSK), HSV(HSV), small_rows(small_rows), small_cache(small_cache), path(path), aligned(aligned), f32acc(f32acc), flags(flags) {} uint32_t HSK, HSV; bool small_rows, small_cache; FaCodePath path; bool aligned; bool f32acc; - bool use_mask_opt; + uint32_t flags; bool operator<(const vk_fa_pipeline_state &b) const { - return std::tie(HSK, HSV, small_rows, small_cache, path, aligned, f32acc, use_mask_opt) < - std::tie(b.HSK, b.HSV, b.small_rows, b.small_cache, b.path, b.aligned, b.f32acc, b.use_mask_opt); + return std::tie(HSK, HSV, small_rows, small_cache, path, aligned, f32acc, flags) < + std::tie(b.HSK, b.HSV, b.small_rows, b.small_cache, b.path, b.aligned, b.f32acc, b.flags); } }; @@ -3193,7 +3193,7 @@ static void ggml_vk_load_shaders(vk_device& device) { return {fa_rows_cols(path, hsk, hsv, clamp, type, small_rows, small_cache)[0], 1, 1}; }; - auto const &fa_spec_constants = [&](FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows, bool small_cache, bool use_mask_opt) -> std::vector { + auto const 
&fa_spec_constants = [&](FaCodePath path, uint32_t hsk, uint32_t hsv, uint32_t clamp, ggml_type type, bool small_rows, bool small_cache, uint32_t flags) -> std::vector { // For large number of rows, 128 invocations seems to work best. // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we // can't use 256 for D==80. @@ -3225,7 +3225,7 @@ static void ggml_vk_load_shaders(vk_device& device) { // AMD prefers loading K directly from global memory const uint32_t k_load_shmem = device->vendor_id == VK_VENDOR_ID_NVIDIA && hsk < 256 ? 1 : 0; - return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, device->subgroup_size, k_load_shmem, use_mask_opt}; + return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split, device->subgroup_size, k_load_shmem, flags}; }; #define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \ @@ -3237,19 +3237,19 @@ static void ggml_vk_load_shaders(vk_device& device) { FaCodePath path = fa.first.path; \ bool aligned = fa.first.aligned; \ bool f32acc = fa.first.f32acc; \ - bool use_mask_opt = fa.first.use_mask_opt; \ + uint32_t flags = fa.first.flags; \ if (path == FAPATH) { \ if (aligned) { \ if (f32acc) { \ - ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache,use_mask_opt), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache,flags), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ } else { \ - ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache,use_mask_opt), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows,small_cache,flags), fa_align(FAPATH,HSK,HSV,TYPE,small_rows,small_cache), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 
device->subgroup_size : 0)); \ } \ } else { \ if (f32acc) { \ - ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache,use_mask_opt), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache,flags), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ } else { \ - ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache,use_mask_opt), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 7, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows,small_cache,flags), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? device->subgroup_size : 0)); \ } \ } \ } \ @@ -8595,10 +8595,26 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32; + float scale = 1.0f; + float max_bias = 0.0f; + float logit_softcap = 0.0f; + + memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); + memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float)); + + if (logit_softcap != 0) { + scale /= logit_softcap; + } + // Only use mask opt when the mask is fairly large. This hasn't been tuned extensively. bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768; - vk_fa_pipeline_state fa_pipeline_state(HSK, HSV, small_rows, small_cache, path, aligned, f32acc, use_mask_opt); + uint32_t flags = (use_mask_opt ? 1 : 0) | + (mask != nullptr ? 2 : 0) | + (logit_softcap != 0 ? 
4 : 0); + + vk_fa_pipeline_state fa_pipeline_state(HSK, HSV, small_rows, small_cache, path, aligned, f32acc, flags); vk_pipeline pipeline = nullptr; @@ -8678,18 +8694,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx } } - float scale = 1.0f; - float max_bias = 0.0f; - float logit_softcap = 0.0f; - - memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float)); - memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float)); - memcpy(&logit_softcap, (const float *) dst->op_params + 2, sizeof(float)); - - if (logit_softcap != 0) { - scale /= logit_softcap; - } - const uint32_t n_head_kv = neq2; const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); @@ -8703,7 +8707,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer sinks_buf = sinks ? ggml_vk_tensor_subbuffer(ctx, sinks) : q_buf; vk_subbuffer mask_opt_buf = use_mask_opt ? ggml_vk_subbuffer(ctx, ctx->prealloc_y, 0) : q_buf; - uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | ((mask != nullptr) << 16) | n_head_log2; + uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | n_head_log2; if (use_mask_opt) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 49a3c530cb6..914f131c965 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -127,7 +127,7 @@ void main() { continue; } // Only load if the block is not all zeros - if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0 && mask_opt_bits != MASK_OPT_ALL_ZERO) { + if (MASK_ENABLE && mask_opt_bits != MASK_OPT_ALL_ZERO) { bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0; [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) { @@ -181,7 +181,7 @@ void main() { } } - if (p.logit_softcap != 0.0f) { + if (LOGIT_SOFTCAP) { [[unroll]] for (uint32_t r = 0; r < Br; ++r) { [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) { Sf[r][c] = p.logit_softcap * tanh(Sf[r][c]); @@ -189,7 +189,7 @@ void main() { } } - if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0 && mask_opt_bits != MASK_OPT_ALL_ZERO) { + if (MASK_ENABLE && mask_opt_bits != MASK_OPT_ALL_ZERO) { [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) { [[unroll]] for (uint32_t r = 0; r < Br; ++r) { float mvf = masksh[c * cols_per_iter + col_tid][r]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl index 252451101ab..74005cffb3f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl @@ -10,7 +10,11 @@ layout (constant_id = 5) const uint32_t Clamp = 0; layout (constant_id = 6) const uint32_t D_split = 16; layout (constant_id = 7) const uint32_t SubGroupSize = 32; layout (constant_id = 8) const uint32_t K_LOAD_SHMEM = 0; -layout (constant_id = 9) const bool USE_MASK_OPT = false; +layout (constant_id = 9) const uint32_t Flags = 0; + +const bool USE_MASK_OPT = (Flags & 1) != 0; +const bool MASK_ENABLE = (Flags & 2) != 0; +const bool LOGIT_SOFTCAP = (Flags & 4) != 0; // Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths const uint32_t HSK_pad = (HSK + 15) & ~15; @@ -60,7 +64,6 @@ layout (push_constant) uniform parameter { } p; #define SINK_ENABLE_BIT (1<<24) -#define MASK_ENABLE_BIT (1<<16) #define N_LOG2_MASK 
0xFFFF layout (binding = 4) readonly buffer S {float data_s[];}; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index 89af3697e1d..b3177738234 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -160,7 +160,7 @@ void main() { mask_cache[idx] = f16vec4(0); } - if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { + if (MASK_ENABLE) { if (USE_MASK_OPT && mask_opt_idx != j / 16) { mask_opt_idx = j / 16; @@ -303,7 +303,7 @@ void main() { coopMatStore(SfMat, sfsh, coord, sfshstride, gl_CooperativeMatrixLayoutRowMajor); barrier(); - if (p.logit_softcap != 0.0f) { + if (LOGIT_SOFTCAP) { [[unroll]] for (uint32_t idx = 0; idx < Bc * Br / 4; idx += gl_WorkGroupSize.x) { uint32_t c = (idx + tid) / (Br / 4); uint32_t r = (idx + tid) % (Br / 4); @@ -314,7 +314,7 @@ void main() { barrier(); } - if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { + if (MASK_ENABLE) { [[unroll]] for (uint32_t idx = 0; idx < Bc * Br / 4; idx += gl_WorkGroupSize.x) { uint32_t c = (idx + tid) / (Br / 4); uint32_t r = (idx + tid) % (Br / 4); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index 47b110621b7..b07c21f6e55 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -155,7 +155,7 @@ void main() { for (uint32_t j = start_j; j < end_j; ++j) { coopmat mv = coopmat(0); - if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { + if (MASK_ENABLE) { if (USE_MASK_OPT && mask_opt_idx != j / 16) { mask_opt_idx = j / 16; @@ -197,14 +197,14 @@ void main() { coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose DECODEFUNC); S = coopMatMulAdd(Qf16, K_T, S); - if (p.logit_softcap != 0.0f) { + if (LOGIT_SOFTCAP) { [[unroll]] for (int k = 0; k < S.length(); ++k) { S[k] = ACC_TYPE(p.logit_softcap)*tanh(S[k]); } } - if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) { + if (MASK_ENABLE) { S += slopeMat*coopmat(mv); } From 1946e46f4c29da7b9294d702756969839e922bb8 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Fri, 6 Feb 2026 02:15:13 -0600 Subject: [PATCH 03/21] vulkan: For coopmat2 FA, use fp16 accumulators for the final result (#19376) The cpu and cuda backends use fp16 for the VKQ accumulator type, this change does the same for vulkan. This helps particularly with large head sizes which are very register-limited. I tried this for the coopmat1 path and it slowed down a bit. I didn't try for scalar. I applied the softmax bias that the cuda backend uses to avoid overflow, although I was not able to reproduce the original bug without it. --- .../vulkan-shaders/flash_attn_base.glsl | 4 ++++ .../vulkan-shaders/flash_attn_cm2.comp | 20 +++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl index 74005cffb3f..4142c1e6eaa 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl @@ -240,3 +240,7 @@ void init_indices() // and breaking the alignment detection. m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV; } + +// Bias applied to softmax to stay in fp16 range. 
+// Based on ggml-cuda issue https://github.com/ggml-org/llama.cpp/issues/18606 +const float FATTN_KQ_MAX_OFFSET = 3.0f*0.6931f; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index b07c21f6e55..39f0c4d23b9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -117,7 +117,7 @@ void main() { Qf16 = coopmat(Q); Qf16 *= float16_t(p.scale); - coopmat O = coopmat(0); + coopmat O = coopmat(0); coopmat L, M; @@ -223,6 +223,8 @@ void main() { coopMatReduceNV(rowmax, S, gl_CooperativeMatrixReduceRowNV, maxReduce); + rowmax += coopmat(FATTN_KQ_MAX_OFFSET); + coopmat Mold = M; // M = max(rowmax, Mold) @@ -265,11 +267,8 @@ void main() { // resize eM by using smear/reduce coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce); - // multiply with fp16 accumulation, then add to O. - coopmat PV = coopmat(0); - PV = coopMatMulAdd(P_A, V, PV); - - O = eMdiag * O + coopmat(PV); + O *= coopmat(eMdiag); + O = coopMatMulAdd(P_A, V, O); } // If there is split_k, then the split_k resolve shader does the final @@ -311,7 +310,7 @@ void main() { if (sink > Mr[i]) { ms = exp(Mr[i] - sink); - O[i] *= ms; + O[i] *= float16_t(ms); } else { vs = exp(sink - Mr[i]); } @@ -325,15 +324,16 @@ void main() { Ldiag[k] = (Ldiag[k] == 0.0) ? ACC_TYPE(0.0) : (ACC_TYPE(1.0) / Ldiag[k]); } - O = Ldiag*O; + coopmat O_D = coopmat(O); + + O_D = coopmat(Ldiag)*O_D; #if defined(ACC_TYPE_MAX) - [[unroll]] for (uint i = 0; i < O.length(); ++i) { O[i] = clamp(O[i], -ACC_TYPE_MAX, ACC_TYPE_MAX); } + [[unroll]] for (uint i = 0; i < O_D.length(); ++i) { O_D[i] = clamp(O_D[i], D_TYPE(-ACC_TYPE_MAX), D_TYPE(ACC_TYPE_MAX)); } #endif uint32_t o_offset = gqa_iq1*p.ne1*HSV + iq3*p.ne2*p.ne1*HSV; - coopmat O_D = coopmat(O); if (p.gqa_ratio > 1) { coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N); } else { From 3688c4f504f8e336663157bcc6e0af78d617420c Mon Sep 17 00:00:00 2001 From: ymcki <84055651+ymcki@users.noreply.github.com> Date: Fri, 6 Feb 2026 18:39:58 +0800 Subject: [PATCH 04/21] Kimi-Linear support (backend agnostic + MLA KV cache) (#18755) * kimi linear model implementation * kimi linear convert_hf_to_gguf * kimi linear constants.py tensor_mapping.py * Kimi Linear ggml.h * kimi linear ggml-cpu * Kimi Linear ggml-cuda * Kimi Linear ggml.c * kimi linear src/llama * remove "const int64_t n_seq_tokens = q->ne[2];" to get rid of unused variable warning * remove type mismatch warning * read MoE params * removed some hard coded code * removed all hard code * use DeepseekV2 tokenizer * removed unnecessary internal methods called by the old set_vocab of KimiLinear * rewrite get_vocab for KimiLinear. Removed all kda_scan code * removed all traces of kda_scan * reduce OP count by 1 due to removal of kda_scan * Move KIMI_LINEAR to llm_arch_is_hybrid to enable KV cache * set n_embd_head_k/v to ensure kv cache works * don't quantize conv1d of Kimi Linear * Kimi Linear backend agnostic * removed LOG_INFO * naive chunking form implemented * fixed some comments * add Kimi-K2 specific tokens to be recognized as EOG * build_kda_autoregressive is implemented to replace build_kda_recurrent for faster inference. 
sync'd to b7682 * replaced Akk and Aqk with mul_mat and clamp * no clamp version * Moved Aqk computation out of the loop * fixed typo and split wkv_b into wk_b and wv_b * MLA KV cache support * fix trailing spaces * moved const llama_model & model; around to follow qwen3next format and see if it can pass the -Wunused-private-field error * fix trailing whitespace * removed trailing whitespaces in empty line + make sure indentation is multiple of 4 * try to make lint happy * remove blank lines to make lint happy * removed at least blank line containing white space * fixed flake8 complaints locally * return ggml_tensor * pair in kda_autoregressive and kda_chunking as in ngxson's Qwen3Next improvement * removed Kimi-Linear specific change that causes failure at server-windows * removed private: from kimi_linear to make build checks happy * removed unnecessary ggml_cont before ggml_reshape * created static function causal_conv1d to abstract similar code for q/k/v * merged dt_bias to SSM_DT. Do -exp(log_A) in convert_hf_to_gguf.py. * reverted to original * fixed find_hparam calls. Fixed e_score_correction_bias to use bias instead of weight. Removed all ssm_conv bias terms. * remove DT_B from constants.py. remove one comment line in llama-model.cpp * new class llm_graph_input_mem_hybrid_k to get around the new MLA change. switch the concat order of ggml_concat calls in kimi-linear.cpp to accommodate MLA changes. Removed support for exp_probs_b.weight * remove ssm_o_norm_b * remove ssm_o_norm_b * changed hparams.kda_head_dim to hparams.n_embd_head_kda. added TODO comment for class llama_graph_mem_hybrid_k * removed all ggml_cont before ggml_reshape_4d * Whitespace * replaced all hparams.get with find_hparams * added new names for n_experts, n_experts_used and score_func in TextModel and removed their code in KimiLinear in convert_hf_to_gguf.py. Removed unnecessary ggml_cont and GGML_ASSERT in kimi-linear.cpp * use is_mla to switch between different mem_hybrid types * fixed logical errors in convert_hf_to_gguf.py pointed out by CISC * removed if else for required parameters kv_lora_rank and qk_rope_head_dim * add back ggml_cont for Vcur * minor changes * removed extra line in llama-vocab.cpp. 
Added back the comment in llama-graph.cpp * f16 gguf cannot run without context length * made a mistake of adding back n_ctx parsing --------- Co-authored-by: Piotr Wilkin (ilintar) --- convert_hf_to_gguf.py | 225 +++++++++- gguf-py/gguf/constants.py | 65 +++ gguf-py/gguf/gguf_writer.py | 3 + gguf-py/gguf/tensor_mapping.py | 32 ++ src/CMakeLists.txt | 1 + src/llama-arch.cpp | 70 +++ src/llama-arch.h | 12 + src/llama-context.cpp | 2 +- src/llama-graph.cpp | 55 +++ src/llama-graph.h | 29 ++ src/llama-hparams.cpp | 14 + src/llama-hparams.h | 3 + src/llama-model.cpp | 172 ++++++++ src/llama-model.h | 13 + src/llama-quant.cpp | 4 +- src/llama-vocab.cpp | 39 +- src/models/kimi-linear.cpp | 772 +++++++++++++++++++++++++++++++++ src/models/models.h | 27 ++ 18 files changed, 1518 insertions(+), 20 deletions(-) create mode 100644 src/models/kimi-linear.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index eb43520f98c..c167de8a465 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -586,6 +586,10 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.A_ENC_EMBD_POS, gguf.MODEL_TENSOR.ALTUP_CORRECT_COEF, gguf.MODEL_TENSOR.ALTUP_PREDICT_COEF, + # Kimi KDA conv weights should be F32 + gguf.MODEL_TENSOR.SSM_CONV1D_Q, + gguf.MODEL_TENSOR.SSM_CONV1D_K, + gguf.MODEL_TENSOR.SSM_CONV1D_V, ) ) or new_name[-7:] not in (".weight", ".lora_a", ".lora_b") @@ -903,10 +907,10 @@ def set_gguf_parameters(self): if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.hparams.get("num_local_experts")) is not None: + if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None: self.gguf_writer.add_expert_count(n_experts) logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) logger.info(f"gguf: experts used count = {n_experts_used}") if (n_expert_groups := self.hparams.get("n_group")) is not None: @@ -916,7 +920,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_group_used_count(n_group_used) logger.info(f"gguf: expert groups used count = {n_group_used}") - if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func"], optional=True)) is not None: + if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True)) is not None: if score_func == "sigmoid": self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) elif score_func == "softmax": @@ -5013,6 +5017,221 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_factor(1.0) +@ModelBase.register("KimiLinearModel", "KimiLinearForCausalLM") +class KimiLinearModel(TextModel): + """Kimi-Linear model with hybrid MLA+KDA architecture""" + model_arch = gguf.MODEL_ARCH.KIMI_LINEAR + + _experts: list[dict[str, Tensor]] | None = None + + def set_vocab(self): + try: + self._set_vocab_gpt2() + return + except Exception: + pass + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + tokpre = self.get_vocab_base_pre(tokenizer) + + if tokpre == "kimi-k2": + # Build merges list using the 
approach similar to HunYuanMoE + merges = [] + vocab = {} + mergeable_ranks = tokenizer.model._mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + # Build token list + vocab_size = self.hparams["vocab_size"] + special_tokens = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} + tokens: list[str] = [] + toktypes: list[int] = [] + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token = reverse_vocab[i] + tokens.append(token) + if i in special_tokens.values(): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.NORMAL) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + # override eos id in config.json with tiktoken eos id + self.gguf_writer.add_eos_token_id(tokenizer.eos_id) + else: + raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!") + + def set_gguf_parameters(self): + # note: To enable MLA KV cache, attention needs to be converted into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + # KDA & MLA params + # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv + linear_attn_config = self.hparams["linear_attn_config"] + # n_head == 0 for KDA layers, n_head > 0 for MLA layers + # full_attention_layers list will be used to distingush layer type + _num_kv_heads = list() + _full_attn_layers = linear_attn_config["full_attn_layers"] + for il in range(self.hparams["num_hidden_layers"]): + if il + 1 in _full_attn_layers: + _num_kv_heads.append(self.hparams["num_key_value_heads"]) + else: + _num_kv_heads.append(0) + assert len(_num_kv_heads) == self.hparams["num_hidden_layers"] + self.gguf_writer.add_head_count_kv(_num_kv_heads) + + if (ssm_d_conv := linear_attn_config.get("short_conv_kernel_size")) is not None: + self.gguf_writer.add_ssm_conv_kernel(ssm_d_conv) + if (kda_head_dim := linear_attn_config.get("head_dim")) is not None: + self.gguf_writer.add_kda_head_dim(kda_head_dim) + + # MLA params - use add_* methods that handle arch substitution + # Support both HuggingFace naming (q_lora_rank, kv_lora_rank) and internal naming (n_lora_q, n_lora_kv) + if (q_lora_rank := self.find_hparam(["q_lora_rank", "n_lora_q"], optional=True)) is not None: + self.gguf_writer.add_q_lora_rank(q_lora_rank) + # To enable MLA KV cache, MLA needs to be converted into MQA with larger heads, then decompresses to MHA + kv_lora_rank = self.find_hparam(["kv_lora_rank", "n_lora_kv"], optional=False) + self.gguf_writer.add_kv_lora_rank(kv_lora_rank) + + # MLA head dimensions + # Support HuggingFace naming: qk_nope_head_dim, qk_rope_head_dim, v_head_dim + qk_nope_head_dim = self.hparams.get("qk_nope_head_dim") + # Rotation - use qk_rope_head_dim for Kimi + qk_rope_head_dim = self.find_hparam(["qk_rope_head_dim", 
"n_rot"], optional=False) + self.gguf_writer.add_rope_dimension_count(qk_rope_head_dim) + self.gguf_writer.add_key_length(kv_lora_rank + qk_rope_head_dim) + v_head_dim = self.hparams.get("v_head_dim") + + # Calculate n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + if (n_embd_head_k_mla := self.find_hparam(["n_embd_head_k_mla"], optional=True)) is not None: + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) + elif qk_nope_head_dim is not None: + n_embd_head_k_mla = qk_nope_head_dim + qk_rope_head_dim + self.gguf_writer.add_key_length_mla(n_embd_head_k_mla) + + # n_embd_head_v_mla = v_head_dim + if (n_embd_head_v_mla := self.hparams.get("n_embd_head_v_mla")) is not None: + self.gguf_writer.add_value_length_mla(n_embd_head_v_mla) + elif v_head_dim is not None: + self.gguf_writer.add_value_length_mla(v_head_dim) + + # moe_intermediate_size (1024 for Kimi) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + # num_shared_experts (1 for Kimi) + self.gguf_writer.add_expert_shared_count(self.hparams["num_shared_experts"]) + # first_k_dense_replace (1 for Kimi - first layer uses dense MLP) + self.gguf_writer.add_leading_dense_block_count(self.hparams["first_k_dense_replace"]) + # Routed scaling factor (expert_weights_scale = 2.446 for Kimi) + self.gguf_writer.add_expert_weights_scale(self.hparams["routed_scaling_factor"]) + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + logger.info(f"Processing {name}: shape before = {tuple(data_torch.shape)}") + + # Handle KDA conv1d weights + # HuggingFace/vLLM stores as [d_inner, d_conv] (2D), memory layout: conv_step changes fastest + # llama.cpp expects ggml ne = [d_conv, 1, d_inner, 1], memory layout: ne[0]=d_conv changes fastest + # GGUF reverses numpy shape when writing, so numpy (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] + # Memory layouts match: both have conv_step (d_conv) changing fastest + if name.endswith((".q_conv1d.weight", ".k_conv1d.weight", ".v_conv1d.weight")): + # HF shape: [d_inner, d_conv] e.g. 
[4096, 4] + # Target numpy shape: (1, d_inner, 1, d_conv) -> ggml ne = [d_conv, 1, d_inner, 1] + if data_torch.ndim == 2: + d_inner, d_conv = data_torch.shape + # Reshape to (1, d_inner, 1, d_conv) - memory layout preserved (d_conv fastest) + data_torch = data_torch.reshape(1, d_inner, 1, d_conv) + logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") + elif data_torch.ndim == 3: + # Already 3D [d_inner, 1, d_conv] from unsqueeze + d_inner, _, d_conv = data_torch.shape + data_torch = data_torch.reshape(1, d_inner, 1, d_conv) + logger.info(f"Reshaped conv1d weight {name}: [d_inner={d_inner}, 1, d_conv={d_conv}] -> numpy {tuple(data_torch.shape)} -> ggml ne=[{d_conv}, 1, {d_inner}, 1]") + + # Kimi specific bias + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + + # Handle A_log: iHF stores as [1, 1, num_heads, 1] + # llama.cpp expects ggml ne = [1, num_heads, 1, 1] + # GGUF reverses numpy shape: numpy (1, 1, num_heads, 1) -> ggml ne = [1, num_heads, 1, 1] + if name.endswith(".A_log"): + data_torch = -torch.exp(data_torch) + if name.endswith(".dt_bias"): + name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias" + logger.info("Changed dt_bias to dt_proj.bias") + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=False) + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + # w1: gate, w2: down, w3: up + for wid, tname in [("w1", gguf.MODEL_TENSOR.FFN_GATE_EXP), + ("w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP), + ("w3", gguf.MODEL_TENSOR.FFN_UP_EXP)]: + datas: list[Tensor] = [] + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + data_torch = torch.stack(datas, dim=0) + new_name = self.format_tensor_name(tname, bid) + yield from super().modify_tensors(data_torch, new_name, bid) + return + + # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.find_hparam(["n_embd_head_v_mla", "v_head_dim"], optional=False) + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + logger.info("Split kv_b n_head_kv %d\n" % n_head_kv) + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + yield from super().modify_tensors(k_b, name_kb, bid) + yield from super().modify_tensors(v_b, name_vb, bid) + return + + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 6f56d36c59f..3ddbc73d1cc 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -207,6 +207,9 @@ 
class SSM: GROUP_COUNT = "{arch}.ssm.group_count" DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" + class KDA: + HEAD_DIM = "{arch}.kda.head_dim" + class WKV: HEAD_SIZE = "{arch}.wkv.head_size" @@ -461,6 +464,7 @@ class MODEL_ARCH(IntEnum): MIMO2 = auto() LLAMA_EMBED = auto() MAINCODER = auto() + KIMI_LINEAR = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -551,6 +555,14 @@ class MODEL_TENSOR(IntEnum): SSM_NORM = auto() SSM_OUT = auto() SSM_BETA_ALPHA = auto() # qwen3next + SSM_CONV1D_Q = auto() # Kimi Linear + SSM_CONV1D_K = auto() # Kimi Linear + SSM_CONV1D_V = auto() # Kimi Linear + SSM_F_A = auto() # Kimi Linear + SSM_F_B = auto() # Kimi Linear + SSM_BETA = auto() # Kimi Linear + SSM_G_A = auto() # Kimi Linear + SSM_G_B = auto() # Kimi Linear TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() @@ -882,6 +894,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.MIMO2: "mimo2", MODEL_ARCH.LLAMA_EMBED: "llama-embed", MODEL_ARCH.MAINCODER: "maincoder", + MODEL_ARCH.KIMI_LINEAR: "kimi-linear", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -969,6 +982,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", MODEL_TENSOR.SSM_BETA_ALPHA: "blk.{bid}.ssm_ba", + MODEL_TENSOR.SSM_CONV1D_Q: "blk.{bid}.ssm_conv1d_q", # Kimi Linear + MODEL_TENSOR.SSM_CONV1D_K: "blk.{bid}.ssm_conv1d_k", # Kimi Linear + MODEL_TENSOR.SSM_CONV1D_V: "blk.{bid}.ssm_conv1d_v", # Kimi Linear + MODEL_TENSOR.SSM_F_A: "blk.{bid}.ssm_f_a", # Kimi Linear + MODEL_TENSOR.SSM_F_B: "blk.{bid}.ssm_f_b", # Kimi Linear + MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear + MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear + MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -3379,6 +3400,47 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.KIMI_LINEAR: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.SSM_CONV1D_Q, + MODEL_TENSOR.SSM_CONV1D_K, + MODEL_TENSOR.SSM_CONV1D_V, + MODEL_TENSOR.SSM_F_A, + MODEL_TENSOR.SSM_F_B, + MODEL_TENSOR.SSM_BETA, + MODEL_TENSOR.SSM_A, + MODEL_TENSOR.SSM_G_A, + MODEL_TENSOR.SSM_G_B, + MODEL_TENSOR.SSM_DT, + MODEL_TENSOR.SSM_NORM, + MODEL_TENSOR.FFN_EXP_PROBS_B, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], # TODO } @@ -3706,6 +3768,9 @@ class VisionProjectorType: KEY_SSM_GROUP_COUNT = Keys.SSM.GROUP_COUNT KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS +# KDA +KEY_KDA_HEAD_DIM = Keys.KDA.HEAD_DIM + # tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 0b9c650161e..f720aa2d54a 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -980,6 +980,9 
@@ def add_ssm_group_count(self, value: int) -> None: def add_ssm_dt_b_c_rms(self, value: bool) -> None: self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value) + def add_kda_head_dim(self, value: int) -> None: + self.add_uint32(Keys.KDA.HEAD_DIM.format(arch=self.arch), value) + def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 84aa8688092..e16c06c2a3c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -438,6 +438,7 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.e_score_correction", # minimax-m2 "backbone.layers.{bid}.mixer.gate.e_score_correction", # nemotron-h-moe "model.layers.{bid}.mlp.e_score_correction", # exaone-moe + "model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi ), # Feed-forward up @@ -502,6 +503,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan "layers.{bid}.shared_experts.w3", # mistral-large "backbone.layers.{bid}.mixer.shared_experts.up_proj", # nemotron-h-moe + "model.layers.{bid}.block_sparse_moe.shared_experts.up_proj", # kimi ), MODEL_TENSOR.FFN_UP_CHEXP: ( @@ -549,6 +551,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4 "model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan "layers.{bid}.shared_experts.w1", # mistral-large + "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi ), MODEL_TENSOR.FFN_GATE_CHEXP: ( @@ -613,6 +616,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan "layers.{bid}.shared_experts.w2", # mistral-large "backbone.layers.{bid}.mixer.shared_experts.down_proj", # nemotron-h-moe + "model.layers.{bid}.block_sparse_moe.shared_experts.down_proj", # kimi ), MODEL_TENSOR.FFN_DOWN_CHEXP: ( @@ -759,6 +763,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mixer.dt_proj", # plamo2 "model.layers.{bid}.linear_attn.dt_proj", # qwen3next "backbone.layers.{bid}.mixer.dt", # nemotron-h-moe + "model.layers.{bid}.self_attn.dt_proj", # kimi ), MODEL_TENSOR.SSM_DT_NORM: ( @@ -772,6 +777,7 @@ class TensorNameMap: "model.layers.{bid}.mamba.A_log", # jamba falcon-h1 granite-hybrid "model.layers.layers.{bid}.mixer.A_log", # plamo2 "model.layers.{bid}.linear_attn.A_log", # qwen3next + "model.layers.{bid}.self_attn.A_log", # kimi ), MODEL_TENSOR.SSM_B_NORM: ( @@ -797,6 +803,7 @@ class TensorNameMap: "model.layers.{bid}.mamba.norm", # falcon-h1 granite-hybrid "model.layers.{bid}.linear_attn.norm", # qwen3next "backbone.layers.{bid}.mixer.norm", # mamba2 + "model.layers.{bid}.self_attn.o_norm", # kimi ), MODEL_TENSOR.SSM_OUT: ( @@ -811,6 +818,31 @@ class TensorNameMap: "model.layers.{bid}.linear_attn.in_proj_ba", # qwen3next ), + # Kimi Linear KDA (using SSM_ prefix for consistency) + MODEL_TENSOR.SSM_CONV1D_Q: ( + "model.layers.{bid}.self_attn.q_conv1d", + ), + MODEL_TENSOR.SSM_CONV1D_K: ( + "model.layers.{bid}.self_attn.k_conv1d", + ), + MODEL_TENSOR.SSM_CONV1D_V: ( + "model.layers.{bid}.self_attn.v_conv1d", + ), + MODEL_TENSOR.SSM_F_A: ( + "model.layers.{bid}.self_attn.f_a_proj", + ), + MODEL_TENSOR.SSM_F_B: ( + "model.layers.{bid}.self_attn.f_b_proj", + ), + MODEL_TENSOR.SSM_BETA: ( + "model.layers.{bid}.self_attn.b_proj", + ), + MODEL_TENSOR.SSM_G_A: ( + "model.layers.{bid}.self_attn.g_a_proj", + ), + MODEL_TENSOR.SSM_G_B: ( + "model.layers.{bid}.self_attn.g_b_proj", + ), MODEL_TENSOR.TIME_MIX_W0: ( "model.layers.{bid}.attention.w0", # rwkv7 ), 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bedfa1bc3d9..5238a5e934d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -84,6 +84,7 @@ add_library(llama models/internlm2.cpp models/jais.cpp models/jamba.cpp + models/kimi-linear.cpp models/lfm2.cpp models/llada-moe.cpp models/llada.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index a54bc1956ae..a8bf1c9b80c 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -120,6 +120,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_MIMO2, "mimo2" }, { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, { LLM_ARCH_MAINCODER, "maincoder" }, + { LLM_ARCH_KIMI_LINEAR, "kimi-linear" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -246,6 +247,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" }, { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, + { LLM_KV_KDA_HEAD_DIM, "%s.kda.head_dim" }, + { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" }, @@ -371,6 +374,15 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" }, { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" }, { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" }, + { LLM_TENSOR_SSM_CONV1D_Q, "blk.%d.ssm_conv1d_q" }, + { LLM_TENSOR_SSM_CONV1D_K, "blk.%d.ssm_conv1d_k" }, + { LLM_TENSOR_SSM_CONV1D_V, "blk.%d.ssm_conv1d_v" }, + { LLM_TENSOR_SSM_F_A, "blk.%d.ssm_f_a" }, + { LLM_TENSOR_SSM_F_B, "blk.%d.ssm_f_b" }, + { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" }, + { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, + { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, + { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, @@ -2289,6 +2301,54 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, }; + case LLM_ARCH_KIMI_LINEAR: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + // Dense FFN (layer 0 only) + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + // MoE FFN (layers 1+) + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_EXP_PROBS_B, + // Shared experts + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + // KDA (using SSM_ enum prefix, keeping GGUF names for backward compat) + LLM_TENSOR_SSM_CONV1D_Q, + LLM_TENSOR_SSM_CONV1D_K, + LLM_TENSOR_SSM_CONV1D_V, + LLM_TENSOR_SSM_F_A, + LLM_TENSOR_SSM_F_B, + LLM_TENSOR_SSM_BETA, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_G_A, + LLM_TENSOR_SSM_G_B, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_NORM, + // MLA + LLM_TENSOR_ATTN_Q_A, + LLM_TENSOR_ATTN_Q_B, + LLM_TENSOR_ATTN_Q_A_NORM, + LLM_TENSOR_ATTN_KV_A_MQA, + LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_K_B, + LLM_TENSOR_ATTN_V_B, + LLM_TENSOR_ATTN_KV_A_NORM, + }; default: GGML_ABORT("unknown architecture for tensor mapping"); } @@ -2392,6 +2452,15 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + // Kimi KDA - Conv tensors are 4D [d_conv, 1, d_inner, 1], reshaped to 2D at runtime + 
{LLM_TENSOR_SSM_CONV1D_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_CONV1D_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_CONV1D_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_F_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_F_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_BETA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -2573,6 +2642,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_QWEN3NEXT: + case LLM_ARCH_KIMI_LINEAR: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index 270d28b16a4..f092f728344 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -124,6 +124,7 @@ enum llm_arch { LLM_ARCH_MIMO2, LLM_ARCH_LLAMA_EMBED, LLM_ARCH_MAINCODER, + LLM_ARCH_KIMI_LINEAR, LLM_ARCH_UNKNOWN, }; @@ -250,6 +251,8 @@ enum llm_kv { LLM_KV_SSM_GROUP_COUNT, LLM_KV_SSM_DT_B_C_RMS, + LLM_KV_KDA_HEAD_DIM, + LLM_KV_WKV_HEAD_SIZE, LLM_KV_TOKENIZER_MODEL, @@ -398,6 +401,15 @@ enum llm_tensor { LLM_TENSOR_SSM_NORM, LLM_TENSOR_SSM_OUT, LLM_TENSOR_SSM_BETA_ALPHA, // qwen3next + // Kimi Linear KDA (using SSM_ prefix for consistency) + LLM_TENSOR_SSM_CONV1D_Q, // kimi: Q conv1d weight + LLM_TENSOR_SSM_CONV1D_K, // kimi: K conv1d weight + LLM_TENSOR_SSM_CONV1D_V, // kimi: V conv1d weight + LLM_TENSOR_SSM_F_A, // kimi: forget gate projection A + LLM_TENSOR_SSM_F_B, // kimi: forget gate projection B + LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient + LLM_TENSOR_SSM_G_A, // kimi: output gate projection A + LLM_TENSOR_SSM_G_B, // kimi: output gate projection B LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W2, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 95b207e9e11..a6df893a311 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2013,7 +2013,7 @@ void llama_context::output_reorder() { // uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const { - if (model.arch == LLM_ARCH_QWEN3NEXT) { + if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR) { return std::max(n_tokens * 40, 32u * model.n_tensors()); } uint32_t res = std::max(1024u, 8u*model.n_tensors()); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 54f4ed24812..165cbc0a7d6 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -533,6 +533,50 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) { return res; } +// TODO: Hybrid input classes are a bit redundant. +// Instead of creating a hybrid input, the graph can simply create 2 separate inputs. +// Refactoring is required in the future. 
+void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) { + mctx->get_attn()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); + + mctx->get_attn()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); + + const int64_t n_rs = mctx->get_recr()->get_n_rs(); + + if (inp_rs->s_copy) { + GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer)); + int32_t * data = (int32_t *) inp_rs->s_copy->data; + + // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n + for (uint32_t i = 0; i < n_rs; ++i) { + data[i] = mctx->get_recr()->s_copy(i); + } + } +} + +bool llm_graph_input_mem_hybrid_k::can_reuse(const llm_graph_params & params) { + const auto * mctx = static_cast(params.mctx); + + this->mctx = mctx; + + bool res = true; + + res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens; + + res &= inp_attn->self_kq_mask->ne[0] == mctx->get_attn()->get_n_kv(); + res &= inp_attn->self_kq_mask->ne[1] == params.ubatch.n_tokens; + + res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); + + res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; + res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs; + + res &= inp_rs->head == mctx->get_recr()->get_head(); + res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z(); + + return res; +} + void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { const auto * attn_ctx = mctx->get_attn(); @@ -2268,6 +2312,17 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const { return (llm_graph_input_mem_hybrid *) res->add_input(std::move(inp)); } +llm_graph_input_mem_hybrid_k * llm_graph_context::build_inp_mem_hybrid_k() const { + const auto * mctx_cur = static_cast(mctx); + + auto inp_rs = build_rs_inp_impl (ctx0, ubatch, mctx_cur->get_recr()); + auto inp_attn = build_attn_inp_k_impl(ctx0, ubatch, hparams, cparams, mctx_cur->get_attn()); + + auto inp = std::make_unique(cparams, std::move(inp_attn), std::move(inp_rs), mctx_cur); + + return (llm_graph_input_mem_hybrid_k *) res->add_input(std::move(inp)); +} + llm_graph_input_mem_hybrid_iswa * llm_graph_context::build_inp_mem_hybrid_iswa() const { const auto * mctx_cur = static_cast(mctx); diff --git a/src/llama-graph.h b/src/llama-graph.h index 4090d8116c9..1d69ff1a6fc 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -433,6 +433,34 @@ class llm_graph_input_mem_hybrid : public llm_graph_input_i { const llama_memory_hybrid_context * mctx; }; +class llm_graph_input_mem_hybrid_k : public llm_graph_input_i { +public: + llm_graph_input_mem_hybrid_k( + const llama_cparams & cparams, + std::unique_ptr inp_attn, + std::unique_ptr inp_rs, + const llama_memory_hybrid_context * mctx) : + inp_attn(std::move(inp_attn)), + inp_rs(std::move(inp_rs)), + cparams(cparams), + mctx(mctx) { } + virtual ~llm_graph_input_mem_hybrid_k() = default; + + void set_input(const llama_ubatch * ubatch) override; + + bool can_reuse(const llm_graph_params & params) override; + + std::unique_ptr inp_attn; + std::unique_ptr inp_rs; + + llm_graph_input_attn_k * get_attn() const { return inp_attn.get(); } + llm_graph_input_rs * get_recr() const { return inp_rs.get(); } + + const llama_cparams cparams; + + const llama_memory_hybrid_context * mctx; +}; + class llm_graph_input_mem_hybrid_iswa : public llm_graph_input_i { public: llm_graph_input_mem_hybrid_iswa( @@ -960,6 +988,7 @@ struct llm_graph_context { // llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const; + 
llm_graph_input_mem_hybrid_k * build_inp_mem_hybrid_k() const; llm_graph_input_mem_hybrid_iswa * build_inp_mem_hybrid_iswa() const; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 392f9160cef..756dda1a7ab 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -139,6 +139,13 @@ uint32_t llama_hparams::n_embd_r() const { return n_embd * (n_shortconv_l_cache - 1); } + if (n_embd_head_kda != 0) { + // for Kimi KDA layers + // Conv state for Q, K, V: 3 * (d_conv - 1) * n_head * head_dim + const uint32_t d_inner = n_head() * n_embd_head_kda; // 32 * 128 = 4096 + return 3 * (ssm_d_conv > 0 ? ssm_d_conv - 1 : 3) * d_inner; + } + // TODO: maybe support other convolution strides than 1 // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed // Corresponds to Mamba's conv_states size @@ -151,6 +158,13 @@ uint32_t llama_hparams::n_embd_s() const { return n_embd * wkv_head_size; } + if (n_embd_head_kda != 0) { + // for Kimi KDA layers + // Full recurrent state: head_dim * head_dim * n_head + // h tensor shape for delta attention: [head_dim, head_dim, n_head] + return n_embd_head_kda * n_embd_head_kda * n_head(); // 128 * 128 * 32 = 524288 + } + // corresponds to Mamba's ssm_states size return ssm_d_state * ssm_d_inner; } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index dfbc7d95e9b..a435043cfec 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -137,6 +137,9 @@ struct llama_hparams { uint32_t ssm_dt_rank = 0; uint32_t ssm_n_group = 0; + // for Kimi Linear KDA + uint32_t n_embd_head_kda = 0; + // for hybrid state space models std::array recurrent_layer_arr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 72490a89b56..765e4de2e49 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -125,6 +125,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_21B_A3B: return "21B.A3B"; case LLM_TYPE_30B_A3B: return "30B.A3B"; case LLM_TYPE_31B_A3_5B: return "31B.A3.5B"; + case LLM_TYPE_48B_A3B: return "48B.A3B"; case LLM_TYPE_80B_A3B: return "80B.A3B"; case LLM_TYPE_100B_A6B: return "100B.A6B"; case LLM_TYPE_102B_A12B: return "102B.A12B"; @@ -2450,6 +2451,37 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_KIMI_LINEAR: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda); + + // MLA qk_rope_head_dim (for reference) + // qk_rope_head_dim = 64, qk_nope_head_dim = 128, qk_head_dim = 192 + + // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba) + // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention) + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent + } + + // MoE parameters - Kimi uses moe_intermediate_size = 1024 + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, 
hparams.expert_weights_scale); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); + + switch (hparams.n_layer) { + case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -6752,6 +6784,141 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0); } } break; + case LLM_ARCH_KIMI_LINEAR: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // Check for KDA specific tensors to determine layer type or if it's a mixed model + // Assuming KDA layer if KDA tensors are present + + // KDA uses head_dim = 128 (from linear_attn_config.head_dim) + const int64_t n_embd_head_k_kda = hparams.n_embd_head_kda; + const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda; + const int64_t ssm_d_conv = hparams.ssm_d_conv; + + // Try loading KDA specific tensors (using SSM_ prefix) + // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1) + // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner] + layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_q_conv) { + layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, TENSOR_NOT_REQUIRED); + } + + if (layer.ssm_q_conv) { + // KDA Layer - Conv1d weights may be 3D or 4D + layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_k_conv) { + layer.ssm_k_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_K, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head}, 0); + } + layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_v_conv) { + layer.ssm_v_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_V, "weight", i), {ssm_d_conv, 1, n_embd_head_v_kda * n_head}, 0); + } + + // q, k, v projections + // Python: q_proj, k_proj, v_proj + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k_kda * n_head}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_v_kda * n_head}, 0); + + // KDA specific projections + // f_a_proj, f_b_proj + layer.ssm_f_a = create_tensor(tn(LLM_TENSOR_SSM_F_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); // head_dim + layer.ssm_f_b = create_tensor(tn(LLM_TENSOR_SSM_F_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); // projection_size + + // b_proj (beta mixing coefficient) + layer.ssm_beta = create_tensor(tn(LLM_TENSOR_SSM_BETA, "weight", i), {n_embd, n_head}, 0); + + // A_log - Shape in GGUF: [1, num_heads, 1, 1] (4D) or [1, num_heads] (2D after quantization) Note: -exp(A_log) is applied in convert_hf_to_gguf.py + layer.ssm_a = 
create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head, 1, 1}, TENSOR_NOT_REQUIRED); + if (!layer.ssm_a) { + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0); + } + + // dt_bias - shape [n_embd_head_k_kda * n_head] = [4096] + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_embd_head_k_kda * n_head}, 0); + + // g_a_proj, g_b_proj (output gate) + layer.ssm_g_a = create_tensor(tn(LLM_TENSOR_SSM_G_A, "weight", i), {n_embd, n_embd_head_k_kda}, 0); + layer.ssm_g_b = create_tensor(tn(LLM_TENSOR_SSM_G_B, "weight", i), {n_embd_head_k_kda, n_embd_head_k_kda * n_head}, 0); + + // o_norm (reusing SSM_NORM) + layer.ssm_o_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {n_embd_head_k_kda}, 0); // FusedRMSNormGated + + // o_proj + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v_kda * n_head, n_embd}, 0); + + } else { + // MLA Layer - use MLA-specific head dimensions + const int64_t q_lora_rank = hparams.n_lora_q; + const int64_t kv_lora_rank = hparams.n_lora_kv; + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); + + layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, TENSOR_NOT_REQUIRED); + layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0); + + if (layer.attn_q_a_norm) { + layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0); + layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0); + } else { + // Kimi MLA without Q compression: wq = [n_embd, n_head * n_embd_head_k_mla] + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0); + } + + // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA) + // Note: hparams.n_rot may be 72 (from conversion) but actual is 64 + const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim + layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0); + // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled) + layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_k_mla - qk_rope_head_dim + n_embd_head_v_mla)}, TENSOR_NOT_REQUIRED); + if (!layer.wkv_b) { // MLA KV cache enabled + layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_k_mla - qk_rope_head_dim, kv_lora_rank, n_head}, 0); + layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0); + } + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0); + } + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + // MoE intermediate size (different from dense FFN) + const int64_t n_ff_exp = hparams.n_ff_exp; + + // Kimi uses n_layer_dense_lead to determine which layers use dense FFN vs MoE + // first_k_dense_replace = 1 means layer 0 uses dense FFN, layers 1+ use MoE + if (i < (int) hparams.n_layer_dense_lead) { + // Dense FFN layer - use normal n_ff + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } else { + // MoE 
layer - use n_ff_exp (1024) instead of n_ff (9216) + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0); + + // Shared experts use moe_intermediate_size * num_shared_experts + // Kimi: shared_expert_intermediate_size = 1024 * 1 = 1024 + // Tensors are 2D: [n_embd, n_ff_shexp] or [n_ff_shexp, n_embd] + const int64_t n_ff_shexp_actual = n_ff_exp * (hparams.n_expert_shared > 0 ? hparams.n_expert_shared : 1); + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp_actual, n_embd}, TENSOR_NOT_REQUIRED); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp_actual}, TENSOR_NOT_REQUIRED); + + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0); + } + } + } break; case LLM_ARCH_COGVLM: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8086,6 +8253,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_KIMI_LINEAR: + { + llm = std::make_unique(*this, params); + } break; default: GGML_ABORT("fatal error"); } @@ -8235,6 +8406,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: + case LLM_ARCH_KIMI_LINEAR: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values diff --git a/src/llama-model.h b/src/llama-model.h index d1de16e3f28..5b408bcea25 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -118,6 +118,7 @@ enum llm_type { LLM_TYPE_21B_A3B, // Ernie MoE small LLM_TYPE_30B_A3B, LLM_TYPE_31B_A3_5B, + LLM_TYPE_48B_A3B, // Kimi Linear LLM_TYPE_80B_A3B, // Qwen3 Next LLM_TYPE_100B_A6B, LLM_TYPE_102B_A12B, // Solar-Open @@ -411,6 +412,18 @@ struct llama_layer { struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_eps = nullptr; + // Kimi Linear KDA (using ssm_ prefix for consistency) + // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias + struct ggml_tensor * ssm_q_conv = nullptr; + struct ggml_tensor * ssm_k_conv = nullptr; + struct ggml_tensor * ssm_v_conv = nullptr; + struct ggml_tensor * ssm_f_a = nullptr; + struct ggml_tensor * ssm_f_b = nullptr; + struct ggml_tensor * ssm_beta = nullptr; + struct ggml_tensor * ssm_g_a = nullptr; + struct ggml_tensor * ssm_g_b = nullptr; + struct ggml_tensor * ssm_o_norm = nullptr; + struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 776222cb6f2..a7891647c3d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -787,9 +787,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - // do not quantize Mamba's small yet 2D 
weights + // do not quantize Mamba /Kimi's small conv1d weights // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d.weight") == std::string::npos; + quantize &= name.find("ssm_conv1d") == std::string::npos; quantize &= name.find("shortconv.conv.weight") == std::string::npos; // do not quantize RWKV's small yet 2D weights diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 38d03a8c39b..6d6bdfa090c 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1752,26 +1752,33 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { // read bpe merges and populate bpe ranks const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); + // Kimi-K2 uses custom tokenization without traditional BPE merges + const bool is_kimi_k2 = (tokenizer_pre == "kimi-k2"); + if (merges_keyidx == -1) { - throw std::runtime_error("cannot find tokenizer merges in model file\n"); - } + if (!is_kimi_k2) { + throw std::runtime_error("cannot find tokenizer merges in model file\n"); + } + // Kimi-K2 doesn't need merges, skip + LLAMA_LOG_INFO("%s: Kimi-K2 tokenizer detected, skipping BPE merges\n", __func__); + } else { + const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); + for (int i = 0; i < n_merges; i++) { + const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); + //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); - const int n_merges = gguf_get_arr_n(ctx, merges_keyidx); - for (int i = 0; i < n_merges; i++) { - const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i); - //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); + std::string first; + std::string second; - std::string first; - std::string second; + const size_t pos = word.find(' ', 1); - const size_t pos = word.find(' ', 1); + if (pos != std::string::npos) { + first = word.substr(0, pos); + second = word.substr(pos + 1); + } - if (pos != std::string::npos) { - first = word.substr(0, pos); - second = word.substr(pos + 1); + bpe_ranks.emplace(std::make_pair(first, second), i); } - - bpe_ranks.emplace(std::make_pair(first, second), i); } // default special tokens @@ -2226,6 +2233,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "<|end_of_text|>" // granite || t.first == "" || t.first == "_" + || t.first == "[EOT]" // Kimi-K2 || t.first == "<|end▁of▁sentence|>" // DeepSeek || t.first == "" // smoldocling ) { @@ -2322,6 +2330,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "" || t.first == "" // Granite || t.first == "" + || t.first == "[PAD]" // Kimi-K2 ) { special_fim_pad_id = t.second; if ((attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { @@ -2424,6 +2433,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "<|eom_id|>" || t.first == "" || t.first == "_" + || t.first == "[EOT]" // Kimi-K2 + || t.first == "[EOS]" // Kimi-K2 || t.first == "<|end_of_text|>" || t.first == "" // smoldocling ) { diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp new file mode 100644 index 00000000000..0f037d1a393 --- /dev/null +++ b/src/models/kimi-linear.cpp @@ -0,0 +1,772 @@ +#include "models.h" +#include "ggml.h" + +#define CHUNK_SIZE 64 + +// Causal Conv1d function for Q,K,V +// When qkv is 0, it is Q, 1 is K, 2 is V +static ggml_tensor * causal_conv1d(ggml_cgraph * gf, ggml_context * ctx0, ggml_tensor * conv_states_all, ggml_tensor * conv_state_all, int64_t qkv, ggml_tensor * x, ggml_tensor * 
proj_w, ggml_tensor * conv_w, int64_t d_conv, int64_t head_dim, int64_t n_head, int64_t n_seq_tokens, int64_t n_seqs, int64_t n_tokens, int64_t kv_head) { + const int64_t d_inner = head_dim * n_head; + const int64_t conv_state_size = (d_conv - 1) * d_inner; + const int64_t n_embd_r_total = 3 * conv_state_size; // Q + K + V + + // conv_state_all is [n_embd_r_total, n_seqs], split into Q, K, V + // Each conv state is [(d_conv-1) * d_inner] per sequence, need to reshape to [d_conv-1, d_inner, n_seqs] + // Memory layout: for each seq, Q state is first conv_state_size elements, then K, then V + // conv_state_all has stride: nb[0] = element_size, nb[1] = n_embd_r_total * element_size + // View Q conv state: offset 0, size conv_state_size per seq + // conv_state_all is [n_embd_r_total, n_seqs] with memory layout: + // state[i + seq * n_embd_r_total] where i = conv_step + channel * (d_conv-1) + {0, conv_state_size, 2*conv_state_size} for Q/K/V + // We want [d_conv-1, d_inner, n_seqs] view: + // nb1 = (d_conv-1) * element_size (stride between channels) + // nb2 = n_embd_r_total * element_size (stride between seqs) + ggml_tensor * conv_state_x = ggml_view_3d(ctx0, conv_state_all, d_conv - 1, d_inner, n_seqs, + (d_conv - 1) * ggml_element_size(conv_state_all), // nb1: stride between channels + n_embd_r_total * ggml_element_size(conv_state_all), // nb2: stride between seqs + qkv * conv_state_size * ggml_element_size(conv_state_all)); + +// Causal Conv1d function for Q,K,V +// When qkv is 0, it is Q, 1 is K, 2 is V + // Step 1: Q, K, V projections -> [d_inner, n_tokens] + ggml_tensor * x_proj = ggml_mul_mat(ctx0, proj_w, x); + + // Reshape input: {d_inner, n_tokens} -> {d_inner, n_seq_tokens, n_seqs} + ggml_tensor * x_3d = ggml_reshape_3d(ctx0, x_proj, d_inner, n_seq_tokens, n_seqs); + + // Concat Q conv state and current input: {d_conv-1 + n_seq_tokens, d_inner, n_seqs} + ggml_tensor * conv_x = ggml_concat(ctx0, conv_state_x, ggml_transpose(ctx0, x_3d), 0); + + // Save last (d_conv-1) columns back to Q conv state + ggml_tensor * last_conv_x = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, + conv_x->nb[1], conv_x->nb[2], n_seq_tokens * conv_x->nb[0]); + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv_x, + ggml_view_1d(ctx0, conv_states_all, conv_state_size * n_seqs, + (kv_head * n_embd_r_total + qkv * conv_state_size) * ggml_element_size(conv_states_all)))); + // Reshape conv weight: GGUF [d_conv, 1, d_inner, 1] -> ggml_ssm_conv expects [d_conv, d_inner] + // GGUF stores as [d_conv, 1, d_inner, 1] with memory layout w[conv_step + channel * d_conv] + // vLLM stores as [d_inner, d_conv] with memory layout w[channel * d_conv + conv_step] + // ggml_ssm_conv computes: c[conv_step + channel * d_conv] + // GGUF layout: [d_conv, 1, d_inner] or [d_conv, 1, d_inner, 1] -> reshape to [d_conv, d_inner] + // Reshape conv weight from [d_conv, 1, d_inner, 1] to [d_conv, d_inner] for ggml_ssm_conv + ggml_tensor * conv_weight = ggml_reshape_2d(ctx0, conv_w, d_conv, d_inner); + + // Apply conv1d + // ggml_ssm_conv output: {d_inner, n_seq_tokens, n_seqs} + ggml_tensor * Xcur = ggml_ssm_conv(ctx0, conv_x, conv_weight); + // Reshape to 2D for bias add: {d_inner, n_tokens} + Xcur = ggml_reshape_2d(ctx0, Xcur, d_inner, n_tokens); + Xcur = ggml_silu(ctx0, Xcur); + + return ggml_reshape_4d(ctx0, Xcur, head_dim, n_head, n_seq_tokens, n_seqs); +} + +llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params) : + llm_graph_context_mamba(params), model(model) { + 
ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + cb(inpL, "model.embed_tokens", -1); + + // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) + // So we don't need inp_pos + + auto * inp_kv = !hparams.is_mla() ? build_inp_mem_hybrid() : nullptr; + auto * inp_k = hparams.is_mla() ? build_inp_mem_hybrid_k() : nullptr; + auto * inp_rs = hparams.is_mla() ? inp_k->get_recr() : inp_kv->get_recr(); + auto * inp_attn_kv = !hparams.is_mla() ? inp_kv->get_attn() : nullptr; + auto * inp_attn_k = hparams.is_mla() ? inp_k->get_attn() : nullptr; + + // Output ids for selecting which tokens to output + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + ggml_tensor * chunked_causal_mask = + ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f), + GGML_TRI_TYPE_LOWER); + + ggml_tensor * chunked_identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f)); + ggml_tensor * chunked_diag_mask = ggml_add(ctx0, chunked_causal_mask, chunked_identity); + + ggml_build_forward_expand(gf, chunked_causal_mask); + ggml_build_forward_expand(gf, chunked_identity); + ggml_build_forward_expand(gf, chunked_diag_mask); + + // Kimi dimension constants + const int64_t n_head = hparams.n_head(); + const int64_t head_dim = hparams.n_embd_head_kda; + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = n_head * head_dim; // 32 * 128 = 4096 + const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + // Verify batch consistency for recurrent layers + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + // MLA params + const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); + const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); + const int64_t kv_lora_rank = hparams.n_lora_kv; + // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot + // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim] + const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim + const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128 + // Attention scale for MLA + const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + ggml_tensor * inpSA = inpL; + + // Attention Norm + cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // Check layer type by checking which tensors exist + // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor + bool is_kda = (layer.ssm_a != nullptr); + bool is_mla = (layer.wkv_a_mqa != nullptr); + + if (is_kda) { + // === KDA Layer (Kimi Delta Attention) with Recurrent State === + // Reference: vLLM kda.py + const auto * mctx_cur = inp_rs->mctx; + const auto kv_head = mctx_cur->get_head(); + + // Get conv states from r_l tensor (Q, K, V each have separate state) + ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); + cb(conv_states_all, "conv_states_all", il); + ggml_tensor * conv_state_all = build_rs(inp_rs, conv_states_all, hparams.n_embd_r(), n_seqs); + ggml_tensor * Qcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 0, cur, layer.wq, layer.ssm_q_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head); + ggml_tensor * Kcur = causal_conv1d(gf, ctx0, 
conv_states_all, conv_state_all, 1, cur, layer.wk, layer.ssm_k_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head);
+ ggml_tensor * Vcur = causal_conv1d(gf, ctx0, conv_states_all, conv_state_all, 2, cur, layer.wv, layer.ssm_v_conv, d_conv, head_dim, n_head, n_seq_tokens, n_seqs, n_tokens, kv_head);
+
+ // g1 = -exp(A_log) * softplus(f_b(f_a(x)) + dt_bias)
+ ggml_tensor * f_a = ggml_mul_mat(ctx0, layer.ssm_f_a, cur);
+ ggml_tensor * g1 = ggml_mul_mat(ctx0, layer.ssm_f_b, f_a);
+ cb(g1, "g1 f_b(f_a(cur))", il);
+ g1 = ggml_add(ctx0, g1, layer.ssm_dt_b);
+ g1 = ggml_softplus(ctx0, g1);
+ g1 = ggml_reshape_3d(ctx0, g1, head_dim, n_head, n_tokens);
+
+ // A_log shape is [1, n_head] or [1, n_head, 1, 1], need to broadcast to [head_dim, n_head, n_tokens]. No need to apply -exp(A_log) here because it was already applied in convert_hf_to_gguf.py
+ // Reshape to [1, n_head, 1] for broadcasting with g1 [head_dim, n_head, n_tokens]
+ ggml_tensor * A = ggml_reshape_3d(ctx0, layer.ssm_a, 1, n_head, 1);
+ g1 = ggml_mul(ctx0, g1, A);
+ cb(g1, "kda_g1", il);
+
+ // Compute beta (mixing coefficient)
+ ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur);
+ beta = ggml_reshape_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs);
+ cb(beta, "kda_beta", il);
+
+ // Reshape for KDA recurrence
+ // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs}
+ cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);
+
+ g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs);
+
+ // Get the SSM state and compute the KDA recurrence
+ ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
+ ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs);
+ state = ggml_reshape_4d(ctx0, state, head_dim, head_dim, n_head, n_seqs);
+ // Choose between build_kda_autoregressive and build_kda_chunking based on n_seq_tokens
+ std::pair<ggml_tensor *, ggml_tensor *> attn_out = n_seq_tokens == 1 ? 
+ build_kda_autoregressive(Qcur, Kcur, Vcur, g1, beta, state, il) : + build_kda_chunking(Qcur, Kcur, Vcur, g1, beta, state, chunked_causal_mask, chunked_identity, chunked_diag_mask, il); + + ggml_tensor * output = attn_out.first; + ggml_tensor * new_state = attn_out.second; + cb(output, "attn_output", il); + cb(new_state, "new_state", il); + + // Update the recurrent states + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, new_state, + ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs, + kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all)))); + + // Output gating g2 = g_b(g_a(x)) + ggml_tensor * cur_2d = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + ggml_tensor * g_a = ggml_mul_mat(ctx0, layer.ssm_g_a, cur_2d); + ggml_tensor * g2 = ggml_mul_mat(ctx0, layer.ssm_g_b, g_a); + cb(g2, "g2 g_b(g_a(cur_2d))", il); + g2 = ggml_reshape_3d(ctx0, g2, head_dim, n_head, n_seq_tokens * n_seqs); + + // Apply o_norm with sigmoid gating + // Note: Kimi model uses sigmoid gating, not SiLU (despite FusedRMSNormGated default being swish) + // Formula: output = RMSNorm(x) * sigmoid(g) + ggml_tensor * attn_out_final = ggml_reshape_3d(ctx0, output, head_dim, n_head, n_seq_tokens * n_seqs); + ggml_tensor * normed = build_norm(attn_out_final, layer.ssm_o_norm, nullptr, LLM_NORM_RMS, il); + cb(normed, "kda_normed", il); + ggml_tensor * gate = ggml_sigmoid(ctx0, g2); + ggml_tensor * gated = ggml_mul(ctx0, normed, gate); + + // Output projection + gated = ggml_cont_2d(ctx0, gated, d_inner, n_tokens); + cur = ggml_mul_mat(ctx0, layer.wo, gated); + cb(cur, "kda_out", il); + + } else if (is_mla) { + // === MLA Layer (Multi-head Latent Attention) without KV Cache === + // Reference: vLLM mla.py + // Step 1: Q projection and reshape + // vLLM Kimi: q = q_proj(hidden_states), then view as [n_tokens, n_head, qk_head_dim] + // Note: Kimi MLA does NOT use RoPE (rotary_emb=None in vLLM) + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.wq, cur); + + // Step 2: KV compression + // kv_cmpr_pe = kv_a_proj_with_mqa(hidden_states) -> [kv_lora_rank + qk_rope_head_dim, n_tokens] + ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, layer.wkv_a_mqa, cur); + + // Split: kv_cmpr = kv_lora[:kv_lora_rank], k_pe = kv_lora[kv_lora_rank:] + ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens, + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0); + ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); + // Note: Kimi MLA does NOT apply RoPE (rotary_emb=None in vLLM) + // k_pe is used directly without RoPE + // Normalize kv_c + kv_cmpr = build_norm(kv_cmpr, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); + + if (layer.wk_b && layer.wv_b) { // MLA KV cache enabled + // extract q_nope + ggml_tensor * q_nope = + ggml_view_3d(ctx0, Qcur, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla), + ggml_row_size(Qcur->type, n_embd_head_k_mla) * n_head, 0); + cb(q_nope, "q_nope", il); + + // and {n_embd_head_qk_rope, n_head, n_tokens} + ggml_tensor * q_pe = ggml_view_3d( + ctx0, Qcur, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(Qcur->type, n_embd_head_k_mla), + ggml_row_size(Qcur->type, n_embd_head_k_mla) * n_head, ggml_row_size(Qcur->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // 
{n_embd_head_qk_nope, n_tokens, n_head} + q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); + cb(q_nope, "q_nope_perm", il); + + // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head} + ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, layer.wk_b, q_nope); + cb(q_nope_absorbed, "q_nope_absorbed", il); + + // {kv_lora_rank, n_head, n_tokens} + q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3); + cb(q_nope_absorbed, "q_nope_absorbed_perm", il); + + // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} + // note: rope must go first for in-place context shifting in build_rope_shift() + Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0); + cb(Qcur, "Qcur", il); + + kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); + cb(kv_cmpr, "kv_cmpr_reshape", il); + + // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} + ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0); + cb(Kcur, "Kcur", il); + + // {kv_lora_rank, 1, n_tokens} + ggml_tensor * Vcur = kv_cmpr; + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn_k, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, layer.wv_b, kq_scale_mla, il); + cb(cur, "mla_out", il); + } else { // MLA KV cache disabled. Fall back to MHA KV cache. + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k_mla, n_head, n_tokens); + cb(Qcur, "mla_Q", il); + // KV decompression: kv = kv_b_proj(kv_c_normed) + ggml_tensor * kv = ggml_mul_mat(ctx0, layer.wkv_b, kv_cmpr); + const int64_t kv_per_head = n_embd_head_qk_nope + n_embd_head_v_mla; + + // Split kv into k_nope and v + ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, kv_per_head), + ggml_row_size(kv->type, kv_per_head * n_head), 0); + ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v_mla, n_head, n_tokens, + ggml_row_size(kv->type, kv_per_head), + ggml_row_size(kv->type, kv_per_head * n_head), + ggml_row_size(kv->type, n_embd_head_qk_nope)); + Vcur = ggml_cont(ctx0, Vcur); + cb(Vcur, "mla_V", il); + + // Concatenate k_nope + k_pe (broadcast k_pe to all heads) + // K = [k_nope, k_pe] where k_nope is [qk_nope_head_dim, n_head, n_tokens] + // and k_pe is [qk_rope_head_dim, 1, n_tokens] broadcast to all heads + // Need to broadcast k_pe from [qk_rope, 1, n_tokens] to [qk_rope, n_head, n_tokens] + ggml_tensor * k_pe_target = ggml_new_tensor_3d(ctx0, k_pe->type, n_embd_head_qk_rope, n_head, n_tokens); + ggml_tensor * k_pe_repeated = ggml_repeat(ctx0, k_pe, k_pe_target); + ggml_tensor * Kcur = ggml_concat(ctx0, k_pe_repeated, k_nope, 0); + cb(Kcur, "mla_K", il); + + // Direct softmax attention (with MHA KV cache) + // Use build_attn with inp_attn for proper mask handling + cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il); + cb(cur, "mla_out", il); + } + } else { + // Unknown layer type - this should not happen + GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors"); + } + + // On last layer, select only the output tokens + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Residual + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // FFN Norm + cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + if ((uint32_t) il < hparams.n_layer_dense_lead) { + // Dense FFN layer + cur = build_ffn(cur, + layer.ffn_up, NULL, NULL, + 
layer.ffn_gate, NULL, NULL, + layer.ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE layer + // Kimi uses moe_renormalize=True and routed_scaling_factor (stored as expert_weights_scale) = 2.446 + ggml_tensor * moe_out = build_moe_ffn(cur, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + layer.ffn_exp_probs_b, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_SILU, true, + true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il); + cb(moe_out, "ffn_moe_out", il); + + // Shared expert + { + ggml_tensor * ffn_shexp = build_ffn(cur, + layer.ffn_up_shexp, NULL, NULL, + layer.ffn_gate_shexp, NULL, NULL, + layer.ffn_down_shexp, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } + } + // Residual + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + inpL = cur; + } + cur = inpL; + + // Final Norm + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // Output + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} + +/* + This is a ggml implementation of the naive_chunk_kda function of + https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/kda/naive.py +*/ +std::pair llm_build_kimi_linear::build_kda_chunking( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * gk, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il) { + GGML_ASSERT(ggml_is_contiguous(state)); + + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(gk->ne[0] == S_v && gk->ne[1] == H_v && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs); + + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + // TODO: can this ever be false? 
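+ // Before the normalization below, a rough sketch of the per-token recurrence that
+ // this chunked path approximates (per head; this is a paraphrase of the reference
+ // formulation, not a verbatim quote of any kernel; S is the [S_k x S_v] state with
+ // S_k == S_v == head_dim here, g the per-channel log-decay and beta the sigmoid
+ // mixing coefficient computed below):
+ //
+ //   S_t = diag(exp(g_t)) * S_{t-1}                      (decay the state)
+ //   S_t = S_t + (beta_t * k_t) * (v_t - S_t^T k_t)^T    (delta-rule update)
+ //   o_t = S_t^T * (q_t / sqrt(S_v))                     (readout)
+ //
+ // The intra-chunk part is folded into the Akk/Aqk matrices and the triangular solve
+ // further down, while the inter-chunk part is carried through new_state;
+ // build_kda_autoregressive applies the same update one token at a time.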
+ const bool use_qk_l2norm = true; + + if (use_qk_l2norm) { + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + } + + const float scale = 1.0f / sqrtf(S_v); + + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(gk, "gk_in", il); + + q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); + k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_k, n_tokens, H_k, n_seqs); + v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs); + + beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3)); + state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs); + + cb(q, "q_perm", il); + cb(k, "k_perm", il); + cb(v, "v_perm", il); + cb(beta, "beta_perm", il); + cb(gk, "gk_perm", il); + cb(state, "state_in", il); + + GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs); + GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs); + + // Do padding + const int64_t chunk_size = CHUNK_SIZE; + + const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size; + const int64_t n_chunks = (n_tokens + pad) / chunk_size; + + q = ggml_pad(ctx0, q, 0, pad, 0, 0); + k = ggml_pad(ctx0, k, 0, pad, 0, 0); + v = ggml_pad(ctx0, v, 0, pad, 0, 0); + gk = ggml_pad(ctx0, gk, 0, pad, 0, 0); + beta = ggml_pad(ctx0, beta, 0, pad, 0, 0); + + cb(q, "q_pad", il); + cb(k, "k_pad", il); + cb(v, "v_pad", il); + cb(beta, "beta_pad", il); + cb(gk, "gk_pad", il); + + ggml_tensor * v_beta = ggml_mul(ctx0, v, beta); + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta); + + cb(v_beta, "v_beta", il); + cb(k_beta, "k_beta", il); + + const int64_t HB = H_k * n_seqs; + + q = ggml_cont_4d(ctx0, q, S_k, chunk_size, n_chunks, HB); + k = ggml_cont_4d(ctx0, k, S_k, chunk_size, n_chunks, HB); + k_beta = ggml_cont_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, HB); + v = ggml_cont_4d(ctx0, v, S_v, chunk_size, n_chunks, HB); + v_beta = ggml_cont_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, HB); + + gk = ggml_cont_4d(ctx0, gk, S_k, chunk_size, n_chunks, HB); + beta = ggml_cont_4d(ctx0, beta, 1, chunk_size, n_chunks, HB); + + // switch for cumsum + gk = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk, 1, 0, 2, 3), chunk_size, S_k, n_chunks, HB); + cb(gk, "gk", il); + ggml_tensor * gk_cumsum = ggml_cumsum(ctx0, gk); + cb(gk_cumsum, "gk_cumsum", il); + +/* + Compute Akk and Aqk loop together + Akk loop: + for i in range(BT): + k_i = k[..., i, :] # k_i [B,H,NT,S] + g_i = g[..., i:i+1, :] # g_i [B,H,NT,1,S] + A[..., i] = torch.einsum('... c d, ... d -> ... c', k * (g - g_i).exp(), k_i) + Aqk loop: + for j in range(BT): + k_j = k[:, :, i, j] + g_j = g[:, :, i, j:j+1, :] + A[..., j] = torch.einsum('... c d, ... d -> ... 
c', q_i * (g_i - g_j).exp(), k_j) +*/ + const int64_t CHB = n_chunks * H_k * n_seqs; + ggml_tensor * gkcs_i = ggml_reshape_4d(ctx0, gk_cumsum, chunk_size, 1, S_k, CHB); // [chunk_size, 1, S_k, CHB] + ggml_tensor * gkcs_j = ggml_reshape_4d(ctx0, gkcs_i, 1, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] + + ggml_tensor * gkcs_j_bc = ggml_repeat_4d(ctx0, gkcs_j, chunk_size, chunk_size, S_k, CHB); // [1, chunk_size, S_k, CHB] -> [chunk_size, chunk_size, S_k, CHB] + // decay_mask [chunk_size,chunk_size,S_k,CHB] + ggml_tensor * decay_mask = ggml_sub(ctx0, gkcs_j_bc, gkcs_i); + cb(decay_mask, "decay_mask", il); + + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + cb(decay_mask, "decay_masked", il); + decay_mask = ggml_exp(ctx0, decay_mask); + decay_mask = ggml_mul(ctx0, decay_mask, diag_mask); + + // decay_mask [S_k,BT_j,BT_i,CHB] *Note* second and third chunk_sizes are switched + decay_mask = ggml_cont_4d(ctx0, ggml_permute(ctx0, decay_mask, 2, 1, 0, 3), S_k, chunk_size, chunk_size, CHB); + + ggml_tensor * k_i = ggml_reshape_4d(ctx0, k, S_k, chunk_size, 1, CHB); + ggml_tensor * k_j = ggml_reshape_4d(ctx0, k, S_k, 1, chunk_size, CHB); + ggml_tensor * q_i = ggml_reshape_4d(ctx0, q, S_k, chunk_size, 1, CHB); + + ggml_tensor * decay_k_i = ggml_mul(ctx0, decay_mask, k_i); + ggml_tensor * decay_q_i = ggml_mul(ctx0, decay_mask, q_i); + + // decay_k_i [S.BT,BT,CHB] @ k_j [S,1,BT,CHB] = Akk [BT,1,BT,CHB] + ggml_tensor * Akk = ggml_mul_mat(ctx0, decay_k_i, k_j); + ggml_tensor * Aqk = ggml_mul_mat(ctx0, decay_q_i, k_j); + Akk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Akk, chunk_size, chunk_size, n_chunks, HB))); + Aqk = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_4d(ctx0, Aqk, chunk_size, chunk_size, n_chunks, HB))); + cb(Akk, "Akk", il); + cb(Aqk, "Aqk", il); + + Akk = ggml_mul(ctx0, Akk, beta); + Akk = ggml_neg(ctx0, ggml_mul(ctx0, Akk, causal_mask)); + cb(Akk, "attn_pre_solve", il); + + Aqk = ggml_mul(ctx0, Aqk, diag_mask); + Aqk = ggml_scale(ctx0, Aqk, scale); // scale q + cb(Aqk, "Aqk_masked", il); + + // for i in range(1, chunk_size): + // row = attn[..., i, :i].clone() + // sub = attn[..., :i, :i].clone() + // attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2) + // attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device) + // + // We reduce this to a linear triangular solve: AX = B, where B = attn, A = I - tril(A) + ggml_tensor * attn_lower = ggml_mul(ctx0, Akk, causal_mask); + ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower); + + ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, Akk, true, true, false); + Akk = ggml_mul(ctx0, lin_solve, causal_mask); + Akk = ggml_add(ctx0, Akk, identity); + + cb(Akk, "attn_solved", il); + + // switch back for downstream + gk_cumsum = ggml_cont_4d(ctx0, ggml_permute(ctx0, gk_cumsum, 1, 0, 2, 3), S_k, chunk_size, n_chunks, HB); + ggml_tensor * gkexp = ggml_exp(ctx0, gk_cumsum); + cb(gk_cumsum, "gk_cumsum", il); + + // u = (A*beta[..., None, :]) @ v aka U_[t] + ggml_tensor * vb = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), Akk); + + ggml_tensor * kbeta_gkexp = ggml_mul(ctx0, k_beta, gkexp); + cb(kbeta_gkexp, "kbeta_gkexp", il); + + ggml_tensor * k_cumdecay = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gkexp)), Akk); + cb(k_cumdecay, "k_cumdecay", il); + + ggml_tensor * core_attn_out = nullptr; + ggml_tensor * new_state = ggml_dup(ctx0, state); + + cb(new_state, "new_state", il); + + for (int64_t chunk = 0; chunk < n_chunks; 
chunk++) { +// extract one chunk worth of data + auto chunkify = [=](ggml_tensor * t) { + return ggml_cont(ctx0, ggml_view_4d(ctx0, t, t->ne[0], chunk_size, 1, t->ne[3], + t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); + }; + auto chunkify_A = [=](ggml_tensor * t) { + return ggml_cont(ctx0, ggml_view_4d(ctx0, t, chunk_size, chunk_size, 1, t->ne[3], + t->nb[1], t->nb[2], t->nb[3], t->nb[2] * chunk)); + }; + + +// k [S,BT,NT,H*B] => k_chunk [S,BT,1,H*B] + ggml_tensor * k_chunk = chunkify(k); + ggml_tensor * q_chunk = chunkify(q); + ggml_tensor * vb_chunk = chunkify(vb); + +// gk_cumsum [S,BT,NT,H*B] => gk_cs_chunk [S,BT,1,H*B] + ggml_tensor * gk_cs_chunk = chunkify(gk_cumsum); + ggml_tensor * k_cumdecay_chunk = chunkify(k_cumdecay); + ggml_tensor * gkexp_chunk = ggml_exp(ctx0, gk_cs_chunk); + ggml_tensor * Aqk_chunk = chunkify_A(Aqk); + + ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs); + + // new_state [S,S,1,H*B] k_cumdecay_chunk [S,BT,1,H*B] + // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state or W_[t] @ S_[t] + ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk); + + // v_new = v_i - v_prime or U_[t] - W_[t]*S_[t] + ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, vb_chunk, v_prime), v_prime); + ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new)); + + // q_chunk [S,BT,1,H*B] gkexp_chunk [S,BT,1,H*B] + // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state + // or Gamma_[t]*Q_]t] @ S + ggml_tensor * q_gk_exp = ggml_mul(ctx0, q_chunk, gkexp_chunk); + ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_gk_exp); + attn_inter = ggml_scale(ctx0, attn_inter, scale); // scale q + + // v_new_t [S,BT,1,H*B] Aqk [BT,BT,1,H*B] + // core_attn_out[:, :, i] = attn_inter + attn @ v_new or A' @ (U_[t] - W_[t]*S_[t]) + ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, Aqk_chunk); + + // o[:, :, i] = (q_i * g_i.exp()) @ S + A @ v_i + ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn); + + core_attn_out = core_attn_out == nullptr ? 
core_attn_out_chunk : ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 1); + + ggml_tensor * gk_cum_last = + ggml_cont(ctx0, ggml_view_4d(ctx0, gk_cs_chunk, gk_cs_chunk->ne[0], 1, gk_cs_chunk->ne[2], gk_cs_chunk->ne[3], + gk_cs_chunk->nb[1], gk_cs_chunk->nb[2], gk_cs_chunk->nb[3], + gk_cs_chunk->nb[1] * (gk_cs_chunk->ne[1] - 1))); + + ggml_tensor * gkexp_last = ggml_exp(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, gk_cum_last))); + + ggml_tensor * gk_diff = ggml_neg(ctx0, ggml_sub(ctx0, gk_cs_chunk, gk_cum_last)); + + ggml_tensor * gk_diff_exp = ggml_exp(ctx0, gk_diff); + + ggml_tensor * key_gkdiff = ggml_mul(ctx0, k_chunk, gk_diff_exp); + + // rearrange((g_i[:,:,-1:] - g_i).exp()*k_i, 'b h c k -> b h k c') @ (U_[t] - W_[t] @ S) + ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, ggml_cont(ctx0, ggml_transpose(ctx0, key_gkdiff))); + + new_state = ggml_add(ctx0, + ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gkexp_last, gkexp_last->ne[0], gkexp_last->ne[1], H_v, n_seqs)), + ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs)); + } + + core_attn_out = ggml_cont_4d(ctx0, core_attn_out, S_v, chunk_size * n_chunks, H_v, n_seqs); + + // truncate padded tokens + ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out, + S_v, n_tokens, H_v, n_seqs, + ggml_row_size(core_attn_out->type, S_v), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks), + ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0); + output_tokens = ggml_cont(ctx0, output_tokens); + // permute back to (S_v, H_v, n_tokens, n_seqs) + output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3); + output_tokens = ggml_cont(ctx0, output_tokens); + + cb(new_state, "output_state", il); + + return {output_tokens, new_state}; +} + +std::pair llm_build_kimi_linear::build_kda_autoregressive( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * gk, + ggml_tensor * beta, + ggml_tensor * state, + int il) { + GGML_ASSERT(ggml_is_contiguous(v)); + GGML_ASSERT(ggml_is_contiguous(gk)); + + const int64_t S_k = q->ne[0]; + const int64_t H_k = q->ne[1]; + const int64_t n_tokens = q->ne[2]; + const int64_t n_seqs = q->ne[3]; + + const int64_t S_v = v->ne[0]; + const int64_t H_v = v->ne[1]; + + GGML_ASSERT(n_tokens == 1); + GGML_ASSERT(v->ne[2] == n_tokens); + GGML_ASSERT(k->ne[2] == n_tokens); + GGML_ASSERT(gk->ne[0] == S_k && gk->ne[1] == H_k && gk->ne[2] == n_tokens && gk->ne[3] == n_seqs); + GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs); + GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_k && state->ne[2] == H_v && state->ne[3] == n_seqs); + + GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs); + GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs); + + GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case + + const float eps_norm = hparams.f_norm_rms_eps; + + q = ggml_l2_norm(ctx0, q, eps_norm); + k = ggml_l2_norm(ctx0, k, eps_norm); + + const float scale = 1.0f / sqrtf(S_v); + + q = ggml_scale(ctx0, q, scale); + beta = ggml_sigmoid(ctx0, beta); + + cb(q, "q_in", il); + cb(k, "k_in", il); + cb(v, "v_in", il); + cb(beta, "beta_in", il); + cb(gk, "gk_in", il); + +// g [H,1,B,1] g_t [1,H,B,1] => [1,1,H,B] +// gk [S,H,1,B] => [S,1,H,B] gk_t [1,S,H,B] +// beta [H,1,1,B] beta_t [1,H,1,B] => [1,1,H,B] + gk = ggml_reshape_4d(ctx0, gk, S_k, 1, H_k, n_seqs); + ggml_tensor * gk_t = ggml_cont(ctx0, 
ggml_transpose(ctx0, gk)); + ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs); + + // Apply exponential to gk_t + gk_t = ggml_exp(ctx0, gk_t); + // Apply the gated delta rule for the single timestep + // last_recurrent_state = last_recurrent_state * gk_t + // S = S * g_i[..., None].exp() + state = ggml_mul(ctx0, state, gk_t); + + ggml_tensor * state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); + +// state [S,S,H,B] k [S,1,H,B] k_state [S_v,1,H,B] + k = ggml_reshape_4d(ctx0, k, S_k, 1, H_k, n_seqs); + ggml_tensor * k_state = ggml_mul_mat(ctx0, state_t, k); + + // v_i - (k_i[..., None] * S).sum(-2) + v = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs); + ggml_tensor * v_diff = ggml_sub(ctx0, v, k_state); + + // b_i[..., None] * k_i + ggml_tensor * k_beta = ggml_mul(ctx0, k, beta_t); + + // S = S + torch.einsum('b h k, b h v -> b h k v', b_i[..., None] * k_i, v_i - (k_i[..., None] * S).sum(-2)) + // v_diff_t [1,S_v,H,B] k_beta_t [1,S_k,H,B] state [S_v,S_k,H,B] + state = ggml_add(ctx0, state, ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_diff)), ggml_cont(ctx0, ggml_transpose(ctx0, k_beta)))); + + q = ggml_reshape_4d(ctx0, q, S_k, 1, H_k, n_seqs); + state_t = ggml_cont(ctx0, ggml_transpose(ctx0, state)); + ggml_tensor * core_attn_out = ggml_mul_mat(ctx0, state_t, q); + // core_attn_out should be [S_v, 1, H_v, n_seqs] after this + cb(core_attn_out, "output_tokens", il); + cb(state, "new_state", il); + + return {core_attn_out, state}; +} + diff --git a/src/models/models.h b/src/models/models.h index 3a44f7f140f..71c1fe81084 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -288,6 +288,33 @@ struct llm_build_jamba : public llm_graph_context_mamba { llm_build_jamba(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_kimi_linear : public llm_graph_context_mamba { + llm_build_kimi_linear(const llama_model & model, const llm_graph_params & params); + + std::pair build_kda_autoregressive( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * gk, + ggml_tensor * beta, + ggml_tensor * state, + int il); + + std::pair build_kda_chunking( + ggml_tensor * q, + ggml_tensor * k, + ggml_tensor * v, + ggml_tensor * gk, + ggml_tensor * beta, + ggml_tensor * state, + ggml_tensor * causal_mask, + ggml_tensor * identity, + ggml_tensor * diag_mask, + int il); + + const llama_model & model; +}; + struct llm_build_lfm2 : public llm_graph_context { const llama_model & model; From 02092c6891d1a1eacce865da8dd671694a88bda5 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Tue, 3 Feb 2026 11:02:27 +0800 Subject: [PATCH 05/21] Support Step3.5-Flash --- convert_hf_to_gguf.py | 176 +++++++++++++++++++++++++++++++- gguf-py/gguf/constants.py | 71 +++++++++---- gguf-py/gguf/gguf_writer.py | 13 +++ gguf-py/gguf/tensor_mapping.py | 9 ++ src/CMakeLists.txt | 1 + src/llama-arch.cpp | 65 +++++++++--- src/llama-arch.h | 4 + src/llama-graph.cpp | 49 ++++++++- src/llama-graph.h | 3 +- src/llama-hparams.cpp | 5 + src/llama-hparams.h | 16 +++ src/llama-model-loader.cpp | 1 + src/llama-model.cpp | 115 +++++++++++++++++++++ src/models/models.h | 4 + src/models/step35-iswa.cpp | 178 +++++++++++++++++++++++++++++++++ 15 files changed, 669 insertions(+), 41 deletions(-) create mode 100644 src/models/step35-iswa.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c167de8a465..42ec9c05044 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -831,7 +831,7 @@ def prepare_metadata(self, vocab_only: bool): 
def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None: + if (n_ctx := self.find_hparam(["max_position_embeddings", "max_position_embedding", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") @@ -7912,6 +7912,180 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("Step3p5ForCausalLM") +class Step35Model(TextModel): + """ + Step3.5 interleaved sliding-window attention + MoE with sigmoid routing and expert selection bias. + """ + + model_arch = gguf.MODEL_ARCH.STEP35 + + def set_gguf_parameters(self): + rope_theta_per_layer = None + rope_theta = self.hparams.get("rope_theta", None) + if isinstance(rope_theta, list): + rope_theta_per_layer = rope_theta + if len(rope_theta) == 0: + raise ValueError("rope_theta list must not be empty") + rope_theta0 = float(rope_theta[0]) + self.hparams["rope_theta"] = rope_theta0 + if isinstance(getattr(self, "rope_parameters", None), dict) and isinstance(self.rope_parameters.get("rope_theta", None), list): + self.rope_parameters["rope_theta"] = rope_theta0 + + super().set_gguf_parameters() + + def _truncate_to_block_count(name: str, values: list, *, allow_none: bool = False) -> list: + if not isinstance(values, list): + raise ValueError(f"{name} must be a list, got {type(values)}") + if len(values) < self.block_count: + raise ValueError(f"{name} must have length >= {self.block_count}, got {len(values)}") + if len(values) != self.block_count: + logger.warning( + "%s length mismatch: expected %d, got %d; truncating to %d", + name, self.block_count, len(values), self.block_count, + ) + values = values[: self.block_count] + if not allow_none and any(v is None for v in values): + raise ValueError(f"{name} must not contain None") + return values + + layer_types = self.hparams.get("layer_types", []) + attn_other = self.hparams.get("attention_other_setting", {}) or {} + + n_head_base = self.hparams["num_attention_heads"] + n_kv_base = self.hparams["num_attention_groups"] + + n_head_swa = attn_other.get("num_attention_heads", n_head_base) + n_kv_swa = attn_other.get("num_attention_groups", n_kv_base) + + if layer_types: + layer_types = _truncate_to_block_count("layer_types", layer_types, allow_none=False) + head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] + kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] + swa_pat = [1 if lt == "sliding_attention" else 0 for lt in layer_types] + else: + raise ValueError(f"layer_types is not set: {layer_types}") + + self.gguf_writer.add_head_count(head_arr) + self.gguf_writer.add_head_count_kv(kv_arr) + + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_sliding_window_pattern(swa_pat) + + self.gguf_writer.add_value_length(self.hparams["head_dim"]) + + # Whether rope_scaling/rope_parameters are applied + # based on attention type, encoded as a small bitmask: + # bit0 -> apply on full_attention (dense layers) + # bit1 -> apply on sliding_attention (SWA layers) + yarn_only_types = self.hparams.get("yarn_only_types", None) + self.gguf_writer.add_rope_scaling_apply_mask(yarn_only_types) + + # MoE params + 
self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["share_expert_dim"]) + + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + self.gguf_writer.add_expert_weights_scale(self.hparams.get("moe_router_scaling_factor", 1.0)) + self.gguf_writer.add_expert_weights_norm(bool(self.hparams.get("norm_expert_weight", False))) + + # leading dense blocks + leading_dense = 0 + moe_layers_enum = self.hparams.get("moe_layers_enum") + if isinstance(moe_layers_enum, str) and moe_layers_enum.strip(): + moe_layers = sorted(int(i) for i in moe_layers_enum.strip().split(",")) + if moe_layers: + leading_dense = max(0, moe_layers[0]) + self.gguf_writer.add_leading_dense_block_count(leading_dense) + self.gguf_writer.add_moe_every_n_layers(int(self.hparams.get("moe_every_n_layer", 1))) + + # RoPE: Step35 uses per-layer partial rotary factors; llama.cpp currently only supports a single rope dim. + # Check that partial_rotary_factors exists, is the right length, and all factors > 0 + partial_rotary_factors = self.hparams.get("partial_rotary_factors", None) + if partial_rotary_factors is None: + raise ValueError("partial_rotary_factors must be present in hparams") + partial_rotary_factors = _truncate_to_block_count("partial_rotary_factors", partial_rotary_factors, allow_none=False) + rope_dim_per_layer = [int(self.hparams["head_dim"] * factor) for factor in partial_rotary_factors] + self.gguf_writer.add_rope_dimension_count_per_layer(rope_dim_per_layer) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) + + # Step35: per-layer rope_theta support + if rope_theta_per_layer is not None: + rope_theta_per_layer = _truncate_to_block_count("rope_theta", rope_theta_per_layer, allow_none=False) + freq_base_per_layer = [float(v) for v in rope_theta_per_layer] + self.gguf_writer.add_array(f"{self.gguf_writer.arch}.rope.freq_base_per_layer", freq_base_per_layer) + + # Optional per-layer SwiGLU clamps (HF: swiglu_limits / swiglu_limits_shared). + for key in ("swiglu_limits", "swiglu_limits_shared"): + limits = self.hparams.get(key, None) + if limits is None: + continue + limits = _truncate_to_block_count(key, limits, allow_none=True) + limits_f = [0.0 if v is None else float(v) for v in limits] + self.gguf_writer.add_array(f"{self.gguf_writer.arch}.{key}", limits_f) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + # remove mtp layers + if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None: + il = int(m.group(1)) + n_main = int(self.hparams.get("num_hidden_layers", self.block_count)) + if il >= n_main: + return [] + # Map router bias (expert selection bias) to a GGUF bias tensor + if name.endswith(".moe.router_bias"): + return [(self.map_tensor_name(name + ".bias"), data_torch)] + + if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")): + w = data_torch.squeeze() + return [(self.map_tensor_name(name), w.contiguous())] + + return super().modify_tensors(data_torch, name, bid) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3"). 
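+ # As a rough guide (standard llama3-rope convention, not anything specific to Step3.5):
+ # each factor divides the corresponding RoPE frequency, so a factor of 1.0 leaves the
+ # short-wavelength dimensions untouched while a factor of `factor` stretches the
+ # long-wavelength dimensions, with a smooth blend in between; the loop below computes
+ # these per-dimension factors.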
+ # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS). + rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) + rope_type = (rope_params.get("rope_type") or rope_params.get("type") or "") + if rope_type.lower() != "llama3": + return () + + # Step35 configs can carry per-layer rope_theta as a list; for llama3 rope factors we use the base value. + rope_theta = self.hparams.get("rope_theta", 10000.0) + if isinstance(rope_theta, list): + if len(rope_theta) == 0: + raise ValueError("rope_theta list must not be empty") + rope_theta = rope_theta[0] + base = float(rope_theta) + dim = self.hparams.get("head_dim") + if dim is None: + dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + dim = int(dim) + + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = float(rope_params.get("factor", 8.0)) + low_freq_factor = float(rope_params.get("low_freq_factor", 1.0)) + high_freq_factor = float(rope_params.get("high_freq_factor", 4.0)) + old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192))) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + rope_factors: list[float] = [] + for freq in freqs: + wavelen = 2 * math.pi / float(freq) + if wavelen < high_freq_wavelen: + rope_factors.append(1.0) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + @ModelBase.register("PanguEmbeddedForCausalLM") class PanguEmbeddedModel(TextModel): model_arch = gguf.MODEL_ARCH.PANGU_EMBED diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3ddbc73d1cc..bd04fec8654 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -179,20 +179,22 @@ class Attention: TEMPERATURE_SCALE = "{arch}.attention.temperature_scale" class Rope: - DIMENSION_COUNT = "{arch}.rope.dimension_count" - DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" - FREQ_BASE = "{arch}.rope.freq_base" - FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" - SCALING_TYPE = "{arch}.rope.scaling.type" - SCALING_FACTOR = "{arch}.rope.scaling.factor" - SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" - SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" - SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" - SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" - SCALING_YARN_EXT_FACTOR = "{arch}.rope.scaling.yarn_ext_factor" - SCALING_YARN_ATTN_FACTOR = "{arch}.rope.scaling.yarn_attn_factor" - SCALING_YARN_BETA_FAST = "{arch}.rope.scaling.yarn_beta_fast" - SCALING_YARN_BETA_SLOW = "{arch}.rope.scaling.yarn_beta_slow" + DIMENSION_COUNT = "{arch}.rope.dimension_count" + DIMENSION_COUNT_PER_LAYER = "{arch}.rope.dimension_count_per_layer" + DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" + FREQ_BASE = "{arch}.rope.freq_base" + FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" + SCALING_TYPE = "{arch}.rope.scaling.type" + SCALING_FACTOR = "{arch}.rope.scaling.factor" + SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" + SCALING_APPLY_MASK = "{arch}.rope.scaling.apply_mask" + SCALING_ORIG_CTX_LEN = 
"{arch}.rope.scaling.original_context_length" + SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" + SCALING_YARN_EXT_FACTOR = "{arch}.rope.scaling.yarn_ext_factor" + SCALING_YARN_ATTN_FACTOR = "{arch}.rope.scaling.yarn_attn_factor" + SCALING_YARN_BETA_FAST = "{arch}.rope.scaling.yarn_beta_fast" + SCALING_YARN_BETA_SLOW = "{arch}.rope.scaling.yarn_beta_slow" class Split: LLM_KV_SPLIT_NO = "split.no" @@ -462,6 +464,7 @@ class MODEL_ARCH(IntEnum): PANGU_EMBED = auto() MISTRAL3 = auto() MIMO2 = auto() + STEP35 = auto() LLAMA_EMBED = auto() MAINCODER = auto() KIMI_LINEAR = auto() @@ -892,6 +895,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", MODEL_ARCH.MIMO2: "mimo2", + MODEL_ARCH.STEP35: "step35", MODEL_ARCH.LLAMA_EMBED: "llama-embed", MODEL_ARCH.MAINCODER: "maincoder", MODEL_ARCH.KIMI_LINEAR: "kimi-linear", @@ -3364,6 +3368,32 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_EXP_PROBS_B, ], + MODEL_ARCH.STEP35: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_GATE, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + ], MODEL_ARCH.LLAMA_EMBED: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -3753,12 +3783,13 @@ class VisionProjectorType: KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS # RoPE -KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT -KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE -KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE -KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR -KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN -KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED +KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT +KEY_ROPE_DIMENSION_COUNT_PER_LAYER = Keys.Rope.DIMENSION_COUNT_PER_LAYER +KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE +KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE +KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR +KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN +KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED # SSM KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index f720aa2d54a..df269ee09a8 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -925,6 +925,9 @@ def add_num_deepstack_layers(self, count: int) -> None: def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) + + def add_rope_dimension_count_per_layer(self, values: Sequence[int]) -> None: + self.add_array(Keys.Rope.DIMENSION_COUNT_PER_LAYER.format(arch=self.arch), values) def add_rope_dimension_sections(self, dims: Sequence[int]) -> None: self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims) @@ -962,6 +965,16 @@ def add_rope_scaling_yarn_beta_fast(self, value: float) -> None: def add_rope_scaling_yarn_beta_slow(self, value: float) -> None: 
self.add_float32(Keys.Rope.SCALING_YARN_BETA_SLOW.format(arch=self.arch), value) + def add_rope_scaling_apply_mask(self, yarn_only_types: Sequence[str] | None) -> None: + apply_mask = 0x3 # default: apply on all layers (backwards compatible) + if isinstance(yarn_only_types, list): + apply_mask = 0 + if "full_attention" in yarn_only_types: + apply_mask |= 0x1 + if "sliding_attention" in yarn_only_types: + apply_mask |= 0x2 + self.add_uint32(Keys.Rope.SCALING_APPLY_MASK.format(arch=self.arch), int(apply_mask)) + def add_ssm_conv_kernel(self, value: int) -> None: self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index e16c06c2a3c..167ade78033 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -359,6 +359,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_GATE: ( "model.layers.{bid}.self_attn.gate_proj", # afmoe + "model.layers.{bid}.self_attn.g_proj", # step3.5 head-wise attention gate ), # Feed-forward norm @@ -423,6 +424,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.router.gate", # afmoe "layers.{bid}.gate", # mistral-large "backbone.layers.{bid}.mixer.gate", # nemotron-h-moe + "model.layers.{bid}.moe.gate", # step3.5 ), MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( @@ -439,6 +441,7 @@ class TensorNameMap: "backbone.layers.{bid}.mixer.gate.e_score_correction", # nemotron-h-moe "model.layers.{bid}.mlp.e_score_correction", # exaone-moe "model.layers.{bid}.block_sparse_moe.gate.e_score_correction", # kimi + "model.layers.{bid}.moe.router_bias", # step3.5 expert selection bias ), # Feed-forward up @@ -493,6 +496,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.experts.up_proj", # llama4 "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe "model.layers.{bid}.block_sparse_moe.experts.up", # smallthinker + "model.layers.{bid}.moe.up_proj", # step3.5 ), MODEL_TENSOR.FFN_UP_SHEXP: ( @@ -504,6 +508,7 @@ class TensorNameMap: "layers.{bid}.shared_experts.w3", # mistral-large "backbone.layers.{bid}.mixer.shared_experts.up_proj", # nemotron-h-moe "model.layers.{bid}.block_sparse_moe.shared_experts.up_proj", # kimi + "model.layers.{bid}.share_expert.up_proj", # step3.5 ), MODEL_TENSOR.FFN_UP_CHEXP: ( @@ -543,6 +548,7 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged) "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4 "model.layers.{bid}.block_sparse_moe.experts.gate", # smallthinker + "model.layers.{bid}.moe.gate_proj", # step3.5 ), MODEL_TENSOR.FFN_GATE_SHEXP: ( @@ -552,6 +558,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan "layers.{bid}.shared_experts.w1", # mistral-large "model.layers.{bid}.block_sparse_moe.shared_experts.gate_proj", # kimi + "model.layers.{bid}.share_expert.gate_proj", # step3.5 ), MODEL_TENSOR.FFN_GATE_CHEXP: ( @@ -606,6 +613,7 @@ class TensorNameMap: "model.layers.{bid}.feed_forward.experts.down_proj", # llama4 "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe "model.layers.{bid}.block_sparse_moe.experts.down", # smallthinker + "model.layers.{bid}.moe.down_proj", # step3.5 ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( @@ -617,6 +625,7 @@ class TensorNameMap: "layers.{bid}.shared_experts.w2", # mistral-large "backbone.layers.{bid}.mixer.shared_experts.down_proj", # nemotron-h-moe "model.layers.{bid}.block_sparse_moe.shared_experts.down_proj", # kimi + "model.layers.{bid}.share_expert.down_proj", # step3.5 ), MODEL_TENSOR.FFN_DOWN_CHEXP: ( diff --git a/src/CMakeLists.txt 
b/src/CMakeLists.txt index 5238a5e934d..7327784e628 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -93,6 +93,7 @@ add_library(llama models/maincoder.cpp models/mamba.cpp models/mimo2-iswa.cpp + models/step35-iswa.cpp models/minicpm3.cpp models/minimax-m2.cpp models/modern-bert.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index a8bf1c9b80c..0e727b199d5 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -117,7 +117,8 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, - { LLM_ARCH_MIMO2, "mimo2" }, + { LLM_ARCH_MIMO2, "mimo2" }, + { LLM_ARCH_STEP35, "step35" }, { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, { LLM_ARCH_MAINCODER, "maincoder" }, { LLM_ARCH_KIMI_LINEAR, "kimi-linear" }, @@ -162,6 +163,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" }, { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" }, { LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" }, + { LLM_KV_SWIGLU_LIMITS, "%s.swiglu_limits" }, + { LLM_KV_SWIGLU_LIMITS_SHARED, "%s.swiglu_limits_shared" }, { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, @@ -220,21 +223,22 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, - { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, - { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, - { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, - { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" }, - { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, - { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, - { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, - { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, - { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, - { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, - { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, - { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" }, - { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" }, - { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" }, - { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" }, + { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, "%s.rope.dimension_count_per_layer" }, + { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, + { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, + { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" }, + { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" }, + { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" }, + { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" }, + { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" }, + { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, + { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" }, + { LLM_KV_ROPE_SCALING_YARN_EXT_FACTOR, "%s.rope.scaling.yarn_ext_factor" }, + { LLM_KV_ROPE_SCALING_YARN_ATTN_FACTOR, "%s.rope.scaling.yarn_attn_factor" 
}, + { LLM_KV_ROPE_SCALING_YARN_BETA_FAST, "%s.rope.scaling.yarn_beta_fast" }, + { LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, "%s.rope.scaling.yarn_beta_slow" }, { LLM_KV_SPLIT_NO, "split.no" }, { LLM_KV_SPLIT_COUNT, "split.count" }, @@ -2279,6 +2283,35 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_UP_EXPS, LLM_TENSOR_FFN_EXP_PROBS_B, }; + case LLM_ARCH_STEP35: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_OUTPUT, + LLM_TENSOR_ROPE_FREQS, + LLM_TENSOR_ROPE_FACTORS_LONG, + LLM_TENSOR_ROPE_FACTORS_SHORT, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_Q_NORM, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_K_NORM, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_GATE, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_DOWN, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_DOWN_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_EXP_PROBS_B, + }; case LLM_ARCH_GPTJ: case LLM_ARCH_UNKNOWN: return { diff --git a/src/llama-arch.h b/src/llama-arch.h index f092f728344..4760b283020 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -122,6 +122,7 @@ enum llm_arch { LLM_ARCH_PANGU_EMBED, LLM_ARCH_MISTRAL3, LLM_ARCH_MIMO2, + LLM_ARCH_STEP35, LLM_ARCH_LLAMA_EMBED, LLM_ARCH_MAINCODER, LLM_ARCH_KIMI_LINEAR, @@ -166,6 +167,8 @@ enum llm_kv { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, + LLM_KV_SWIGLU_LIMITS, + LLM_KV_SWIGLU_LIMITS_SHARED, LLM_KV_USE_PARALLEL_RESIDUAL, LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_EXPERT_COUNT, @@ -225,6 +228,7 @@ enum llm_kv { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ROPE_DIMENSION_COUNT, + LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_FREQ_BASE_SWA, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 165cbc0a7d6..ca7c4fb6844 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -9,10 +9,14 @@ #include "llama-memory-hybrid.h" #include "llama-memory-hybrid-iswa.h" #include "llama-memory-recurrent.h" +#include "llama-mmap.h" +#include #include #include #include +#include +#include #include void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { @@ -1014,6 +1018,26 @@ ggml_tensor * llm_graph_context::build_ffn( switch (type_op) { case LLM_FFN_SILU: if (gate && type_gate == LLM_FFN_PAR) { + // Step35: HF clamps gate (after SiLU) and up before multiplication + if (arch == LLM_ARCH_STEP35 && il >= 0) { + const float limit = hparams.swiglu_limits_shared[il]; + constexpr float eps = 1e-6f; + if (limit > eps) { + ggml_tensor * gate_act = ggml_silu(ctx0, cur); + cb(gate_act, "ffn_silu", il); + gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit); + cb(gate_act, "ffn_silu_clamped", il); + + ggml_tensor * up_clamped = ggml_clamp(ctx0, tmp, -limit, limit); + cb(up_clamped, "ffn_up_clamped", il); + + cur = ggml_mul(ctx0, gate_act, up_clamped); + cb(cur, "ffn_swiglu_limited", il); + type_gate = LLM_FFN_SEQ; + break; + } + } + cur = ggml_swiglu_split(ctx0, cur, tmp); cb(cur, "ffn_swiglu", il); type_gate = LLM_FFN_SEQ; @@ -1266,8 +1290,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] cb(weights_sum, "ffn_moe_weights_sum", il); - // Avoid division by zero, clamp to smallest number representable by F16 - weights_sum = ggml_clamp(ctx0, weights_sum, 
6.103515625e-5, INFINITY); + // Avoid division by zero. + // Step35 HF uses +1e-20 in its renormalization (router_bias_func) + const float min_denom = (arch == LLM_ARCH_STEP35) ? 1e-20f : 6.103515625e-5f; + weights_sum = ggml_clamp(ctx0, weights_sum, min_denom, INFINITY); cb(weights_sum, "ffn_moe_weights_sum_clamped", il); weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] @@ -1316,6 +1342,25 @@ ggml_tensor * llm_graph_context::build_moe_ffn( switch (type_op) { case LLM_FFN_SILU: if (gate_exps) { + // Step35: per-layer clamp for routed experts + if (arch == LLM_ARCH_STEP35 && il >= 0) { + const float limit = hparams.swiglu_limits[il]; + constexpr float eps = 1e-6f; + if (limit > eps) { + ggml_tensor * gate_act = ggml_silu(ctx0, cur); + cb(gate_act, "ffn_moe_silu", il); + gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit); + cb(gate_act, "ffn_moe_silu_clamped", il); + + ggml_tensor * up_clamped = ggml_clamp(ctx0, up, -limit, limit); + cb(up_clamped, "ffn_moe_up_clamped", il); + + cur = ggml_mul(ctx0, gate_act, up_clamped); + cb(cur, "ffn_moe_swiglu_limited", il); + break; + } + } + cur = ggml_swiglu_split(ctx0, cur, up); cb(cur, "ffn_moe_swiglu", il); } else { diff --git a/src/llama-graph.h b/src/llama-graph.h index 1d69ff1a6fc..756e030e9b1 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -755,7 +755,6 @@ struct llm_graph_context { virtual ~llm_graph_context() = default; void cb(ggml_tensor * cur, const char * name, int il) const; - // // common // @@ -1016,6 +1015,6 @@ struct llm_graph_context { ggml_tensor * dense_2, ggml_tensor * dense_3) const; }; - +void llm_graph_dump_outputs_fp32(ggml_cgraph * gf, const char * out_dir = "dump_out"); // TODO: better name int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional); diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 756dda1a7ab..777f6df490a 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -232,3 +232,8 @@ uint32_t llama_hparams::n_layer_kv() const { bool llama_hparams::use_mrope() const { return rope_sections[0] > 0 && rope_sections[1] > 0; } + +uint32_t llama_hparams::rope_n_rot(uint32_t il) const { + const uint32_t v = rope_dim_per_layer[il]; + return v ? v : n_rot; +} \ No newline at end of file diff --git a/src/llama-hparams.h b/src/llama-hparams.h index a435043cfec..c4cf3c73c42 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -110,9 +110,17 @@ struct llama_hparams { float rope_freq_scale_train; float rope_freq_scale_train_swa = 1.0f; + bool has_rope_freq_base_per_layer = false; + std::array rope_freq_base_per_layer; + uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul = 0.0f; + // Step35: optionally apply rope_scaling only for certain attention types (HF "yarn_only_types"). + // bit0 -> apply on full/dense layers, bit1 -> apply on sliding/SWA layers. + // Default 3 keeps backwards compatibility (apply everywhere). + uint32_t rope_scaling_apply_mask = 0x3; + float yarn_ext_factor = -1.0f; float yarn_attn_factor = 1.0f; float yarn_beta_fast = 32.0f; @@ -206,6 +214,12 @@ struct llama_hparams { enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE; + std::array rope_dim_per_layer; + + // Step35: optional per-layer clamps for (Swi)GLU + std::array swiglu_limits; + std::array swiglu_limits_shared; + // this value n_pattern means that every nth layer is dense (i.e. 
non-SWA)
     // dense_first means whether the pattern is start with a dense layer
     // note that if n_pattern == 0, all layers are SWA
@@ -325,6 +339,8 @@ struct llama_hparams {
     bool use_mrope() const;
+
+    uint32_t rope_n_rot(uint32_t il) const;
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 1501e392ca8..67047a4a4c3 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -497,6 +497,7 @@ namespace GGUFMeta {
     template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required);
     template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required);
+    template bool llama_model_loader::get_key_or_arr(const std::string & key, std::array & result, uint32_t n, bool required);

 llama_model_loader::llama_model_loader(
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 765e4de2e49..666b401a008 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2482,6 +2482,47 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STEP35:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+
+                // MoE + SWA parameters
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+
+                // Step35 uses sigmoid gating by default (if not set in GGUF)
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+                }
+
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, hparams.rope_dim_per_layer, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS, hparams.swiglu_limits, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS_SHARED, hparams.swiglu_limits_shared, hparams.n_layer);
+
+                // Optional: Step35-only gating for applying rope scaling (HF: yarn_only_types).
+                // Default is 3 (apply on all layers) if the key is absent.
+ ml.get_key( + format("%s.rope.scaling.apply_mask", ml.get_arch_name().c_str()), + hparams.rope_scaling_apply_mask, + false + ); + + hparams.has_rope_freq_base_per_layer = ml.get_key_or_arr( + format("%s.rope.freq_base_per_layer", ml.get_arch_name().c_str()), + hparams.rope_freq_base_per_layer, + hparams.n_layer, + false + ); + + type = LLM_TYPE_UNKNOWN; + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -7107,6 +7148,72 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); } } break; + case LLM_ARCH_STEP35: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + // STEP35 supports per-layer partial RoPE dims; rope factors are stored as a single shared tensor + // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer. + uint32_t n_rot_max = 0; + for (int i = 0; i < n_layer; ++i) { + n_rot_max = std::max(n_rot_max, hparams.rope_n_rot(i)); + } + if (n_rot_max == 0) { + n_rot_max = n_rot; + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + const uint32_t n_head_l = hparams.n_head(i); + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); + + // optional rope factors (llama3) / longrope tensors + if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { + layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + } else { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); + } + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_l}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0); + + // head-wise attention gate (Step35 self_attn.g_proj) + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + // dense MLP (leading dense blocks) + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + + // MoE routed experts + selection bias (router_bias) + const int64_t n_ff_exp = hparams.n_ff_exp; + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + + // shared expert MLP + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED); + } + } break; case LLM_ARCH_MAINCODER: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -7620,6 +7727,9 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const { } float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const { + if (hparams.has_rope_freq_base_per_layer) { + return hparams.rope_freq_base_per_layer[il]; + } return hparams.is_swa(il) ? 
hparams.rope_freq_base_train_swa : cparams.rope_freq_base; } @@ -8257,6 +8367,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_STEP35: + { + llm = std::make_unique(*this, params); + } break; default: GGML_ABORT("fatal error"); } @@ -8502,6 +8616,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_AFMOE: case LLM_ARCH_QWEN3NEXT: case LLM_ARCH_MIMO2: + case LLM_ARCH_STEP35: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: diff --git a/src/models/models.h b/src/models/models.h index 71c1fe81084..8781f7fbf28 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -355,6 +355,10 @@ struct llm_build_mimo2_iswa : public llm_graph_context { llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_step35_iswa : public llm_graph_context { + llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_minicpm3 : public llm_graph_context { llm_build_minicpm3(const llama_model & model, const llm_graph_params & params); }; diff --git a/src/models/step35-iswa.cpp b/src/models/step35-iswa.cpp new file mode 100644 index 00000000000..74c114e777d --- /dev/null +++ b/src/models/step35-iswa.cpp @@ -0,0 +1,178 @@ +#include "models.h" + +llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv_iswa(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + const uint32_t n_head_l = hparams.n_head(il); + const uint32_t n_head_kv_l = hparams.n_head_kv(il); + + const float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + + cur = inpL; + + // dump pre-attn RMSNorm input to pinpoint layer boundary issues + cb(cur, "attn_norm_in", il); + + // self-attention + { + cur = build_norm(cur, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens); + + // Q/K per-head RMSNorm (Step35 q_norm / k_norm) + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + } + if (model.layers[il].attn_k_norm) { + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + } + + // RoPE (partial rotary factors per layer) + ggml_tensor * rope_factors = nullptr; + // Match HF behavior (Step35Attention): rope_scaling / rope_parameters may be gated by attention type. + // bit0 -> full/dense layers, bit1 -> sliding/SWA layers. 
+ const bool is_swa = hparams.is_swa(il); + const uint32_t apply_mask = hparams.rope_scaling_apply_mask; + if ((is_swa && (apply_mask & 0x2)) || (!is_swa && (apply_mask & 0x1))) { + rope_factors = model.get_rope_factors(cparams, il); + } + const int64_t n_rot_l = hparams.rope_n_rot(il); + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_pos", il); + cb(Kcur, "Kcur_pos", il); + + const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k)); + ggml_tensor * attn_out = build_attn(inp_attn, + nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(attn_out, "attn_out", il); + // head-wise attention gate: sigmoid(g_proj(x)) in torch + if (model.layers[il].wqkv_gate) { + ggml_tensor * gate = build_lora_mm(model.layers[il].wqkv_gate, cur); // [n_head_l, n_tokens] + cb(gate, "attn_gate", il); + + gate = ggml_sigmoid(ctx0, gate); + cb(gate, "attn_gate_sigmoid", il); + + // reshape + broadcast to [n_embd_head_v, n_head_l, n_tokens] + ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens); + ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens); + gate_3d = ggml_repeat(ctx0, gate_3d, attn_3d); + cb(gate_3d, "attn_gate_bcast", il); + + attn_3d = ggml_mul(ctx0, attn_3d, gate_3d); + cb(attn_3d, "attn_gated_3d", il); + + attn_out = ggml_cont_2d(ctx0, ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens), + n_embd_head_v * n_head_l, n_tokens); + cb(attn_out, "attn_gated", il); + } + + // output projection + cur = build_lora_mm(model.layers[il].wo, attn_out); + cb(cur, "attn_proj", il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward + if (model.layers[il].ffn_gate_inp == nullptr) { + // dense MLP + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, nullptr, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, nullptr, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, nullptr, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // MoE routed experts + const bool norm_w = hparams.expert_weights_norm; + const float w_scale = hparams.expert_weights_scale; + const bool scale_w = w_scale != 0.0f; + ggml_tensor * moe_out = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, + norm_w, scale_w, w_scale, + LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, + il); + cb(moe_out, "ffn_moe_out", il); + + // shared expert MLP (always added on MoE layers in Step35) + ggml_tensor * sh_out = build_ffn(cur, + model.layers[il].ffn_up_shexp, nullptr, nullptr, + model.layers[il].ffn_gate_shexp, nullptr, nullptr, + model.layers[il].ffn_down_shexp, nullptr, nullptr, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(sh_out, "ffn_shared_out", il); + + cur = ggml_add(ctx0, moe_out, 
sh_out); + cb(cur, "ffn_out", il); + } + cur = ggml_add(ctx0, cur, ffn_inp); + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} + + From 5c7c683684a81508986ecac583f31c34a567d305 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Tue, 3 Feb 2026 19:26:38 +0800 Subject: [PATCH 06/21] fix: norm.weight + 1 (HF zero_centered=true) --- convert_hf_to_gguf.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 42ec9c05044..475df869801 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8033,6 +8033,14 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): n_main = int(self.hparams.get("num_hidden_layers", self.block_count)) if il >= n_main: return [] + if name.endswith(".weight"): + if ( + name == "model.norm.weight" + or re.fullmatch(r"model\.layers\.\d+\.input_layernorm\.weight", name) is not None + or re.fullmatch(r"model\.layers\.\d+\.post_attention_layernorm\.weight", name) is not None + or re.fullmatch(r"model\.layers\.\d+\.self_attn\.(q_norm|k_norm)\.weight", name) is not None + ): + data_torch = data_torch + 1 # Map router bias (expert selection bias) to a GGUF bias tensor if name.endswith(".moe.router_bias"): return [(self.map_tensor_name(name + ".bias"), data_torch)] From d9d743193b6eeb53ddb882da6d51d77b1c090bc4 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Wed, 4 Feb 2026 19:15:17 +0800 Subject: [PATCH 07/21] step35: simplify GGUF conversion + drop redundant rope KVs --- convert_hf_to_gguf.py | 87 +++++++++---------------------------- gguf-py/gguf/constants.py | 3 +- gguf-py/gguf/gguf_writer.py | 16 +++---- src/llama-arch.cpp | 1 - src/llama-arch.h | 1 - src/llama-hparams.cpp | 5 --- src/llama-hparams.h | 10 ----- src/llama-model.cpp | 21 +-------- src/models/step35-iswa.cpp | 14 +++--- 9 files changed, 36 insertions(+), 122 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 475df869801..eba07b15533 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -920,7 +920,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_group_used_count(n_group_used) logger.info(f"gguf: expert groups used count = {n_group_used}") - if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation_func"], optional=True)) is not None: + if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None: if score_func == "sigmoid": self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) elif score_func == "softmax": @@ -7914,43 +7914,22 @@ def prepare_tensors(self): @ModelBase.register("Step3p5ForCausalLM") class Step35Model(TextModel): - """ - Step3.5 interleaved sliding-window attention + MoE with sigmoid routing and expert selection bias. 
- """ - model_arch = gguf.MODEL_ARCH.STEP35 def set_gguf_parameters(self): rope_theta_per_layer = None - rope_theta = self.hparams.get("rope_theta", None) + rope_theta = self.hparams.get("rope_theta") if isinstance(rope_theta, list): rope_theta_per_layer = rope_theta - if len(rope_theta) == 0: - raise ValueError("rope_theta list must not be empty") - rope_theta0 = float(rope_theta[0]) - self.hparams["rope_theta"] = rope_theta0 - if isinstance(getattr(self, "rope_parameters", None), dict) and isinstance(self.rope_parameters.get("rope_theta", None), list): - self.rope_parameters["rope_theta"] = rope_theta0 + self.hparams["rope_theta"] = float(rope_theta[0]) + self.hparams["local_rope_theta"] = float(rope_theta[1]) + self.rope_parameters["rope_theta"] = self.hparams["rope_theta"] + self.rope_parameters["sliding_attention"] = {"rope_theta": self.hparams["local_rope_theta"]} super().set_gguf_parameters() - def _truncate_to_block_count(name: str, values: list, *, allow_none: bool = False) -> list: - if not isinstance(values, list): - raise ValueError(f"{name} must be a list, got {type(values)}") - if len(values) < self.block_count: - raise ValueError(f"{name} must have length >= {self.block_count}, got {len(values)}") - if len(values) != self.block_count: - logger.warning( - "%s length mismatch: expected %d, got %d; truncating to %d", - name, self.block_count, len(values), self.block_count, - ) - values = values[: self.block_count] - if not allow_none and any(v is None for v in values): - raise ValueError(f"{name} must not contain None") - return values - layer_types = self.hparams.get("layer_types", []) - attn_other = self.hparams.get("attention_other_setting", {}) or {} + attn_other = self.hparams.get("attention_other_setting") or {} n_head_base = self.hparams["num_attention_heads"] n_kv_base = self.hparams["num_attention_groups"] @@ -7958,13 +7937,10 @@ def _truncate_to_block_count(name: str, values: list, *, allow_none: bool = Fals n_head_swa = attn_other.get("num_attention_heads", n_head_base) n_kv_swa = attn_other.get("num_attention_groups", n_kv_base) - if layer_types: - layer_types = _truncate_to_block_count("layer_types", layer_types, allow_none=False) - head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] - kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] - swa_pat = [1 if lt == "sliding_attention" else 0 for lt in layer_types] - else: - raise ValueError(f"layer_types is not set: {layer_types}") + layer_types = layer_types[: self.block_count] + head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] + kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] + swa_pat = [1 if lt == "sliding_attention" else 0 for lt in layer_types] self.gguf_writer.add_head_count(head_arr) self.gguf_writer.add_head_count_kv(kv_arr) @@ -7974,22 +7950,16 @@ def _truncate_to_block_count(name: str, values: list, *, allow_none: bool = Fals self.gguf_writer.add_value_length(self.hparams["head_dim"]) - # Whether rope_scaling/rope_parameters are applied - # based on attention type, encoded as a small bitmask: - # bit0 -> apply on full_attention (dense layers) - # bit1 -> apply on sliding_attention (SWA layers) - yarn_only_types = self.hparams.get("yarn_only_types", None) - self.gguf_writer.add_rope_scaling_apply_mask(yarn_only_types) - # MoE params self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"]) 
self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["share_expert_dim"]) - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - self.gguf_writer.add_expert_weights_scale(self.hparams.get("moe_router_scaling_factor", 1.0)) - self.gguf_writer.add_expert_weights_norm(bool(self.hparams.get("norm_expert_weight", False))) + if (moe_router_scaling_factor := self.hparams.get("moe_router_scaling_factor")) is not None: + self.gguf_writer.add_expert_weights_scale(moe_router_scaling_factor) + if (norm_expert_weight := self.hparams.get("norm_expert_weight")) is not None: + self.gguf_writer.add_expert_weights_norm(norm_expert_weight) # leading dense blocks leading_dense = 0 @@ -8001,30 +7971,15 @@ def _truncate_to_block_count(name: str, values: list, *, allow_none: bool = Fals self.gguf_writer.add_leading_dense_block_count(leading_dense) self.gguf_writer.add_moe_every_n_layers(int(self.hparams.get("moe_every_n_layer", 1))) - # RoPE: Step35 uses per-layer partial rotary factors; llama.cpp currently only supports a single rope dim. - # Check that partial_rotary_factors exists, is the right length, and all factors > 0 - partial_rotary_factors = self.hparams.get("partial_rotary_factors", None) - if partial_rotary_factors is None: - raise ValueError("partial_rotary_factors must be present in hparams") - partial_rotary_factors = _truncate_to_block_count("partial_rotary_factors", partial_rotary_factors, allow_none=False) - rope_dim_per_layer = [int(self.hparams["head_dim"] * factor) for factor in partial_rotary_factors] - self.gguf_writer.add_rope_dimension_count_per_layer(rope_dim_per_layer) self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) - # Step35: per-layer rope_theta support - if rope_theta_per_layer is not None: - rope_theta_per_layer = _truncate_to_block_count("rope_theta", rope_theta_per_layer, allow_none=False) - freq_base_per_layer = [float(v) for v in rope_theta_per_layer] - self.gguf_writer.add_array(f"{self.gguf_writer.arch}.rope.freq_base_per_layer", freq_base_per_layer) - # Optional per-layer SwiGLU clamps (HF: swiglu_limits / swiglu_limits_shared). 
- for key in ("swiglu_limits", "swiglu_limits_shared"): - limits = self.hparams.get(key, None) - if limits is None: - continue - limits = _truncate_to_block_count(key, limits, allow_none=True) - limits_f = [0.0 if v is None else float(v) for v in limits] - self.gguf_writer.add_array(f"{self.gguf_writer.arch}.{key}", limits_f) + if (limits := self.hparams.get("swiglu_limits")) is not None: + limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]] + self.gguf_writer.add_swiglu_limits(limits_f) + if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None: + limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]] + self.gguf_writer.add_swiglu_limits_shared(limits_shared_f) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): # remove mtp layers diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index bd04fec8654..06cc71b5e24 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -146,6 +146,8 @@ class LLM: ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx" ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs" EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input" + SWIGLU_LIMITS = "{arch}.swiglu_limits" + SWIGLU_LIMITS_SHARED = "{arch}.swiglu_limits_shared" DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" @@ -187,7 +189,6 @@ class Rope: SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_FACTOR = "{arch}.rope.scaling.factor" SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor" - SCALING_APPLY_MASK = "{arch}.rope.scaling.apply_mask" SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index df269ee09a8..d4a88901f31 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -824,6 +824,12 @@ def add_expert_weights_norm(self, value: bool) -> None: def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None: self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value) + def add_swiglu_limits(self, values: Sequence[float]) -> None: + self.add_array(Keys.LLM.SWIGLU_LIMITS.format(arch=self.arch), values) + + def add_swiglu_limits_shared(self, values: Sequence[float]) -> None: + self.add_array(Keys.LLM.SWIGLU_LIMITS_SHARED.format(arch=self.arch), values) + def add_expert_group_scale(self, value: float) -> None: self.add_float32(Keys.LLM.EXPERT_GROUP_SCALE.format(arch=self.arch), value) @@ -965,16 +971,6 @@ def add_rope_scaling_yarn_beta_fast(self, value: float) -> None: def add_rope_scaling_yarn_beta_slow(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_YARN_BETA_SLOW.format(arch=self.arch), value) - def add_rope_scaling_apply_mask(self, yarn_only_types: Sequence[str] | None) -> None: - apply_mask = 0x3 # default: apply on all layers (backwards compatible) - if isinstance(yarn_only_types, list): - apply_mask = 0 - if "full_attention" in yarn_only_types: - apply_mask |= 0x1 - if "sliding_attention" in yarn_only_types: - apply_mask |= 0x2 - self.add_uint32(Keys.Rope.SCALING_APPLY_MASK.format(arch=self.arch), int(apply_mask)) - def add_ssm_conv_kernel(self, value: int) -> None: self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 0e727b199d5..388ff4b712c 100644 --- a/src/llama-arch.cpp +++ 
b/src/llama-arch.cpp
@@ -224,7 +223,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
-    { LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, "%s.rope.dimension_count_per_layer" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
     { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 4760b283020..61b12ee2992 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -228,7 +228,6 @@ enum llm_kv {
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
     LLM_KV_ROPE_DIMENSION_COUNT,
-    LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_FREQ_BASE_SWA,
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 777f6df490a..ab9285b0c1b 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -231,9 +231,4 @@ uint32_t llama_hparams::n_layer_kv() const {
 bool llama_hparams::use_mrope() const {
     return rope_sections[0] > 0 && rope_sections[1] > 0;
-}
-
-uint32_t llama_hparams::rope_n_rot(uint32_t il) const {
-    const uint32_t v = rope_dim_per_layer[il];
-    return v ? v : n_rot;
 }
\ No newline at end of file
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index c4cf3c73c42..407852098e9 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -110,17 +110,9 @@ struct llama_hparams {
     float rope_freq_scale_train;
     float rope_freq_scale_train_swa = 1.0f;
-    bool has_rope_freq_base_per_layer = false;
-    std::array rope_freq_base_per_layer;
-
     uint32_t n_ctx_orig_yarn;
     float rope_yarn_log_mul = 0.0f;
-    // Step35: optionally apply rope_scaling only for certain attention types (HF "yarn_only_types").
-    // bit0 -> apply on full/dense layers, bit1 -> apply on sliding/SWA layers.
-    // Default 3 keeps backwards compatibility (apply everywhere).
-    uint32_t rope_scaling_apply_mask = 0x3;
-
     float yarn_ext_factor = -1.0f;
     float yarn_attn_factor = 1.0f;
     float yarn_beta_fast = 32.0f;
@@ -214,7 +206,6 @@ struct llama_hparams {
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
-    std::array rope_dim_per_layer;

     // Step35: optional per-layer clamps for (Swi)GLU
     std::array swiglu_limits;
     std::array swiglu_limits_shared;
@@ -340,7 +331,6 @@ struct llama_hparams {
     bool use_mrope() const;
-
-    uint32_t rope_n_rot(uint32_t il) const;
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 666b401a008..442fc27957c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2502,25 +2502,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                 ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_COUNT_PER_LAYER, hparams.rope_dim_per_layer, hparams.n_layer);
                 ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS, hparams.swiglu_limits, hparams.n_layer);
                 ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS_SHARED, hparams.swiglu_limits_shared, hparams.n_layer);
-                // Optional: Step35-only gating for applying rope scaling (HF: yarn_only_types).
-                // Default is 3 (apply on all layers) if the key is absent.
- ml.get_key( - format("%s.rope.scaling.apply_mask", ml.get_arch_name().c_str()), - hparams.rope_scaling_apply_mask, - false - ); - - hparams.has_rope_freq_base_per_layer = ml.get_key_or_arr( - format("%s.rope.freq_base_per_layer", ml.get_arch_name().c_str()), - hparams.rope_freq_base_per_layer, - hparams.n_layer, - false - ); - type = LLM_TYPE_UNKNOWN; } break; default: throw std::runtime_error("unsupported model architecture"); @@ -7160,7 +7144,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer. uint32_t n_rot_max = 0; for (int i = 0; i < n_layer; ++i) { - n_rot_max = std::max(n_rot_max, hparams.rope_n_rot(i)); + n_rot_max = std::max(n_rot_max, hparams.n_rot); } if (n_rot_max == 0) { n_rot_max = n_rot; @@ -7727,9 +7711,6 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const { } float llama_model::get_rope_freq_base (const llama_cparams & cparams, int il) const { - if (hparams.has_rope_freq_base_per_layer) { - return hparams.rope_freq_base_per_layer[il]; - } return hparams.is_swa(il) ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base; } diff --git a/src/models/step35-iswa.cpp b/src/models/step35-iswa.cpp index 74c114e777d..d97d74de2f6 100644 --- a/src/models/step35-iswa.cpp +++ b/src/models/step35-iswa.cpp @@ -50,15 +50,13 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll } // RoPE (partial rotary factors per layer) - ggml_tensor * rope_factors = nullptr; - // Match HF behavior (Step35Attention): rope_scaling / rope_parameters may be gated by attention type. - // bit0 -> full/dense layers, bit1 -> sliding/SWA layers. + // Step3.5 matches HF behavior: RoPE scaling is applied on dense (full-attention) layers only. + // We already have the SWA pattern in hparams (loaded from sliding_window_pattern), so no extra mask is needed. const bool is_swa = hparams.is_swa(il); - const uint32_t apply_mask = hparams.rope_scaling_apply_mask; - if ((is_swa && (apply_mask & 0x2)) || (!is_swa && (apply_mask & 0x1))) { - rope_factors = model.get_rope_factors(cparams, il); - } - const int64_t n_rot_l = hparams.rope_n_rot(il); + ggml_tensor * rope_factors = is_swa ? nullptr : model.get_rope_factors(cparams, il); + // Step3.5 partial rotary factors are tied to the SWA pattern: + // dense layers use half-rotary, SWA layers use full rotary. + const int64_t n_rot_l = is_swa ? 
hparams.n_rot : (hparams.n_rot / 2); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, rope_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, From 34a4d1ab2079aab3874f7a7f5f991b9787b66151 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Wed, 4 Feb 2026 20:15:47 +0800 Subject: [PATCH 08/21] Address review feedback --- convert_hf_to_gguf.py | 12 +++--------- src/llama-graph.cpp | 6 ++---- src/llama-hparams.h | 4 ++-- src/models/step35-iswa.cpp | 4 ---- 4 files changed, 7 insertions(+), 19 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index eba07b15533..c5abfc943eb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -831,7 +831,7 @@ def prepare_metadata(self, vocab_only: bool): def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) - if (n_ctx := self.find_hparam(["max_position_embeddings", "max_position_embedding", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None: + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") @@ -7988,14 +7988,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): n_main = int(self.hparams.get("num_hidden_layers", self.block_count)) if il >= n_main: return [] - if name.endswith(".weight"): - if ( - name == "model.norm.weight" - or re.fullmatch(r"model\.layers\.\d+\.input_layernorm\.weight", name) is not None - or re.fullmatch(r"model\.layers\.\d+\.post_attention_layernorm\.weight", name) is not None - or re.fullmatch(r"model\.layers\.\d+\.self_attn\.(q_norm|k_norm)\.weight", name) is not None - ): - data_torch = data_torch + 1 + if name.endswith("norm.weight"): + data_torch += 1.0 # Map router bias (expert selection bias) to a GGUF bias tensor if name.endswith(".moe.router_bias"): return [(self.map_tensor_name(name + ".bias"), data_torch)] diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index ca7c4fb6844..3d7ac108859 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1290,10 +1290,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens] cb(weights_sum, "ffn_moe_weights_sum", il); - // Avoid division by zero. - // Step35 HF uses +1e-20 in its renormalization (router_bias_func) - const float min_denom = (arch == LLM_ARCH_STEP35) ? 1e-20f : 6.103515625e-5f; - weights_sum = ggml_clamp(ctx0, weights_sum, min_denom, INFINITY); + // Avoid division by zero, clamp to smallest number representable by F16 + weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY); cb(weights_sum, "ffn_moe_weights_sum_clamped", il); weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens] diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 407852098e9..9e2770087ec 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -208,8 +208,8 @@ struct llama_hparams { // Step35: optional per-layer clamps for (Swi)GLU - std::array swiglu_limits; - std::array swiglu_limits_shared; + std::array swiglu_clamp_exp; // clamping for expert FFN + std::array swiglu_clamp_shexp; // shared expert // this value n_pattern means that every nth layer is dense (i.e. 
non-SWA) // dense_first means whether the pattern is start with a dense layer diff --git a/src/models/step35-iswa.cpp b/src/models/step35-iswa.cpp index d97d74de2f6..e6368bc583d 100644 --- a/src/models/step35-iswa.cpp +++ b/src/models/step35-iswa.cpp @@ -50,12 +50,8 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll } // RoPE (partial rotary factors per layer) - // Step3.5 matches HF behavior: RoPE scaling is applied on dense (full-attention) layers only. - // We already have the SWA pattern in hparams (loaded from sliding_window_pattern), so no extra mask is needed. const bool is_swa = hparams.is_swa(il); ggml_tensor * rope_factors = is_swa ? nullptr : model.get_rope_factors(cparams, il); - // Step3.5 partial rotary factors are tied to the SWA pattern: - // dense layers use half-rotary, SWA layers use full rotary. const int64_t n_rot_l = is_swa ? hparams.n_rot : (hparams.n_rot / 2); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, rope_factors, From 60ab1829107929c80b965b646287b260e4e3b08b Mon Sep 17 00:00:00 2001 From: lvyichen Date: Wed, 4 Feb 2026 21:03:32 +0800 Subject: [PATCH 09/21] rename limits -> clamp --- src/llama-graph.cpp | 4 ++-- src/llama-model.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 3d7ac108859..ffc86193076 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1020,7 +1020,7 @@ ggml_tensor * llm_graph_context::build_ffn( if (gate && type_gate == LLM_FFN_PAR) { // Step35: HF clamps gate (after SiLU) and up before multiplication if (arch == LLM_ARCH_STEP35 && il >= 0) { - const float limit = hparams.swiglu_limits_shared[il]; + const float limit = hparams.swiglu_clamp_shexp[il]; constexpr float eps = 1e-6f; if (limit > eps) { ggml_tensor * gate_act = ggml_silu(ctx0, cur); @@ -1342,7 +1342,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( if (gate_exps) { // Step35: per-layer clamp for routed experts if (arch == LLM_ARCH_STEP35 && il >= 0) { - const float limit = hparams.swiglu_limits[il]; + const float limit = hparams.swiglu_clamp_exp[il]; constexpr float eps = 1e-6f; if (limit > eps) { ggml_tensor * gate_act = ggml_silu(ctx0, cur); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 442fc27957c..5fdb11a128d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2502,8 +2502,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS, hparams.swiglu_limits, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS_SHARED, hparams.swiglu_limits_shared, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS, hparams.swiglu_clamp_exp, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS_SHARED, hparams.swiglu_clamp_shexp, hparams.n_layer); type = LLM_TYPE_UNKNOWN; } break; From 0293d363ae99d962d477c4349cfade3f2552ad9c Mon Sep 17 00:00:00 2001 From: forforever73 <63285796+forforever73@users.noreply.github.com> Date: Wed, 4 Feb 2026 20:19:49 +0800 Subject: [PATCH 10/21] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c5abfc943eb..915bdc32300 100755 --- a/convert_hf_to_gguf.py 
+++ b/convert_hf_to_gguf.py @@ -7992,11 +7992,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): data_torch += 1.0 # Map router bias (expert selection bias) to a GGUF bias tensor if name.endswith(".moe.router_bias"): - return [(self.map_tensor_name(name + ".bias"), data_torch)] + name += ".bias" if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")): - w = data_torch.squeeze() - return [(self.map_tensor_name(name), w.contiguous())] + data_torch = data_torch.squeeze().contiguous() return super().modify_tensors(data_torch, name, bid) @@ -8004,9 +8003,9 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3"). # llama.cpp represents this via a single extra tensor: "rope_freqs.weight" (aka MODEL_TENSOR.ROPE_FREQS). rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) - rope_type = (rope_params.get("rope_type") or rope_params.get("type") or "") + rope_type = rope_params.get("rope_type") or "" if rope_type.lower() != "llama3": - return () + return # Step35 configs can carry per-layer rope_theta as a list; for llama3 rope factors we use the base value. rope_theta = self.hparams.get("rope_theta", 10000.0) @@ -8015,8 +8014,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: raise ValueError("rope_theta list must not be empty") rope_theta = rope_theta[0] base = float(rope_theta) - dim = self.hparams.get("head_dim") - if dim is None: + if (dim := self.hparams.get("head_dim")) is None: dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] dim = int(dim) From ff62b6c2660d429b6c2f12ce01e686c51001c2b9 Mon Sep 17 00:00:00 2001 From: forforever73 <63285796+forforever73@users.noreply.github.com> Date: Wed, 4 Feb 2026 20:20:44 +0800 Subject: [PATCH 11/21] Apply suggestion from @CISC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 915bdc32300..480970fe2a6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8010,8 +8010,6 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: # Step35 configs can carry per-layer rope_theta as a list; for llama3 rope factors we use the base value. 
rope_theta = self.hparams.get("rope_theta", 10000.0) if isinstance(rope_theta, list): - if len(rope_theta) == 0: - raise ValueError("rope_theta list must not be empty") rope_theta = rope_theta[0] base = float(rope_theta) if (dim := self.hparams.get("head_dim")) is None: From 512a73537951898d4e8d1b53bfef761f691e3105 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Wed, 4 Feb 2026 21:27:09 +0800 Subject: [PATCH 12/21] rename swiglu limits -> swiglu clamp in LLM_KV --- convert_hf_to_gguf.py | 6 +++--- gguf-py/gguf/constants.py | 4 ++-- gguf-py/gguf/gguf_writer.py | 8 ++++---- src/llama-arch.cpp | 4 ++-- src/llama-arch.h | 4 ++-- src/llama-model.cpp | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 480970fe2a6..edb75bf45ba 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7973,13 +7973,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) - # Optional per-layer SwiGLU clamps (HF: swiglu_limits / swiglu_limits_shared). + # Optional per-layer SwiGLU clamps. if (limits := self.hparams.get("swiglu_limits")) is not None: limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]] - self.gguf_writer.add_swiglu_limits(limits_f) + self.gguf_writer.add_swiglu_clamp_exp(limits_f) if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None: limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]] - self.gguf_writer.add_swiglu_limits_shared(limits_shared_f) + self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): # remove mtp layers diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 06cc71b5e24..bae3045857b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -146,8 +146,8 @@ class LLM: ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx" ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs" EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input" - SWIGLU_LIMITS = "{arch}.swiglu_limits" - SWIGLU_LIMITS_SHARED = "{arch}.swiglu_limits_shared" + SWIGLU_CLAMP_EXP = "{arch}.swiglu_clamp_exp" + SWIGLU_CLAMP_SHEXP = "{arch}.swiglu_clamp_shexp" DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index d4a88901f31..cf86f579490 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -824,11 +824,11 @@ def add_expert_weights_norm(self, value: bool) -> None: def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None: self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value) - def add_swiglu_limits(self, values: Sequence[float]) -> None: - self.add_array(Keys.LLM.SWIGLU_LIMITS.format(arch=self.arch), values) + def add_swiglu_clamp_exp(self, values: Sequence[float]) -> None: + self.add_array(Keys.LLM.SWIGLU_CLAMP_EXP.format(arch=self.arch), values) - def add_swiglu_limits_shared(self, values: Sequence[float]) -> None: - self.add_array(Keys.LLM.SWIGLU_LIMITS_SHARED.format(arch=self.arch), values) + def add_swiglu_clamp_shexp(self, values: Sequence[float]) -> None: + self.add_array(Keys.LLM.SWIGLU_CLAMP_SHEXP.format(arch=self.arch), values) def add_expert_group_scale(self, value: float) -> None: self.add_float32(Keys.LLM.EXPERT_GROUP_SCALE.format(arch=self.arch), value) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp 
index 388ff4b712c..bd78f1e5562 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -163,8 +163,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" }, { LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, "%s.expert_shared_feed_forward_length" }, { LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, "%s.expert_chunk_feed_forward_length" }, - { LLM_KV_SWIGLU_LIMITS, "%s.swiglu_limits" }, - { LLM_KV_SWIGLU_LIMITS_SHARED, "%s.swiglu_limits_shared" }, + { LLM_KV_SWIGLU_CLAMP_EXP, "%s.swiglu_clamp_exp" }, + { LLM_KV_SWIGLU_CLAMP_SHEXP, "%s.swiglu_clamp_shexp" }, { LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" }, { LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" }, { LLM_KV_EXPERT_COUNT, "%s.expert_count" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 61b12ee2992..e8263369b80 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -167,8 +167,8 @@ enum llm_kv { LLM_KV_EXPERT_FEED_FORWARD_LENGTH, LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, LLM_KV_EXPERT_CHUNK_FEED_FORWARD_LENGTH, - LLM_KV_SWIGLU_LIMITS, - LLM_KV_SWIGLU_LIMITS_SHARED, + LLM_KV_SWIGLU_CLAMP_EXP, + LLM_KV_SWIGLU_CLAMP_SHEXP, LLM_KV_USE_PARALLEL_RESIDUAL, LLM_KV_TENSOR_DATA_LAYOUT, LLM_KV_EXPERT_COUNT, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5fdb11a128d..4a3b80e8536 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2502,8 +2502,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS, hparams.swiglu_clamp_exp, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_SWIGLU_LIMITS_SHARED, hparams.swiglu_clamp_shexp, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer); type = LLM_TYPE_UNKNOWN; } break; From 19cfffe10e425ef8371a82a6b3d2eeee0271365c Mon Sep 17 00:00:00 2001 From: lvyichen Date: Wed, 4 Feb 2026 21:57:25 +0800 Subject: [PATCH 13/21] avoid CI fail --- convert_hf_to_gguf.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index edb75bf45ba..d037ab402e0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7928,19 +7928,19 @@ def set_gguf_parameters(self): super().set_gguf_parameters() - layer_types = self.hparams.get("layer_types", []) + layer_types = self.hparams.get("layer_types") or [] attn_other = self.hparams.get("attention_other_setting") or {} n_head_base = self.hparams["num_attention_heads"] - n_kv_base = self.hparams["num_attention_groups"] + n_kv_base = self.hparams["num_attention_groups"] n_head_swa = attn_other.get("num_attention_heads", n_head_base) - n_kv_swa = attn_other.get("num_attention_groups", n_kv_base) + n_kv_swa = attn_other.get("num_attention_groups", n_kv_base) layer_types = layer_types[: self.block_count] head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] - kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] - swa_pat = [1 if lt == "sliding_attention" else 0 for lt in layer_types] + kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] + swa_pat = [lt == "sliding_attention" for lt in layer_types] self.gguf_writer.add_head_count(head_arr) 
self.gguf_writer.add_head_count_kv(kv_arr) From f7ca9959f4c31f306e2f214132077ab74bbb00a9 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Thu, 5 Feb 2026 19:23:52 +0800 Subject: [PATCH 14/21] Apply suggestions from code review --- src/llama-graph.cpp | 12 ++++++------ src/llama-graph.h | 2 +- src/llama-hparams.cpp | 2 +- src/llama-model-loader.cpp | 1 - src/llama-model.cpp | 6 ++++-- src/models/models.h | 8 ++++---- 6 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index ffc86193076..7afd118847e 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1028,10 +1028,10 @@ ggml_tensor * llm_graph_context::build_ffn( gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit); cb(gate_act, "ffn_silu_clamped", il); - ggml_tensor * up_clamped = ggml_clamp(ctx0, tmp, -limit, limit); - cb(up_clamped, "ffn_up_clamped", il); + tmp = ggml_clamp(ctx0, tmp, -limit, limit); + cb(tmp, "ffn_up_clamped", il); - cur = ggml_mul(ctx0, gate_act, up_clamped); + cur = ggml_mul(ctx0, gate_act, tmp); cb(cur, "ffn_swiglu_limited", il); type_gate = LLM_FFN_SEQ; break; @@ -1350,10 +1350,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn( gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit); cb(gate_act, "ffn_moe_silu_clamped", il); - ggml_tensor * up_clamped = ggml_clamp(ctx0, up, -limit, limit); - cb(up_clamped, "ffn_moe_up_clamped", il); + up = ggml_clamp(ctx0, up, -limit, limit); + cb(up, "ffn_moe_up_clamped", il); - cur = ggml_mul(ctx0, gate_act, up_clamped); + cur = ggml_mul(ctx0, gate_act, up); cb(cur, "ffn_moe_swiglu_limited", il); break; } diff --git a/src/llama-graph.h b/src/llama-graph.h index 756e030e9b1..81b314ceefd 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -1015,6 +1015,6 @@ struct llm_graph_context { ggml_tensor * dense_2, ggml_tensor * dense_3) const; }; -void llm_graph_dump_outputs_fp32(ggml_cgraph * gf, const char * out_dir = "dump_out"); + // TODO: better name int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional); diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index ab9285b0c1b..756dda1a7ab 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -231,4 +231,4 @@ uint32_t llama_hparams::n_layer_kv() const { bool llama_hparams::use_mrope() const { return rope_sections[0] > 0 && rope_sections[1] > 0; -} \ No newline at end of file +} diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 67047a4a4c3..1501e392ca8 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -497,7 +497,6 @@ namespace GGUFMeta { template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); - template bool llama_model_loader::get_key_or_arr(const std::string & key, std::array & result, uint32_t n, bool required); llama_model_loader::llama_model_loader( diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4a3b80e8536..22710bc4bcb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -560,6 +560,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f); std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f); std::fill(hparams.xielu_eps.begin(), 
hparams.xielu_eps.end(), 0.0f); + std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f); + std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f); ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); @@ -2502,8 +2504,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false); type = LLM_TYPE_UNKNOWN; } break; diff --git a/src/models/models.h b/src/models/models.h index 8781f7fbf28..cfcbb9aaa5b 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -355,10 +355,6 @@ struct llm_build_mimo2_iswa : public llm_graph_context { llm_build_mimo2_iswa(const llama_model & model, const llm_graph_params & params); }; -struct llm_build_step35_iswa : public llm_graph_context { - llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params); -}; - struct llm_build_minicpm3 : public llm_graph_context { llm_build_minicpm3(const llama_model & model, const llm_graph_params & params); }; @@ -587,6 +583,10 @@ struct llm_build_starcoder : public llm_graph_context { llm_build_starcoder(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_step35_iswa : public llm_graph_context { + llm_build_step35_iswa(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_t5_dec : public llm_graph_context { llm_build_t5_dec(const llama_model & model, const llm_graph_params & params); }; From aea967fd178c9fdf3c9b1ef428cef1d18b9e40b9 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Thu, 5 Feb 2026 19:28:17 +0800 Subject: [PATCH 15/21] Apply suggestions from code review --- gguf-py/gguf/constants.py | 2 -- gguf-py/gguf/gguf_writer.py | 3 --- src/llama-graph.h | 1 + src/llama-hparams.h | 1 - src/models/step35-iswa.cpp | 2 +- 5 files changed, 2 insertions(+), 7 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index bae3045857b..3af4fffe957 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -182,7 +182,6 @@ class Attention: class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" - DIMENSION_COUNT_PER_LAYER = "{arch}.rope.dimension_count_per_layer" DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" FREQ_BASE = "{arch}.rope.freq_base" FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" @@ -3785,7 +3784,6 @@ class VisionProjectorType: # RoPE KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT -KEY_ROPE_DIMENSION_COUNT_PER_LAYER = Keys.Rope.DIMENSION_COUNT_PER_LAYER KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index cf86f579490..62172b24c38 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -931,9 +931,6 @@ def add_num_deepstack_layers(self, count: int) -> None: def add_rope_dimension_count(self, count: 
int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) - - def add_rope_dimension_count_per_layer(self, values: Sequence[int]) -> None: - self.add_array(Keys.Rope.DIMENSION_COUNT_PER_LAYER.format(arch=self.arch), values) def add_rope_dimension_sections(self, dims: Sequence[int]) -> None: self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims) diff --git a/src/llama-graph.h b/src/llama-graph.h index 81b314ceefd..1d69ff1a6fc 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -755,6 +755,7 @@ struct llm_graph_context { virtual ~llm_graph_context() = default; void cb(ggml_tensor * cur, const char * name, int il) const; + // // common // diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 9e2770087ec..6c695bdbf66 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -330,7 +330,6 @@ struct llama_hparams { bool use_mrope() const; - }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); diff --git a/src/models/step35-iswa.cpp b/src/models/step35-iswa.cpp index e6368bc583d..99982a9ea48 100644 --- a/src/models/step35-iswa.cpp +++ b/src/models/step35-iswa.cpp @@ -6,7 +6,7 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll inpL = build_inp_embd(model.tok_embd); ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_iswa(); + auto * inp_attn = build_attn_inp_kv_iswa(); ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { From 4e6e2427ca14cc7b4ef7da39432d3ae8a6ef2d24 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Fri, 6 Feb 2026 00:24:37 +0800 Subject: [PATCH 16/21] disabled KV shifting for LLM_ARCH_STEP35 --- convert_hf_to_gguf.py | 4 ++-- src/CMakeLists.txt | 2 +- src/llama-kv-cache-iswa.cpp | 4 +++- src/llama-kv-cache.cpp | 4 ++++ src/models/step35-iswa.cpp | 6 ++---- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d037ab402e0..c4b2300be84 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7987,7 +7987,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): il = int(m.group(1)) n_main = int(self.hparams.get("num_hidden_layers", self.block_count)) if il >= n_main: - return [] + return if name.endswith("norm.weight"): data_torch += 1.0 # Map router bias (expert selection bias) to a GGUF bias tensor @@ -7997,7 +7997,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")): data_torch = data_torch.squeeze().contiguous() - return super().modify_tensors(data_torch, name, bid) + yield from super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: # Step35 can optionally use Llama-3 style RoPE scaling (HF: rope_scaling.rope_type == "llama3"). 
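For context on the rope_freqs.weight tensor mentioned in the comment above: a minimal sketch, assuming the usual HF rope_scaling fields (factor, low_freq_factor, high_freq_factor, original_max_position_embeddings), of how llama3-style per-dimension rope factors are typically derived from the base frequency. The function name and default values here are illustrative assumptions, not part of this patch series; the runtime divides each RoPE inverse frequency by the corresponding factor.

    import math

    def llama3_rope_factors(dim: int, base: float, factor: float = 8.0,
                            low_freq_factor: float = 1.0, high_freq_factor: float = 4.0,
                            old_context_len: int = 8192) -> list[float]:
        # wavelength thresholds that split dimensions into unscaled / scaled / blended bands
        low_freq_wavelen  = old_context_len / low_freq_factor
        high_freq_wavelen = old_context_len / high_freq_factor
        factors = []
        for i in range(0, dim, 2):
            freq = 1.0 / (base ** (i / dim))   # standard RoPE inverse frequency
            wavelen = 2 * math.pi / freq
            if wavelen < high_freq_wavelen:    # high-frequency dims: left unscaled
                factors.append(1.0)
            elif wavelen > low_freq_wavelen:   # low-frequency dims: fully scaled
                factors.append(factor)
            else:                              # smooth interpolation in between
                smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
        return factors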
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7327784e628..2115fc4255f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -93,7 +93,6 @@ add_library(llama models/maincoder.cpp models/mamba.cpp models/mimo2-iswa.cpp - models/step35-iswa.cpp models/minicpm3.cpp models/minimax-m2.cpp models/modern-bert.cpp @@ -136,6 +135,7 @@ add_library(llama models/stablelm.cpp models/starcoder.cpp models/starcoder2.cpp + models/step35-iswa.cpp models/t5-dec.cpp models/t5-enc.cpp models/wavtokenizer-dec.cpp diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index 3a34102a23d..26e2cb4270b 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -218,7 +218,9 @@ llama_memory_context_ptr llama_kv_cache_iswa::init_update(llama_context * lctx, } bool llama_kv_cache_iswa::get_can_shift() const { - return kv_base->get_size() == kv_swa->get_size(); + return kv_base->get_can_shift() && + kv_swa->get_can_shift() && + kv_base->get_size() == kv_swa->get_size(); } void llama_kv_cache_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index c35cd6761b1..cb702b2a59f 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -974,6 +974,10 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & } bool llama_kv_cache::get_can_shift() const { + // Step35 uses per-layer RoPE dims; K-shift assumes a single global n_rot. + if (model.arch == LLM_ARCH_STEP35) { + return false; + } return true; } diff --git a/src/models/step35-iswa.cpp b/src/models/step35-iswa.cpp index 99982a9ea48..5755da68f1b 100644 --- a/src/models/step35-iswa.cpp +++ b/src/models/step35-iswa.cpp @@ -82,14 +82,12 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll // reshape + broadcast to [n_embd_head_v, n_head_l, n_tokens] ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens); ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens); - gate_3d = ggml_repeat(ctx0, gate_3d, attn_3d); - cb(gate_3d, "attn_gate_bcast", il); + cb(gate_3d, "attn_gate_3d", il); attn_3d = ggml_mul(ctx0, attn_3d, gate_3d); cb(attn_3d, "attn_gated_3d", il); - attn_out = ggml_cont_2d(ctx0, ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens), - n_embd_head_v * n_head_l, n_tokens); + attn_out = ggml_cont_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens); cb(attn_out, "attn_gated", il); } From a7e96cf0d7e375ace07ca92765e1019c6d314c6a Mon Sep 17 00:00:00 2001 From: lvyichen Date: Fri, 6 Feb 2026 10:54:16 +0800 Subject: [PATCH 17/21] Apply suggestions from code review --- src/llama-graph.cpp | 4 ---- src/models/step35-iswa.cpp | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 7afd118847e..a898c78d195 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -9,11 +9,7 @@ #include "llama-memory-hybrid.h" #include "llama-memory-hybrid-iswa.h" #include "llama-memory-recurrent.h" -#include "llama-mmap.h" - -#include #include -#include #include #include #include diff --git a/src/models/step35-iswa.cpp b/src/models/step35-iswa.cpp index 5755da68f1b..4a0694c7f05 100644 --- a/src/models/step35-iswa.cpp +++ b/src/models/step35-iswa.cpp @@ -87,7 +87,7 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll attn_3d = ggml_mul(ctx0, attn_3d, gate_3d); cb(attn_3d, "attn_gated_3d", il); - attn_out = 
ggml_cont_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens); + attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens); cb(attn_out, "attn_gated", il); } From 46e843145abd6f9f866210ce234488abbe9d8d32 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Fri, 6 Feb 2026 11:29:35 +0800 Subject: [PATCH 18/21] mistakenly removed cmath --- src/llama-graph.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index a898c78d195..bba747d37b5 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -9,7 +9,9 @@ #include "llama-memory-hybrid.h" #include "llama-memory-hybrid-iswa.h" #include "llama-memory-recurrent.h" + #include +#include #include #include #include From f542d91c524bc2ba985d949d22f879953613af67 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Fri, 6 Feb 2026 17:00:00 +0800 Subject: [PATCH 19/21] add model size && apply missed suggestion --- convert_hf_to_gguf.py | 4 ++-- src/llama-model.cpp | 6 +++++- src/llama-model.h | 1 + src/models/step35-iswa.cpp | 4 +--- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c4b2300be84..6eeb66c7f35 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7939,8 +7939,8 @@ def set_gguf_parameters(self): layer_types = layer_types[: self.block_count] head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] - kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] - swa_pat = [lt == "sliding_attention" for lt in layer_types] + kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] + swa_pat = [lt == "sliding_attention" for lt in layer_types] self.gguf_writer.add_head_count(head_arr) self.gguf_writer.add_head_count_kv(kv_arr) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 22710bc4bcb..9643f21d8c7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -130,6 +130,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_100B_A6B: return "100B.A6B"; case LLM_TYPE_102B_A12B: return "102B.A12B"; case LLM_TYPE_106B_A12B: return "106B.A12B"; + case LLM_TYPE_196B_A11B: return "196B.A11B"; case LLM_TYPE_230B_A10B: return "230B.A10B"; case LLM_TYPE_235B_A22B: return "235B.A22B"; case LLM_TYPE_300B_A47B: return "300B.A47B"; @@ -2507,7 +2508,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false); ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false); - type = LLM_TYPE_UNKNOWN; + switch (hparams.n_layer) { + case 45: type = LLM_TYPE_196B_A11B; break; + default: type = LLM_TYPE_UNKNOWN; + } } break; default: throw std::runtime_error("unsupported model architecture"); } diff --git a/src/llama-model.h b/src/llama-model.h index 5b408bcea25..7b580043b33 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -123,6 +123,7 @@ enum llm_type { LLM_TYPE_100B_A6B, LLM_TYPE_102B_A12B, // Solar-Open LLM_TYPE_106B_A12B, // GLM-4.5-Air + LLM_TYPE_196B_A11B, // Step3.5-Flash LLM_TYPE_230B_A10B, // Minimax M2 LLM_TYPE_235B_A22B, LLM_TYPE_300B_A47B, // Ernie MoE big diff --git a/src/models/step35-iswa.cpp b/src/models/step35-iswa.cpp index 4a0694c7f05..e5b2e056c08 100644 --- a/src/models/step35-iswa.cpp +++ b/src/models/step35-iswa.cpp @@ -131,7 +131,7 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll n_expert, n_expert_used, LLM_FFN_SILU, norm_w, 
scale_w, w_scale, - LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID, + (llama_expert_gating_func_type) hparams.expert_gating_func, il); cb(moe_out, "ffn_moe_out", il); @@ -166,5 +166,3 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll ggml_build_forward_expand(gf, cur); } - - From 430da166d3b140039a68838dfac30cfd9f76bf7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 6 Feb 2026 10:55:20 +0100 Subject: [PATCH 20/21] assert partial_rotary_factors --- convert_hf_to_gguf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6eeb66c7f35..e3e58ebe4fc 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7929,6 +7929,7 @@ def set_gguf_parameters(self): super().set_gguf_parameters() layer_types = self.hparams.get("layer_types") or [] + partial_rotary_factors = self.hparams.get("partial_rotary_factors") or [] attn_other = self.hparams.get("attention_other_setting") or {} n_head_base = self.hparams["num_attention_heads"] @@ -7938,6 +7939,8 @@ def set_gguf_parameters(self): n_kv_swa = attn_other.get("num_attention_groups", n_kv_base) layer_types = layer_types[: self.block_count] + partial_rotary_factors = partial_rotary_factors[: self.block_count] + assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] swa_pat = [lt == "sliding_attention" for lt in layer_types] From 402fc2e4ee3149541e0f26e1bc7adad1595bed28 Mon Sep 17 00:00:00 2001 From: lvyichen Date: Fri, 6 Feb 2026 19:54:13 +0800 Subject: [PATCH 21/21] fix CI errors: --- convert_hf_to_gguf.py | 5 ++--- src/models/step35-iswa.cpp | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e3e58ebe4fc..843c00a8969 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7917,10 +7917,8 @@ class Step35Model(TextModel): model_arch = gguf.MODEL_ARCH.STEP35 def set_gguf_parameters(self): - rope_theta_per_layer = None rope_theta = self.hparams.get("rope_theta") if isinstance(rope_theta, list): - rope_theta_per_layer = rope_theta self.hparams["rope_theta"] = float(rope_theta[0]) self.hparams["local_rope_theta"] = float(rope_theta[1]) self.rope_parameters["rope_theta"] = self.hparams["rope_theta"] @@ -7996,7 +7994,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): # Map router bias (expert selection bias) to a GGUF bias tensor if name.endswith(".moe.router_bias"): name += ".bias" - + if name.endswith((".self_attn.g_proj.weight", ".moe.gate.weight", ".moe.up_proj.weight", ".moe.gate_proj.weight", ".moe.down_proj.weight")): data_torch = data_torch.squeeze().contiguous() @@ -8042,6 +8040,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + @ModelBase.register("PanguEmbeddedForCausalLM") class PanguEmbeddedModel(TextModel): model_arch = gguf.MODEL_ARCH.PANGU_EMBED diff --git a/src/models/step35-iswa.cpp b/src/models/step35-iswa.cpp index e5b2e056c08..f8737815a67 100644 --- a/src/models/step35-iswa.cpp +++ b/src/models/step35-iswa.cpp @@ -8,7 +8,7 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll ggml_tensor * inp_pos = build_inp_pos(); auto * inp_attn = 
build_attn_inp_kv_iswa(); ggml_tensor * inp_out_ids = build_inp_out_ids(); - + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -103,7 +103,7 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il);