From 92ac1e016b4327bb58f62a098cd6bc484d9d6cbf Mon Sep 17 00:00:00 2001 From: Francisco Herrera Date: Sun, 4 Jan 2026 23:39:25 -0500 Subject: [PATCH 1/3] doc: clarify that steps also apply to linux for opencl (#18002) * Clarify setup steps for Linux Added note that setup steps apply to Linux as well. * Added note for backtick replacement * clarify that backtick replacement only applies on linux * clarified Linux specific steps So actually some changes are needed for Linux but they are minor. * clarify change execution * clarify by placing info after steps * clarify which steps * Make instructions consistent across OSes * Rm whitespace * Update docs/backend/OPENCL.md Co-authored-by: Aaron Teo * Update docs/backend/OPENCL.md Co-authored-by: Aaron Teo * Update docs/backend/OPENCL.md Co-authored-by: Aaron Teo --------- Co-authored-by: Aaron Teo --- docs/backend/OPENCL.md | 50 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/docs/backend/OPENCL.md b/docs/backend/OPENCL.md index ce6c7b5605a..0561a74c47e 100644 --- a/docs/backend/OPENCL.md +++ b/docs/backend/OPENCL.md @@ -218,6 +218,56 @@ cmake .. -G Ninja ` ninja ``` +## Linux + +The two steps just above also apply to Linux. When building for linux, the commands are mostly the same as those for PowerShell on Windows, but in the second step they do not have the `-DCMAKE_TOOLCHAIN_FILE` parameter, and then in both steps the backticks are replaced with back slashes. + +If not installed already, install Git, CMake, Clang, Ninja and Python, then run in the terminal the following: + +### I. Setup Environment + +1. **Install OpenCL Headers and Library** + +```bash +mkdir -p ~/dev/llm + +cd ~/dev/llm +git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers +mkdir build && cd build +cmake .. -G Ninja \ + -DBUILD_TESTING=OFF \ + -DOPENCL_HEADERS_BUILD_TESTING=OFF \ + -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF \ + -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl" +cmake --build . --target install + +cd ~/dev/llm +git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader +mkdir build && cd build +cmake .. -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" \ + -DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl" +cmake --build . --target install +``` + +### II. Build llama.cpp + +```bash +mkdir -p ~/dev/llm +cd ~/dev/llm + +git clone https://github.com/ggml-org/llama.cpp && cd llama.cpp +mkdir build && cd build + +cmake .. -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" \ + -DBUILD_SHARED_LIBS=OFF \ + -DGGML_OPENCL=ON +ninja +``` + ## Known Issues - Flash attention does not always improve performance. From 67e3f6f60155870d4b5ce727515aa81e5a7b4753 Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Mon, 5 Jan 2026 15:38:18 +0800 Subject: [PATCH 2/3] CANN: add operator fusion support for ADD + RMS_NORM (#17512) This commit implements operator fusion for ADD + RMS_NORM operations in the CANN backend to reduce memory access overhead and improve performance. The fusion is controlled by the GGML_CANN_OPERATOR_FUSION environment variable (default: false). 
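For reference, a minimal sketch of the per-row arithmetic the fused kernel covers (this helper is purely illustrative and not part of the patch; ggml's RMS_NORM applies no gamma itself and the fused ACLNN call passes an all-ones gamma, so only the epsilon-stabilized RMS division remains — the real operator additionally returns the raw sum xOut and the reciprocal RMS rstdOut, which the sketch omits):

```cpp
#include <cmath>
#include <cstddef>

// Reference semantics of ADD + RMS_NORM on one row of n elements:
// y = (x1 + x2) / sqrt(mean((x1 + x2)^2) + eps)
static void add_rms_norm_row_ref(const float * x1, const float * x2,
                                 float * y, size_t n, float eps) {
    double sum_sq = 0.0;
    for (size_t i = 0; i < n; ++i) {
        const float x = x1[i] + x2[i];
        y[i]    = x;
        sum_sq += (double) x * x;
    }
    const float inv_rms = 1.0f / std::sqrt((float) (sum_sq / n) + eps);
    for (size_t i = 0; i < n; ++i) {
        y[i] *= inv_rms;
    }
}
```

Executing this as a single kernel avoids writing the intermediate sum back to device memory between two separate launches, which is where the memory-access savings described above come from.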
Changes: - Implement ggml_cann_op_add_rms_norm_fused() using ACLNN AddRmsNorm - Add ggml_cann_can_fuse() to check fusion eligibility - Integrate fusion logic into computation graph evaluation - Add test cases for ADD + RMS_NORM fusion - Update documentation with new environment variable The fusion combines ADD and RMS_NORM into a single kernel call, which is more efficient than executing them separately. --- docs/backend/CANN.md | 4 +++ ggml/src/ggml-cann/aclnn_ops.cpp | 55 ++++++++++++++++++++++++++++ ggml/src/ggml-cann/aclnn_ops.h | 14 ++++++++ ggml/src/ggml-cann/ggml-cann.cpp | 44 +++++++++++++++++++++++ tests/test-backend-ops.cpp | 62 ++++++++++++++++++++++++++++++++ 5 files changed, 179 insertions(+) diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 37dcfaef9a8..b03c2a122cb 100755 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -327,3 +327,7 @@ Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. Whe ### GGML_CANN_PREFILL_USE_GRAPH Enable ACL graph execution during the prefill stage, default is false. This option is only effective when FA is enabled. + +### GGML_CANN_OPERATOR_FUSION + +Enable operator fusion during computation, default is false. This option fuses compatible operators (e.g., ADD + RMS_NORM) to reduce overhead and improve performance. diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 2180a06fd00..50b6bd00e4c 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -26,6 +26,7 @@ #include "ggml.h" #include +#include #include #include #include @@ -3805,3 +3806,57 @@ void ggml_cann_ssm_conv(ggml_backend_cann_context & ctx, ggml_tensor * dst) { cubeMathType); } + +void ggml_cann_op_add_rms_norm_fused(ggml_backend_cann_context & ctx, + ggml_tensor * add_node, + ggml_tensor * rms_norm_node) { + // Get the two input tensors for ADD operation + ggml_tensor * x1 = add_node->src[0]; + ggml_tensor * x2 = add_node->src[1]; + + // Create ACL tensors for the two ADD inputs + acl_tensor_ptr acl_x1 = ggml_cann_create_tensor(x1); + acl_tensor_ptr acl_x2 = ggml_cann_create_tensor(x2); + + // Get epsilon parameter from rms_norm_tensor + float eps; + memcpy(&eps, rms_norm_node->op_params, sizeof(float)); + + // Build gamma tensor (RMS normalization scaling factor) + // Gamma should match the normalized dimensions (last dimension of x1) + size_t acl_gamma_nb[GGML_MAX_DIMS]; + acl_gamma_nb[0] = ggml_type_size(rms_norm_node->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + acl_gamma_nb[i] = acl_gamma_nb[i - 1] * x1->ne[i - 1]; + } + acl_tensor_ptr acl_gamma = + get_cache_acl_tensor(ctx, &ctx.rms_norm_one_tensor_cache.cache, ctx.rms_norm_one_tensor_cache.size, x1->ne, + acl_gamma_nb, rms_norm_node->type, + 1, // dims - only the last dimension + 1.0f // value + ); + + // Build rstdOut tensor (output for normalized standard deviation) + // Shape should be the dimensions that are NOT normalized + int64_t acl_rstd_ne[] = { 1, x1->ne[1], x1->ne[2], x1->ne[3] }; + size_t acl_rstd_nb[GGML_MAX_DIMS - 1]; + acl_rstd_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { + acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1]; + } + acl_tensor_ptr acl_rstd = + get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size, + acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS, + 0.0f // value + ); + + acl_tensor_ptr acl_xout = ggml_cann_create_tensor(add_node); + + // Create yOut tensor (final output after RMS normalization) + 
acl_tensor_ptr acl_yout = ggml_cann_create_tensor(rms_norm_node); + + // Call fused ADD + RMS_NORM operator + GGML_CANN_CALL_ACLNN_OP(ctx, AddRmsNorm, acl_x1.get(), acl_x2.get(), acl_gamma.get(), + eps, // double type + acl_yout.get(), acl_rstd.get(), acl_xout.get()); +} diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h index a6ea016c542..08ee7b1fbdf 100644 --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -935,6 +935,20 @@ template void register_acl_resources(std::vectorstream())); } +/** + * @brief Check if CANN backend can fuse the specified operation sequence + * + * This function determines whether an operation sequence starting from the specified node + * can be fused into an optimized operation in the CANN backend. Operation fusion can reduce + * memory access overhead and improve computational efficiency. + * + * @param cgraph Pointer to the computation graph + * @param node_idx Index of the starting node in the computation graph + * @param ops Sequence of operation types to check for fusion + * @return true if the operations can be fused + * @return false if the operations cannot be fused + */ +static bool ggml_cann_can_fuse(const struct ggml_cgraph * cgraph, + int node_idx, + std::initializer_list ops) { + if (!ggml_can_fuse(cgraph, node_idx, ops)) { + return false; + } + + // CANN backend supports fusing ADD + RMS_NORM operations + if ((ops.size() == 2) && ops.begin()[0] == GGML_OP_ADD && ops.begin()[1] == GGML_OP_RMS_NORM) { + ggml_tensor * add_node = cgraph->nodes[node_idx]; + // TODO: support broadcast for ADD + RMS_NORM + if (add_node->src[0]->ne[0] != add_node->src[1]->ne[0] || add_node->src[0]->ne[1] != add_node->src[1]->ne[1] || + add_node->src[0]->ne[2] != add_node->src[1]->ne[2] || add_node->src[0]->ne[3] != add_node->src[1]->ne[3]) { + return false; + } + return true; + } + + return false; +} + /** * @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API. * @@ -2101,9 +2136,18 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx #endif // USE_ACL_GRAPH // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph. // With the use of CANN graphs, the execution will be performed by the graph launch. 
+ static bool opt_fusion = parse_bool(get_env("GGML_CANN_OPERATOR_FUSION").value_or("")); + if (!use_cann_graph || cann_graph_capture_required) { for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; + if (opt_fusion) { + if (ggml_cann_can_fuse(cgraph, i, { GGML_OP_ADD, GGML_OP_RMS_NORM })) { + ggml_cann_op_add_rms_norm_fused(*cann_ctx, node, cgraph->nodes[i + 1]); + i++; + continue; + } + } if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 76abfdaf0af..fa6e80e3fcd 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3431,6 +3431,65 @@ struct test_rms_norm_mul_add : public test_case { } }; +// GGML_OP_ADD + GGML_OP_RMS_NORM (fused operation) +struct test_add_rms_norm : public test_case { + const ggml_type type; + const std::array ne; + const float eps; + const bool broadcast; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "ADD_RMS_NORM"; + } + + bool run_whole_graph() override { return true; } + + std::string vars() override { + return VARS_TO_STR4(type, ne, eps, broadcast); + } + + test_add_rms_norm(ggml_type type = GGML_TYPE_F32, + std::array ne = {64, 5, 4, 3}, + float eps = 1e-6f, bool broadcast = false) + : type(type), ne(ne), eps(eps), broadcast(broadcast) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + std::array broadcast_dims = {ne[0]*2, ne[1]*3, ne[2]*3, ne[3]*4}; + + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, broadcast ? broadcast_dims.data() : ne.data()); + ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data()); + + ggml_set_param(a); + ggml_set_name(a, "a"); + ggml_set_param(b); + ggml_set_name(b, "b"); + + // ADD operation followed by RMS_NORM + ggml_tensor * add_result = ggml_add(ctx, a, b); + ggml_set_name(add_result, "add_result"); + + ggml_tensor * out = ggml_rms_norm(ctx, add_result, eps); + ggml_set_name(out, "out"); + + return out; + } + + void initialize_tensors(ggml_context * ctx) override { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t, -10.f, 10.f); + } + } + + float grad_eps() override { + return 1.0f; + } + + bool grad_precise() override { + return true; + } +}; + // GGML_OP_SSM_CONV struct test_ssm_conv : public test_case { const ggml_type type; @@ -7393,11 +7452,14 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true)); test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false)); test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true)); + test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false)); + test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true)); } for (uint32_t n : {1, 511, 1025, 8192, 33*512}) { for (bool multi_add : {false, true}) { test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {n, 1, 1, 1}, 1e-6f, false, multi_add)); } + test_cases.emplace_back(new test_add_rms_norm(GGML_TYPE_F32, {n, 1, 1, 1}, 1e-6f, false)); } for (auto multi_add : {false, true}) { From eadc4184caee5b5f68f31f19a2f65c6961748e46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 5 Jan 2026 09:14:04 +0100 Subject: [PATCH 3/3] llama : refactor 
rope_freq_base/scale_swa conversion and init (#18553) * refactor rope_freq_base/scale_swa conversion and init * safe defaults for unknowns * update relevant models * grammar * add get_rope_freq_scale to modern-bert * const * const * log swa info --- convert_hf_to_gguf.py | 14 +++++++--- src/llama-hparams.h | 4 +-- src/llama-model.cpp | 49 ++++++++++++++++++++++++++-------- src/models/afmoe.cpp | 14 ++++++---- src/models/cohere2-iswa.cpp | 3 +++ src/models/gemma2-iswa.cpp | 7 +++-- src/models/llama-iswa.cpp | 8 ++++-- src/models/modern-bert.cpp | 7 ++--- src/models/openai-moe-iswa.cpp | 7 +++-- src/models/smallthinker.cpp | 16 +++++++---- 10 files changed, 93 insertions(+), 36 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3340a0a7dcb..68446aa44f9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -771,9 +771,14 @@ def __init__(self, *args, **kwargs): self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {} + rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True) + local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True) + # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters: - if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None: + if local_rope_theta is not None: + self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta} + if "rope_theta" not in self.rope_parameters and rope_theta is not None: self.rope_parameters["rope_theta"] = rope_theta if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None: self.rope_parameters["rope_type"] = rope_type @@ -839,6 +844,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count_kv(n_head_kv) logger.info(f"gguf: key-value head count = {n_head_kv}") + # TODO: Handle "sliding_attention" similarly when models start implementing it rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) if (rope_type := rope_params.get("rope_type")) is not None: rope_factor = rope_params.get("factor") @@ -885,6 +891,9 @@ def set_gguf_parameters(self): if (rope_theta := rope_params.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) logger.info(f"gguf: rope theta = {rope_theta}") + if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base_swa(local_rope_theta) + logger.info(f"gguf: rope theta swa = {local_rope_theta}") if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") @@ -5004,7 +5013,6 @@ def set_gguf_parameters(self): if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None: self.gguf_writer.add_sliding_window(sliding_window) self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"]) - self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("rope_local_theta")})["rope_theta"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> 
Iterable[tuple[str, Tensor]]: @@ -7480,7 +7488,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"]) - self.gguf_writer.add_rope_freq_base_swa(self.hparams["swa_rope_theta"]) self.gguf_writer.add_value_length(self.hparams["v_head_dim"]) self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) @@ -10218,7 +10225,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_sliding_window(self.hparams["local_attention"]) if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None: self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("local_rope_theta")})["rope_theta"]) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 42def73f06f..fc5708fc4b0 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -105,9 +105,9 @@ struct llama_hparams { float rope_attn_factor = 1.0f; float rope_freq_base_train; - float rope_freq_base_train_swa; + float rope_freq_base_train_swa = 10000.0f; float rope_freq_scale_train; - float rope_freq_scale_train_swa; + float rope_freq_scale_train_swa = 1.0f; uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul = 0.0f; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c739b0b48ae..28dcc2840f0 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -578,6 +578,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); + // TODO: Handle SWA metadata similarly when models start implementing it // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) { @@ -586,10 +587,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { } hparams.rope_freq_scale_train = ropescale == 0.0f ? 
1.0f : 1.0f/ropescale; - // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers - hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; - hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; - ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); // non-transformer models do not have attention heads @@ -677,6 +674,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.f_attn_temp_scale = 0.1f; hparams.f_attn_temp_offset = 1.0f; hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } switch (hparams.n_expert) { @@ -722,6 +723,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { if (hparams.n_swa > 0) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(4); + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } @@ -1243,7 +1248,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { if (found_swa && hparams.n_swa > 0) { uint32_t swa_period = 8; hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - hparams.rope_freq_scale_train_swa = 1.0f; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa); ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); hparams.set_swa_pattern(swa_period); @@ -1309,7 +1313,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.n_swa = 4096; // default value of gemma 2 hparams.set_swa_pattern(2); hparams.attn_soft_cap = true; + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); @@ -1334,8 +1341,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(6); - hparams.rope_freq_base_train_swa = 10000.0f; - hparams.rope_freq_scale_train_swa = 1.0f; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } @@ -1365,10 +1371,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.set_swa_pattern(5); hparams.n_layer_kv_from_start = 20; - hparams.rope_freq_base_train_swa = 10000.0f; - hparams.rope_freq_scale_train_swa = 1.0f; hparams.f_attention_scale = 1.0f; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -1384,9 +1389,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.set_swa_pattern(6); hparams.causal_attn = false; // embeddings do not use causal attention - hparams.rope_freq_base_train_swa = 10000.0f; - hparams.rope_freq_scale_train_swa = 1.0f; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, 
hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); @@ -1525,7 +1529,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(4); + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -1564,6 +1571,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { if (found_swa && hparams.n_swa > 0) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(4); + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } @@ -1906,6 +1917,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.n_swa = 4096; hparams.set_swa_pattern(4); + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); @@ -2208,6 +2223,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(2); + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + switch (hparams.n_layer) { case 24: type = LLM_TYPE_20B; break; case 36: type = LLM_TYPE_120B; break; @@ -2252,6 +2271,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.n_swa = 4096; hparams.set_swa_pattern(4, true); + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; hparams.n_no_rope_layer_step = hparams.n_layer; @@ -7098,6 +7121,10 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { + LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa); + LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa); + } LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? 
"yes" : "unknown"); diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index 0192e344ca0..6a752a403f6 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -22,8 +22,15 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); for (int il = 0; il < n_layer; ++il) { + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + ggml_tensor * inpSA = inpL; + // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous + const bool use_rope = hparams.n_no_rope_layer_step > 0 && + (il + 1) % hparams.n_no_rope_layer_step != 0; + // dual attention normalization (pre) cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -56,19 +63,16 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para cb(Qcur, "Qcur_normed", il); cb(Kcur, "Kcur_normed", il); - // RoPE only for sliding_attention layers - const bool use_rope = hparams.n_no_rope_layer_step > 0 && - ((il + 1) % hparams.n_no_rope_layer_step) != 0; if (use_rope) { Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur_rope", il); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur_rope", il); } diff --git a/src/models/cohere2-iswa.cpp b/src/models/cohere2-iswa.cpp index b18aa8c4e6c..9334b5e4263 100644 --- a/src/models/cohere2-iswa.cpp +++ b/src/models/cohere2-iswa.cpp @@ -21,6 +21,9 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const for (int il = 0; il < n_layer; ++il) { const bool is_swa = hparams.is_swa(il); + // UNUSED: + // const float freq_base_l = model.get_rope_freq_base (cparams, il); + // const float freq_scale_l = model.get_rope_freq_scale(cparams, il); // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); diff --git a/src/models/gemma2-iswa.cpp b/src/models/gemma2-iswa.cpp index 9cc59a53ee5..7a9198193ac 100644 --- a/src/models/gemma2-iswa.cpp +++ b/src/models/gemma2-iswa.cpp @@ -19,6 +19,9 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -43,12 +46,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); diff --git a/src/models/llama-iswa.cpp b/src/models/llama-iswa.cpp index 03f80616821..61dd2c179f1 100644 --- a/src/models/llama-iswa.cpp +++ b/src/models/llama-iswa.cpp @@ -25,8 +25,12 @@ 
llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_ ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + ggml_tensor * inpSA = inpL; + // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous const bool use_rope = hparams.n_no_rope_layer_step > 0 && (il + 1) % hparams.n_no_rope_layer_step != 0; @@ -67,13 +71,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_ if (use_rope) { Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); } else if (inp_attn_scale) { diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp index 6df418ecda2..bb12ed819f7 100644 --- a/src/models/modern-bert.cpp +++ b/src/models/modern-bert.cpp @@ -23,7 +23,8 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll auto * inp_attn = build_attn_inp_no_cache(); for (int il = 0; il < n_layer; ++il) { - float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); cur = inpL; @@ -48,13 +49,13 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll // RoPE Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); diff --git a/src/models/openai-moe-iswa.cpp b/src/models/openai-moe-iswa.cpp index 96596709eec..dbe3ca1851f 100644 --- a/src/models/openai-moe-iswa.cpp +++ b/src/models/openai-moe-iswa.cpp @@ -14,6 +14,9 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + ggml_tensor * inpSA = inpL; // norm @@ -49,13 +52,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 277eec29554..4c497ca76f4 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -26,10 +26,16 @@ 
llm_build_smallthinker::llm_build_smallthinker(const llama_model & model, ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + ggml_tensor * inpSA = inpL; - ggml_tensor * probs = nullptr; - probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens] + // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous + const bool use_rope = hparams.n_no_rope_layer_step == n_layer || + il % hparams.n_no_rope_layer_step != 0; + + ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens] cb(probs, "ffn_moe_logits", il); // norm @@ -52,11 +58,11 @@ llm_build_smallthinker::llm_build_smallthinker(const llama_model & model, Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) { - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + if (use_rope) { + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); } cb(Qcur, "Qcur", il);
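For context on the `freq_base_l` / `freq_scale_l` values threaded through these builders: a minimal sketch of the per-layer selection, assuming a stripped-down hparams struct and ignoring any runtime overrides from `cparams` (the actual helpers called in the diffs are `model.get_rope_freq_base` / `model.get_rope_freq_scale`):

```cpp
// Sketch only (assumption, not the implementation from this patch): pick the
// SWA-specific RoPE parameters on sliding-window layers and the regular
// training values everywhere else.
struct hparams_sketch {
    float rope_freq_base_train      = 10000.0f;
    float rope_freq_base_train_swa  = 10000.0f;  // default added by this patch
    float rope_freq_scale_train     = 1.0f;
    float rope_freq_scale_train_swa = 1.0f;      // default added by this patch
    int   swa_period                = 4;         // hypothetical pattern: 3 SWA layers, then 1 full

    bool is_swa(int il) const { return (il % swa_period) != swa_period - 1; }
};

static float rope_freq_base_for_layer(const hparams_sketch & hp, int il) {
    return hp.is_swa(il) ? hp.rope_freq_base_train_swa : hp.rope_freq_base_train;
}

static float rope_freq_scale_for_layer(const hparams_sketch & hp, int il) {
    return hp.is_swa(il) ? hp.rope_freq_scale_train_swa : hp.rope_freq_scale_train;
}
```

With the defaults now set in `llama-hparams.h`, the SWA fields stay well-defined even for architectures that never read `LLM_KV_ROPE_FREQ_BASE_SWA`.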